Project

General

Profile

Revision cb312388

ID cb312388a600c966cbda35537cb66a6f93416a7c
Parent 8287f797
Child 55d7bb64

Added by Cherian Mathew about 8 years ago

INameService : added findByNameExactSearch to return lucene search documents for name searches
NameServiceImpl : implemented and updated exact / fuzzy name searching

View differences:

cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/INameService.java
23 23
import eu.etaxonomy.cdm.api.service.config.NameDeletionConfigurator;
24 24
import eu.etaxonomy.cdm.api.service.exception.ReferencedObjectUndeletableException;
25 25
import eu.etaxonomy.cdm.api.service.pager.Pager;
26
import eu.etaxonomy.cdm.api.service.search.DocumentSearchResult;
26 27
import eu.etaxonomy.cdm.api.service.search.SearchResult;
27 28
import eu.etaxonomy.cdm.model.common.CdmBase;
28 29
import eu.etaxonomy.cdm.model.common.Language;
......
179 180
            boolean highlightFragments, 
180 181
            List<String> propertyPaths,
181 182
            int maxNoOfResults) throws CorruptIndexException, IOException, ParseException;
183
	
184
	/**
185
	 * Fuzzy matching for the taxon name elements using only the lucene index. 
186
	 * 
187
	 * The input name is first atomised using the {@link NonViralNameParserImpl}
188
	 * into its separate parts (genusOrUninomial,infraGenericEpithet,specificEpithet,infraSpecificEpithet,authorshipCache).
189
	 * Each field is then matched separately with the same accuracy parameter.
190
	 *  
191
	 * @param name taxon name to fuzzy match
192
	 * @param accuracy value > 0.0 and < 1.0 which determines the accuracy of the result.
193
	 * @param languages list of languages to consider when matching (currently not used)
194
	 * @param highlightFragments if true, the text fields of the index are set as highlight fields on the search
195
	 * @param maxNoOfResults maximum number of results requested when executing the lucene search
196
	 * @return
197
	 * @throws CorruptIndexException
198
	 * @throws IOException
199
	 * @throws ParseException
200
	 */
201
    public List<DocumentSearchResult> findByNameFuzzySearch(
202
            String name,
203
            float accuracy,
204
            List<Language> languages,
205
            boolean highlightFragments, 
206
            int maxNoOfResults) throws CorruptIndexException, IOException, ParseException;
207
    
208
	/**
209
	 * Exact matching for the taxon name elements using only the lucene index.
210
	 * 
211
	 * The input name is not atomised: it is matched as a whole against the
212
	 * <code>nameCache</code> field of the index, either with a term query or,
213
	 * if <code>wildcard</code> is set, with a wildcard query (name followed by '*').
214
	 *  
215
	 * @param name taxon name to match exactly
216
	 * @param wildcard boolean flag to indicate whether a wildcard '*' should be added at the end of the query
217
	 * @param languages list of languages to consider when matching (currently not used)
218
	 * @param highlightFragments if true, the text fields of the index are set as highlight fields on the search
219
	 * @param maxNoOfResults maximum number of results requested when executing the lucene search
220
	 * @return
221
	 * @throws CorruptIndexException
222
	 * @throws IOException
223
	 * @throws ParseException
224
	 */
225
    
226
    public List<DocumentSearchResult> findByNameExactSearch(
227
            String name,      
228
            boolean wildcard,
229
            List<Language> languages,
230
            boolean highlightFragments, 
231
            int maxNoOfResults) throws CorruptIndexException, IOException, ParseException;
182 232

  
183 233
	// TODO: Remove getNamesByName() methods. Use findNamesByTitle() instead.
184 234

  
cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImpl.java
21 21
import java.util.UUID;
22 22

  
23 23
import org.apache.log4j.Logger;
24
import org.apache.lucene.analysis.SimpleAnalyzer;
24 25
import org.apache.lucene.index.CorruptIndexException;
25 26
import org.apache.lucene.index.Term;
26 27
import org.apache.lucene.queryParser.ParseException;
27 28
import org.apache.lucene.search.BooleanQuery;
29
import org.apache.lucene.search.Explanation;
30
import org.apache.lucene.search.FuzzyLikeThisQuery;
31
import org.apache.lucene.search.Query;
28 32
import org.apache.lucene.search.SortField;
29 33
import org.apache.lucene.search.BooleanClause.Occur;
34
import org.apache.lucene.search.TermQuery;
30 35
import org.apache.lucene.search.TopDocs;
36
import org.apache.lucene.search.WildcardQuery;
31 37
import org.apache.lucene.search.regex.RegexQuery;
38
import org.apache.lucene.util.Version;
32 39
import org.hibernate.criterion.Criterion;
33 40
import org.springframework.beans.factory.annotation.Autowired;
34 41
import org.springframework.beans.factory.annotation.Qualifier;
......
40 47
import eu.etaxonomy.cdm.api.service.pager.Pager;
41 48
import eu.etaxonomy.cdm.api.service.pager.impl.AbstractPagerImpl;
42 49
import eu.etaxonomy.cdm.api.service.pager.impl.DefaultPagerImpl;
50
import eu.etaxonomy.cdm.api.service.search.DocumentSearchResult;
43 51
import eu.etaxonomy.cdm.api.service.search.ISearchResultBuilder;
44 52
import eu.etaxonomy.cdm.api.service.search.LuceneSearch;
45 53
import eu.etaxonomy.cdm.api.service.search.QueryFactory;
......
554 562
        return results;
555 563
    }
556 564

  
557
    
558
    protected LuceneSearch prepareFindByFuzzySearch(Class<? extends CdmBase> clazz, 
565

  
566
    protected LuceneSearch prepareFindByFuzzyNameSearch(Class<? extends CdmBase> clazz, 
559 567
    		NonViralName nvn,
560 568
    		float accuracy,
561 569
    		List<Language> languages,
570
    		boolean highlightFragments) {
571
    	String similarity = Float.toString(accuracy);    	
572
    	String searchSuffix = "~" + similarity;
573
    	
574

  
575
    	BooleanQuery finalQuery = new BooleanQuery();
576
    	BooleanQuery textQuery = new BooleanQuery();
577

  
578
    	LuceneSearch luceneSearch = new LuceneSearch(getSession(), TaxonNameBase.class);    	   
579
    	QueryFactory queryFactory = new QueryFactory(luceneSearch);
580

  
581
    	SortField[] sortFields = new  SortField[]{SortField.FIELD_SCORE, new SortField("titleCache__sort", SortField.STRING,  false)};
582
    	luceneSearch.setSortFields(sortFields);
583

  
584
    	// ---- search criteria
585
    	luceneSearch.setClazz(clazz);
586

  
587
    	
588
    	if(nvn.getGenusOrUninomial() != null && !nvn.getGenusOrUninomial().equals("")) {        	
589
    		textQuery.add(queryFactory.newTermQuery("genusOrUninomial", nvn.getGenusOrUninomial() + searchSuffix), Occur.SHOULD);
590
    		
591
    	} else {
592
    		textQuery.add(new RegexQuery (new Term ("genusOrUninomial", "^[a-zA-Z]*")), Occur.MUST_NOT);
593
    	}
594

  
595
    	if(nvn.getInfraGenericEpithet() != null && !nvn.getInfraGenericEpithet().equals("")){
596
    		textQuery.add(queryFactory.newTermQuery("infraGenericEpithet", nvn.getInfraGenericEpithet() + searchSuffix), Occur.SHOULD);
597
    	} else {
598
    		textQuery.add(new RegexQuery (new Term ("infraGenericEpithet", "^[a-zA-Z]*")), Occur.MUST_NOT);
599
    	}
600

  
601
    	if(nvn.getSpecificEpithet() != null && !nvn.getSpecificEpithet().equals("")){
602
    		textQuery.add(queryFactory.newTermQuery("specificEpithet", nvn.getSpecificEpithet() + searchSuffix), Occur.SHOULD);  
603

  
604
    	} else {
605
    		textQuery.add(new RegexQuery (new Term ("specificEpithet", "^[a-zA-Z]*")), Occur.MUST_NOT);
606
    	}
607

  
608
    	if(nvn.getInfraSpecificEpithet() != null && !nvn.getInfraSpecificEpithet().equals("")){
609
    		textQuery.add(queryFactory.newTermQuery("infraSpecificEpithet", nvn.getInfraSpecificEpithet() + searchSuffix), Occur.SHOULD);
610
    	} else {
611
    		textQuery.add(new RegexQuery (new Term ("infraSpecificEpithet", "^[a-zA-Z]*")), Occur.MUST_NOT);
612
    	}
613

  
614
    	if(nvn.getAuthorshipCache() != null && !nvn.getAuthorshipCache().equals("")){
615
    		textQuery.add(queryFactory.newTermQuery("authorshipCache", nvn.getAuthorshipCache() + searchSuffix), Occur.SHOULD);
616
    	} else {
617
    		//textQuery.add(new RegexQuery (new Term ("authorshipCache", "^[a-zA-Z]*")), Occur.MUST_NOT);
618
    	}
619

  
620
    	finalQuery.add(textQuery, Occur.MUST);
621

  
622
    	luceneSearch.setQuery(finalQuery);
623

  
624
    	if(highlightFragments){
625
    		luceneSearch.setHighlightFields(queryFactory.getTextFieldNamesAsArray());
626
    	}
627
    	return luceneSearch;
628
    }
629
    
630
    protected LuceneSearch prepareFindByExactNameSearch(Class<? extends CdmBase> clazz, 
631
    		String name,
632
    		boolean wildcard,
633
    		List<Language> languages,
562 634
            boolean highlightFragments) {
563 635
        BooleanQuery finalQuery = new BooleanQuery();
564 636
        BooleanQuery textQuery = new BooleanQuery();
......
571 643

  
572 644
        // ---- search criteria
573 645
        luceneSearch.setClazz(clazz);
574
        String similarity = Float.toString(accuracy);
575
        if(nvn.getGenusOrUninomial() != null && !nvn.getGenusOrUninomial().equals("")) {        	
576
        	textQuery.add(queryFactory.newTermQuery("genusOrUninomial", nvn.getGenusOrUninomial() + "~" + similarity), Occur.SHOULD);
577
        } else {
578
        	textQuery.add(new RegexQuery (new Term ("genusOrUninomial", "^[a-zA-Z]*")), Occur.MUST_NOT);
579
        }
580 646
        
581
        if(nvn.getInfraGenericEpithet() != null && !nvn.getInfraGenericEpithet().equals("")){
582
        	textQuery.add(queryFactory.newTermQuery("infraGenericEpithet", nvn.getInfraGenericEpithet() + "~" + similarity), Occur.SHOULD);
583
        } else {
584
        	textQuery.add(new RegexQuery (new Term ("infraGenericEpithet", "^[a-zA-Z]*")), Occur.MUST_NOT);
585
        }
586 647
        
587
        if(nvn.getSpecificEpithet() != null && !nvn.getSpecificEpithet().equals("")){
588
        	textQuery.add(queryFactory.newTermQuery("specificEpithet", nvn.getSpecificEpithet() + "~" + similarity), Occur.SHOULD);
589
        } else {
590
        	textQuery.add(new RegexQuery (new Term ("specificEpithet", "^[a-zA-Z]*")), Occur.MUST_NOT);
591
        }
592 648
        
593
        if(nvn.getInfraSpecificEpithet() != null && !nvn.getInfraSpecificEpithet().equals("")){
594
        	textQuery.add(queryFactory.newTermQuery("infraSpecificEpithet", nvn.getInfraSpecificEpithet() + "~" + similarity), Occur.SHOULD);
595
        } else {
596
        	textQuery.add(new RegexQuery (new Term ("infraSpecificEpithet", "^[a-zA-Z]*")), Occur.MUST_NOT);
597
        }
598
        
599
        if(nvn.getAuthorshipCache() != null && !nvn.getAuthorshipCache().equals("")){
600
        	textQuery.add(queryFactory.newTermQuery("authorshipCache", nvn.getAuthorshipCache() + "~" + similarity), Occur.SHOULD);
601
        } else {
602
        	textQuery.add(new RegexQuery (new Term ("authorshipCache", "^[a-zA-Z]*")), Occur.MUST_NOT);
603
        }
649
        if(name != null && !name.equals("")) {        	        	
650
        	if(wildcard) {
651
        		textQuery.add(new WildcardQuery(new Term("nameCache", name + "*")), Occur.MUST);
652
        	} else {
653
        		textQuery.add(queryFactory.newTermQuery("nameCache", name, false), Occur.MUST);
654
        	}
655
        } 
604 656
        
605 657
        finalQuery.add(textQuery, Occur.MUST);
606 658

  
607
        luceneSearch.setQuery(finalQuery);
659
        luceneSearch.setQuery(textQuery);
608 660

  
609 661
        if(highlightFragments){
610 662
            luceneSearch.setHighlightFields(queryFactory.getTextFieldNamesAsArray());
......
627 679
    	if(name != null && !name.equals("") && nvn == null) {
628 680
    		throw new ParseException("Could not parse name " + name);
629 681
    	}
630
        LuceneSearch luceneSearch = prepareFindByFuzzySearch(null, nvn, accuracy, languages, highlightFragments);
682
        LuceneSearch luceneSearch = prepareFindByFuzzyNameSearch(null, nvn, accuracy, languages, highlightFragments);
631 683

  
632 684
        // --- execute search        
633 685
        TopDocs topDocs = luceneSearch.executeSearch(maxNoOfResults);
686

  
687
        
634 688
        Map<CdmBaseType, String> idFieldMap = new HashMap<CdmBaseType, String>();
635 689
        idFieldMap.put(CdmBaseType.NONVIRALNAME, "id");
636 690

  
......
644 698
        return searchResults;
645 699

  
646 700
    }
701
    
702
    public List<DocumentSearchResult> findByNameFuzzySearch(
703
            String name,
704
            float accuracy,
705
            List<Language> languages,
706
            boolean highlightFragments, 
707
            int maxNoOfResults) throws CorruptIndexException, IOException, ParseException {
708

  
709
    	logger.info("Name to fuzzy search for : " + name);
710
    	// parse the input name
711
    	NonViralNameParserImpl parser = new NonViralNameParserImpl();
712
    	NonViralName nvn = parser.parseFullName(name);
713
    	if(name != null && !name.equals("") && nvn == null) {
714
    		throw new ParseException("Could not parse name " + name);
715
    	}
716
        LuceneSearch luceneSearch = prepareFindByFuzzyNameSearch(null, nvn, accuracy, languages, highlightFragments);
717

  
718
        // --- execute search        
719
        TopDocs topDocs = luceneSearch.executeSearch(maxNoOfResults);
720
//        for(int i = 0; i < topDocs.scoreDocs.length ; i++) {
721
//        	Explanation exp = luceneSearch.getSearcher().explain(luceneSearch.getQuery(), topDocs.scoreDocs[i].doc);
722
//        	System.out.println("-----------------------");
723
//        	System.out.println(exp.toString());
724
//        }
725
        
726
        Map<CdmBaseType, String> idFieldMap = new HashMap<CdmBaseType, String>();
727

  
728
        // --- initialize taxa, highlight matches ....
729
        ISearchResultBuilder searchResultBuilder = new SearchResultBuilder(luceneSearch, luceneSearch.getQuery());
730
        
731
        @SuppressWarnings("rawtypes")
732
        List<DocumentSearchResult> searchResults = searchResultBuilder.createResultSet(topDocs, luceneSearch.getHighlightFields());
733

  
734
        return searchResults;
735
    }
736
    
737
    public List<DocumentSearchResult> findByNameExactSearch(
738
            String name,
739
            boolean wildcard,
740
            List<Language> languages,
741
            boolean highlightFragments, 
742
            int maxNoOfResults) throws CorruptIndexException, IOException, ParseException {
743

  
744
    	logger.info("Name to fuzzy search for : " + name);
745
    	
746
        LuceneSearch luceneSearch = prepareFindByExactNameSearch(null, name, wildcard, languages, highlightFragments);
747

  
748
        // --- execute search        
749
        TopDocs topDocs = luceneSearch.executeSearch(maxNoOfResults);
750
        Map<CdmBaseType, String> idFieldMap = new HashMap<CdmBaseType, String>();        
751

  
752
        // --- initialize taxa, highlight matches ....
753
        ISearchResultBuilder searchResultBuilder = new SearchResultBuilder(luceneSearch, luceneSearch.getQuery());
754
        
755
        @SuppressWarnings("rawtypes")
756
        List<DocumentSearchResult> searchResults = searchResultBuilder.createResultSet(topDocs, luceneSearch.getHighlightFields());
757

  
758
        return searchResults;
759
    }
760
    
647 761
    /* (non-Javadoc)
648 762
     * @see eu.etaxonomy.cdm.api.service.INameService#pageNameRelationships(eu.etaxonomy.cdm.model.name.TaxonNameBase, eu.etaxonomy.cdm.model.common.RelationshipBase.Direction, eu.etaxonomy.cdm.model.name.NameRelationshipType, java.lang.Integer, java.lang.Integer, java.util.List, java.util.List)
649 763
     */
cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/search/ISearchResultBuilder.java
58 58
            String[] highlightFields, ICdmEntityDao<T> dao, Map<CdmBaseType, String> idFields, List<String> propertyPaths) throws CorruptIndexException, IOException;
59 59
    
60 60
    /**
61
     * Creates a <code>List</code> of <code>SearchResult</code> entities from the supplied <code>TopDocs</code>.
61
     * Creates a <code>List</code> of <code>DocumentSearchResult</code> entities from the supplied <code>TopDocs</code>.
62
     * This method can be used for building index-only results.
62 63
     *
63 64
     * @param topDocs
64 65
     * @param highlightFields

Also available in: Unified diff

Add picture from clipboard (Maximum size: 40 MB)