Project

General

Profile

Revision df6b4d52

ID df6b4d52edd818109414d7865473ca8ba668fdc1
Parent 6233c56e
Child 804ac527

Added by Cherian Mathew about 7 years ago

INameService : added method to fuzzy search name cache
NameServiceImpl : implemented method to fuzzy search name cache
NameCatalogueController : added param to fuzzy search endpoint which allows for searching name cache or the atomised elements.
NonViralName : set the index 'analyze' option to YES for nameCache so that it is included in fuzzy searching

View differences:

cdmlib-model/src/main/java/eu/etaxonomy/cdm/model/name/NonViralName.java
126 126
    @XmlElement(name = "NameCache")
127 127
    @Fields({
128 128
        @Field(name = "nameCache_tokenized"),
129
        @Field(store = Store.YES, index = Index.YES, analyze = Analyze.NO)
129
        @Field(store = Store.YES, index = Index.YES, analyze = Analyze.YES)
130 130
    })
131 131
    @Match(value=MatchMode.CACHE, cacheReplaceMode=ReplaceMode.DEFINED,
132 132
            cacheReplacedProperties={"genusOrUninomial", "infraGenericEpithet", "specificEpithet", "infraSpecificEpithet"} )
cdmlib-remote/src/main/java/eu/etaxonomy/cdm/remote/controller/dto/NameCatalogueController.java
122 122
    /** Classification 'all' key */
123 123
    public static final String CLASSIFICATION_ALL = "all";
124 124
    
125
    /** Fuzzy Name Cache search */
126
    public static final String FUZZY_NAME_CACHE = "name";
127
    
128
    /** Fuzzy Atomised Name search */
129
    public static final String FUZZY_ATOMISED = "atomised";
130
    
125 131
    private static final String DWC_DATASET_ID = "http://rs.tdwg.org/dwc/terms/datasetID";
126 132

  
127 133
    private static final DateTimeFormatter fmt = DateTimeFormat.forPattern("dd-MM-yyyy");
......
482 488
     *                Briefly described, this is equivalent to the edit distance between the two words, divided by 
483 489
     *                the length of the shorter of the compared terms.
484 490
     * @param hits               
485
     *            Maximum number of responses to be returned.    
491
     *            Maximum number of responses to be returned.
492
     * @param type               
493
     *            The type of fuzzy search to call. This can be either
494
     *              "name" : fuzzy searches scientific names corresponding to 'name cache' in CDM or
495
     *              "atomised" : parses the query into atomised elements and fuzzy searches the individual elements in the CDM      
486 496
     * @param request Http servlet request.
487 497
     * @param response Http servlet response.
488 498
     * @return a List of {@link NameSearch} objects each corresponding to a
......
494 504
    public ModelAndView doGetNameFuzzySearch(@RequestParam(value = "query", required = true) String[] queries,
495 505
    		@RequestParam(value = "accuracy", required = false, defaultValue = "0.6") String accuracy,
496 506
    		@RequestParam(value = "hits", required = false, defaultValue = "10") String hits,
507
    		@RequestParam(value = "type", required = false, defaultValue = FUZZY_NAME_CACHE) String type,
497 508
            HttpServletRequest request, HttpServletResponse response) throws IOException {
498 509
        ModelAndView mv = new ModelAndView();
499 510
        List<RemoteResponse> nsList = new ArrayList<RemoteResponse>();
......
532 543
            logger.info("doGetNameSearch()" + request.getRequestURI() + " for query \"" + queryWOWildcards + " with accuracy " + accuracy);
533 544
            //List<NonViralName> nameList = new ArrayList<NonViralName>();
534 545
            List<DocumentSearchResult> nameSearchList = new ArrayList<DocumentSearchResult>();
535
            try {            	            
536
				nameSearchList = service.findByNameFuzzySearch(
537
				        queryWOWildcards,
538
				        acc,
539
				        null,
540
				        false, 
541
				        h);
542
			} catch (ParseException e) {
546
            try {       
547
            	if(type.equals(FUZZY_ATOMISED)) {
548
            		nameSearchList = service.findByNameFuzzySearch(
549
            				queryWOWildcards,
550
            				acc,
551
            				null,
552
            				false, 
553
            				h);
554
            	} else {
555
            		nameSearchList = service.findByFuzzyNameCacheSearch(
556
            				queryWOWildcards,
557
            				acc,
558
            				null,
559
            				false, 
560
            				h);
561
            	}
562
            } catch (ParseException e) {
543 563
				// TODO Auto-generated catch block
544 564
				//e.printStackTrace();
545 565
				ErrorResponse er = new ErrorResponse();
cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/INameService.java
206 206
            int maxNoOfResults) throws CorruptIndexException, IOException, ParseException;
207 207
    
208 208
	/**
209
	 * Fuzzy matching against the name cache using only the lucene index. 
210
	 * 
211
	 *  
212
	 * @param name taxon name to fuzzy match
213
	 * @param accuracy value > 0.0 and < 1.0 which determines the accuracy of the result.
214
	 * @param languages list of languages to consider when matching (currently not used)
215
	 * @param highlightFragments
216
	 * @param maxNoOfResults 
217
	 * @return
218
	 * @throws CorruptIndexException
219
	 * @throws IOException
220
	 * @throws ParseException
221
	 */
222
    public List<DocumentSearchResult> findByFuzzyNameCacheSearch(
223
            String name,
224
            float accuracy,
225
            List<Language> languages,
226
            boolean highlightFragments, 
227
            int maxNoOfResults) throws CorruptIndexException, IOException, ParseException;
228
    
229
	/**
209 230
	 * Exact matching for the taxon name elements using only the lucene index.
210 231
	 * 
211 232
	 * The input name is first atomised using the {@link NonViralNameParserImpl}
cdmlib-services/src/main/java/eu/etaxonomy/cdm/api/service/NameServiceImpl.java
567 567
    protected LuceneSearch prepareFindByFuzzyNameSearch(Class<? extends CdmBase> clazz, 
568 568
    		NonViralName nvn,
569 569
    		float accuracy,
570
    		int maxNoOfResults,
570
    		int maxNoOfResults,    		
571 571
    		List<Language> languages,
572 572
    		boolean highlightFragments) {
573 573
    	String similarity = Float.toString(accuracy);    	
......
629 629
    	return luceneSearch;
630 630
    }
631 631
    
632
    protected LuceneSearch prepareFindByFuzzyTitleCacheSearch(Class<? extends CdmBase> clazz, 
633
    		String name,
634
    		float accuracy,
635
    		int maxNoOfResults,    		
636
    		List<Language> languages,
637
            boolean highlightFragments) {    	    	
638

  
639
        LuceneSearch luceneSearch = new LuceneSearch(getSession(), TaxonNameBase.class);
640
        QueryFactory queryFactory = new QueryFactory(luceneSearch);
641

  
642
        // ---- search criteria
643
        luceneSearch.setClazz(clazz);
644
        FuzzyLikeThisQuery fltq = new FuzzyLikeThisQuery(maxNoOfResults, luceneSearch.getAnalyzer());                      
645

  
646
        fltq.addTerms(name.toLowerCase(), "nameCache", accuracy, 3);           
647

  
648
     	BooleanQuery finalQuery = new BooleanQuery(false);
649
     	
650
     	finalQuery.add(fltq, Occur.MUST);   
651
     	
652
        luceneSearch.setQuery(finalQuery);
653

  
654
        if(highlightFragments){
655
            luceneSearch.setHighlightFields(queryFactory.getTextFieldNamesAsArray());
656
        }
657
        return luceneSearch;
658
    }
659
    
632 660
    protected LuceneSearch prepareFindByExactNameSearch(Class<? extends CdmBase> clazz, 
633 661
    		String name,
634 662
    		boolean wildcard,
......
731 759
        return searchResults;
732 760
    }
733 761
    
762
    public List<DocumentSearchResult> findByFuzzyNameCacheSearch(
763
            String name,
764
    		float accuracy,            
765
            List<Language> languages,
766
            boolean highlightFragments,
767
            int maxNoOfResults) throws CorruptIndexException, IOException, ParseException {
768

  
769
    	logger.info("Name to fuzzy search for : " + name);
770
    	
771
        LuceneSearch luceneSearch = prepareFindByFuzzyTitleCacheSearch(null, name, accuracy, maxNoOfResults, languages, highlightFragments);
772

  
773
        // --- execute search        
774
        TopDocs topDocs = luceneSearch.executeSearch(maxNoOfResults);
775
        Map<CdmBaseType, String> idFieldMap = new HashMap<CdmBaseType, String>();        
776

  
777
        // --- initialize taxa, highlight matches ....
778
        ISearchResultBuilder searchResultBuilder = new SearchResultBuilder(luceneSearch, luceneSearch.getQuery());
779
        
780
        @SuppressWarnings("rawtypes")
781
        List<DocumentSearchResult> searchResults = searchResultBuilder.createResultSet(topDocs, luceneSearch.getHighlightFields());
782

  
783
        return searchResults;
784
    }
785
    
734 786
    public List<DocumentSearchResult> findByNameExactSearch(
735 787
            String name,
736 788
            boolean wildcard,

Also available in: Unified diff

Add picture from clipboard (Maximum size: 40 MB)