Project

General

Profile

Revision ef422766

ID ef422766cdf356f4a14c2f09c8dfd11ee55d8955
Parent 8ab014c8
Child e1958cb8

Added by Cherian Mathew over 7 years ago

NameCatalogueController : added 'fuzzy' endpoint to perform fuzzy matching on names
NameSearch : updated NameSearch dto to include similarity score

View differences:

cdmlib-remote/src/main/java/eu/etaxonomy/cdm/remote/controller/dto/NameCatalogueController.java
23 23
import java.util.Hashtable;
24 24

  
25 25
import org.apache.log4j.Level;
26
import org.apache.lucene.queryParser.ParseException;
26 27
import org.joda.time.DateTime;
27 28
import org.joda.time.format.DateTimeFormat;
28 29
import org.joda.time.format.DateTimeFormatter;
......
41 42
import eu.etaxonomy.cdm.api.service.INameService;
42 43
import eu.etaxonomy.cdm.api.service.ITaxonService;
43 44
import eu.etaxonomy.cdm.api.service.ITermService;
45
import eu.etaxonomy.cdm.api.service.search.SearchResult;
44 46
import eu.etaxonomy.cdm.common.DocUtils;
47
import eu.etaxonomy.cdm.hibernate.HibernateProxyHelper;
45 48

  
46 49
import eu.etaxonomy.cdm.remote.controller.BaseController;
47 50
import eu.etaxonomy.cdm.remote.dto.common.ErrorResponse;
......
56 59
import eu.etaxonomy.cdm.model.common.IdentifiableSource;
57 60
import eu.etaxonomy.cdm.model.common.Language;
58 61
import eu.etaxonomy.cdm.model.common.Representation;
62
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
59 63
import eu.etaxonomy.cdm.model.description.Feature;
60 64
import eu.etaxonomy.cdm.model.name.NonViralName;
61 65
import eu.etaxonomy.cdm.model.name.TaxonNameBase;
......
114 118
    private static final String DWC_DATASET_ID = "http://rs.tdwg.org/dwc/terms/datasetID";
115 119

  
116 120
    private static final DateTimeFormatter fmt = DateTimeFormat.forPattern("dd-MM-yyyy");
121
    
117 122
    @Autowired
118 123
    private ITaxonService taxonService;
119 124
    
......
123 128
    
124 129
    @Autowired
125 130
    private ICommonService commonService;
131
    
126 132
    /** Hibernate name search initialisation strategy */
127 133
    private static final List<String> NAME_SEARCH_INIT_STRATEGY = Arrays.asList(new String[] {
128 134
            "combinationAuthorTeam.$",
......
379 385
                        }
380 386
                    }
381 387
                    // update name search object
382
                    ns.addToResponseList(nvn.getTitleCache(), nvn.getNameCache(), nvn.getUuid()
388
                    ns.addToResponseList(nvn.getTitleCache(), nvn.getNameCache(), 0, nvn.getUuid()
383 389
                            .toString(), tbSet, accTbSet);
384 390
                }
385 391
                nsList.add(ns);
......
395 401
        return mv;
396 402
    }
397 403

  
404
    /**
405
     * Returns a list of scientific names similar to the <code>{query}</code>
406
     * string pattern. Each of these scientific names is accompanied by a list of
407
     * name uuids, a list of taxon uuids and a list of accepted taxon uuids.
408
     * The underlying (Lucene FuzzyQuery) string distance metric used is based on a 
409
     * fail-fast Levenshtein distance algorithm (is aborted if it is discovered that 
410
     * the mimimum distance between the words is greater than some threshold)
411
     * <p>
412
     * Endpoint documentation can be found <a href="{@docRoot}/../remote/name-catalogue-fuzzy.html">here</a>
413
     * <p>
414
     * URI: <b>&#x002F;{datasource-name}&#x002F;name_catalogue/fuzzy</b>
415
     *
416
     * @param query
417
     *                The scientific name pattern(s) to query for. Any wildcard characters in the
418
     *                query are removed.
419
     * @param accuracy
420
     *                Similarity measure (between 0 and 1) to impose on the matching algorithm.
421
     *                Briefly described, this is equivalent to the edit distance between the two words, divided by 
422
     *                the length of the shorter of the compared terms.
423
     * @param hits               
424
     *            Maximum number of responses to be returned.    
425
     * @param request Http servlet request.
426
     * @param response Http servlet response.
427
     * @return a List of {@link NameSearch} objects each corresponding to a
428
     *         single query. These are built from {@link TaxonNameBase} entities
429
     *         which are in turn initialized using the {@link #NAME_SEARCH_INIT_STRATEGY}
430
     * @throws IOException
431
     */
432
    @RequestMapping(value = { "fuzzy" }, method = RequestMethod.GET, params = {"query"})
433
    public ModelAndView doGetNameFuzzySearch(@RequestParam(value = "query", required = true) String[] queries,
434
    		@RequestParam(value = "accuracy", required = false, defaultValue = "0.5") String accuracy,
435
    		@RequestParam(value = "hits", required = false, defaultValue = "10") String hits,
436
            HttpServletRequest request, HttpServletResponse response) throws IOException {
437
        ModelAndView mv = new ModelAndView();
438
        List<RemoteResponse> nsList = new ArrayList<RemoteResponse>();
439
        float acc = 0.5f;
440
        int h = 10;
441
        try {
442
        	acc = Float.parseFloat(accuracy);
443
        	h = Integer.parseInt(hits);
444
        } catch(NumberFormatException nfe) {
445
        	ErrorResponse er = new ErrorResponse();
446
        	er.setErrorMessage("accuracy or hits parameter is not a number");
447
        	mv.addObject(er);
448
            return mv;
449
        }
450
        // search through each query
451
        for (String query : queries) {
452
        	if(query.equals("")) {
453
				ErrorResponse er = new ErrorResponse();
454
                er.setErrorMessage("Empty query field");
455
                nsList.add(er);
456
                continue;
457
        	}
458
        	// remove wildcards if any
459
            String queryWOWildcards = getQueryWithoutWildCards(query);
460
            // convert first char to upper case
461
            char[] stringArray = queryWOWildcards.toCharArray();
462
            stringArray[0] = Character.toUpperCase(stringArray[0]);
463
            queryWOWildcards = new String(stringArray);
464
            logger.info("doGetNameSearch()" + request.getServletPath() + " for query \"" + queryWOWildcards + " with accuracy " + accuracy);
465
            //List<NonViralName> nameList = new ArrayList<NonViralName>();
466
            List<SearchResult<TaxonNameBase>> nameSearchList = new ArrayList<SearchResult<TaxonNameBase>>();
467
            try {            	            
468
				nameSearchList = service.findByNameFuzzySearch(
469
				        queryWOWildcards,
470
				        acc,
471
				        null,
472
				        false, 
473
				        NAME_SEARCH_INIT_STRATEGY,
474
				        h);
475
			} catch (ParseException e) {
476
				// TODO Auto-generated catch block
477
				//e.printStackTrace();
478
				ErrorResponse er = new ErrorResponse();
479
                er.setErrorMessage("Could not parse name : " + queryWOWildcards);
480
                nsList.add(er);
481
                continue;
482
			} 
483
     
484

  
485
            // if search is successful then get related information , else return error
486
            if (nameSearchList == null || !nameSearchList.isEmpty()) {
487
                NameSearch ns = new NameSearch();
488
                ns.setRequest(query);
489

  
490
                for (SearchResult searchResult : nameSearchList) {
491
                	NonViralName nvn = HibernateProxyHelper.deproxy(searchResult.getEntity(), NonViralName.class);
492
                	
493
                    // we need to retrieve both taxon uuid of name queried and
494
                    // the corresponding accepted taxa.
495
                    // reason to return accepted taxa also, is to be able to get from
496
                    // scientific name to taxon concept in two web service calls.
497
                    Set<TaxonBase> tbSet = nvn.getTaxonBases();
498
                    Set<TaxonBase> accTbSet = new HashSet<TaxonBase>();
499
                    for (TaxonBase tb : tbSet) {
500
                        // if synonym then get accepted taxa.
501
                        if (tb instanceof Synonym) {
502
                            Synonym synonym = (Synonym) tb;
503
                            Set<SynonymRelationship> synRelationships = synonym.getSynonymRelations();
504
                            for (SynonymRelationship sr : synRelationships) {
505
                                Taxon accTaxon = sr.getAcceptedTaxon();
506
                                accTbSet.add(accTaxon);
507
                            }
508
                        } else {
509
                            accTbSet.add(tb);
510
                        }
511
                    }
512
                    // update name search object
513
                    ns.addToResponseList(nvn.getTitleCache(), nvn.getNameCache(), searchResult.getMaxScore(), nvn.getUuid()
514
                            .toString(), tbSet, accTbSet);
515
                }
516
                nsList.add(ns);
517

  
518
            } else {
519
                ErrorResponse er = new ErrorResponse();
520
                er.setErrorMessage("No Taxon Name matches : " + query + ", for given accuracy");
521
                nsList.add(er);
522
            }
523
        }        
524

  
525
        mv.addObject(nsList);
526
        return mv;
527
    }
398 528

  
399 529
    /**
400 530
     * Returns a documentation page for the Name Information API.
......
988 1118
            String classificationKey = removeInternalWhitespace(c.getTitleCache());
989 1119
            if(c.getReference() != null) {
990 1120
                refTitleCache = c.getReference().getTitleCache();
1121
                c.getAllNodes();
991 1122
            }
992 1123
            // default is the first element of the list
993 1124
            // always created with the same sorting (DESCENDING)
cdmlib-remote/src/main/java/eu/etaxonomy/cdm/remote/dto/namecatalogue/NameSearch.java
43 43
        return this.request;
44 44
    }
45 45

  
46
    public void addToResponseList(String title, String name, String nameUuid, Set<TaxonBase> taxonBases,Set<TaxonBase> accTaxonBases) {
46
    public void addToResponseList(String title, String name, float score, String nameUuid, Set<TaxonBase> taxonBases,Set<TaxonBase> accTaxonBases) {
47 47

  
48 48
        NameSearch.NameSearchResponse res = responseWithtitle(title);
49 49
        if (res == null) {
50 50
            res = new NameSearch.NameSearchResponse();
51 51
            res.setTitle(title);
52 52
            res.setName(name);
53
            res.setScore(score);
53 54
            response.add(res);
54 55
        }
55 56
        res.addToNameUuids(nameUuid);
......
92 93
    public class NameSearchResponse {
93 94
        private String title;
94 95
        private String name;
95
        private Set<String> nameUuids;
96
        private float score;
97
		private Set<String> nameUuids;
96 98
        private Set<String> taxonConceptUuids;
97 99
        private Set<String> acceptedTaxontUuids;
98 100

  
99 101
        public NameSearchResponse() {
100 102
            title = "";
101 103
            name = "";
104
            score = 0;
102 105
            nameUuids = new HashSet<String>();
103 106
            taxonConceptUuids = new HashSet<String>();
104 107
            acceptedTaxontUuids = new HashSet<String>();
......
120 123
            return this.name;
121 124
        }
122 125

  
126
        public float getScore() {
127
			return score;
128
		}
129

  
130
		public void setScore(float score) {
131
			this.score = score;
132
		}
133
		
123 134
        public void addToNameUuids(String nameUuid) {
124 135
            nameUuids.add(nameUuid);
125 136
        }

Also available in: Unified diff

Add picture from clipboard (Maximum size: 40 MB)