Project

General

Profile

« Previous | Next » 

Revision c4591f7a

Added by Andreas Müller over 5 years ago

ref #7798 import freetext nameInSource for occurrences best way and log all problems

View differences:

app-import/src/main/java/eu/etaxonomy/cdm/io/berlinModel/in/BerlinModelOccurrenceSourceImport.java
11 11

  
12 12
import java.sql.ResultSet;
13 13
import java.sql.SQLException;
14
import java.util.ArrayList;
15 14
import java.util.HashMap;
16 15
import java.util.HashSet;
17
import java.util.List;
18 16
import java.util.Map;
19 17
import java.util.Set;
20 18

  
......
31 29
import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
32 30
import eu.etaxonomy.cdm.model.description.DescriptionElementSource;
33 31
import eu.etaxonomy.cdm.model.description.Distribution;
34
import eu.etaxonomy.cdm.model.name.INonViralName;
32
import eu.etaxonomy.cdm.model.description.TaxonDescription;
35 33
import eu.etaxonomy.cdm.model.name.TaxonName;
36 34
import eu.etaxonomy.cdm.model.reference.Reference;
35
import eu.etaxonomy.cdm.model.taxon.Taxon;
37 36

  
38 37

  
39 38
/**
......
53 52

  
54 53

  
55 54
	private Map<String, Integer> sourceNumberRefIdMap;
56
	private Set<String> unfoundReferences = new HashSet<>();
55
	private Map<String, Set<Integer>> nameCache2NameIdMap;
56
	private Set<String> notFoundReferences = new HashSet<>();
57 57

  
58 58

  
59 59
	public BerlinModelOccurrenceSourceImport(){
......
83 83

  
84 84
	@Override
85 85
	protected void doInvoke(BerlinModelImportState state) {
86
		unfoundReferences = new HashSet<>();
86
		notFoundReferences = new HashSet<>();
87 87

  
88 88
		try {
89 89
			sourceNumberRefIdMap = makeSourceNumberReferenceIdMap(state);
90
			nameCache2NameIdMap = makeNameCache2NameIdMap(state);
90 91
		} catch (SQLException e) {
91 92
			e.printStackTrace();
92 93
			throw new RuntimeException(e);
93 94
		}
94 95
		super.doInvoke(state);
95 96
		sourceNumberRefIdMap = null;
96
		if (unfoundReferences.size()>0){
97
			String unfound = "'" + CdmUtils.concat("','", unfoundReferences.toArray(new String[]{})) + "'";
97
		if (notFoundReferences.size()>0){
98
			String unfound = "'" + CdmUtils.concat("','", notFoundReferences.toArray(new String[]{})) + "'";
98 99
			logger.warn("Not found references: " + unfound);
99 100
		}
100 101
		return;
......
134 135
    					DescriptionElementSource originalSource = DescriptionElementSource.NewInstance(OriginalSourceType.PrimaryTaxonomicSource);
135 136
    					originalSource.setCitation(ref);
136 137
    					TaxonName taxonName;
137
						taxonName = TaxonName.castAndDeproxy(getName(state, oldName, oldNameFk));
138
						taxonName = TaxonName.castAndDeproxy(getName(state, oldName, oldNameFk, occurrenceSourceId, distribution));
138 139
						if (taxonName != null){
139
    						originalSource.setNameUsedInSource(taxonName);
140
						    if(isNotBlank(oldName) && !oldName.equals(taxonName.getNameCache())){
141
	                            originalSource.setOriginalNameString(oldName);
142
	                        }
143
						    originalSource.setNameUsedInSource(taxonName);
140 144
    					}else if(isNotBlank(oldName)){
141 145
    						originalSource.setOriginalNameString(oldName);
142 146
    					}
143 147
    					distribution.addSource(originalSource);
144 148
    				}else{
145 149
    					logger.warn("reference for sourceNumber "+sourceNumber+" could not be found. OccurrenceSourceId: " + occurrenceSourceId );
146
    					unfoundReferences.add(sourceNumber);
150
    					notFoundReferences.add(sourceNumber);
147 151
    				}
148 152
    			}else{
149 153
    				logger.warn("distribution ("+occurrenceFk+") for occurrence source (" + occurrenceSourceId + ") could not be found." );
......
170 174

  
171 175
		try{
172 176
			Set<String> occurrenceIdSet = new HashSet<>();
173
			Set<String> referenceIdSet = new HashSet<>();
174 177
			Set<String> nameIdSet = new HashSet<>();
175 178
			Set<String> sourceNumberSet = new HashSet<>();
179
			Set<String> oldNamesSet = new HashSet<>();
176 180
			while (rs.next()){
177 181
				handleForeignKey(rs, occurrenceIdSet, "occurrenceFk");
178 182
				handleForeignKey(rs, nameIdSet, "oldNameFk");
179 183
				sourceNumberSet.add(CdmUtils.NzTrim(rs.getString("SourceNumber")));
184
				oldNamesSet.add(CdmUtils.NzTrim(rs.getString("oldName")));
180 185
			}
181 186

  
182 187
			sourceNumberSet.remove("");
183
			referenceIdSet = handleSourceNumber(sourceNumberSet);
184

  
188
			Set<String> referenceIdSet = handleSourceNumber(sourceNumberSet);
189
            oldNamesSet.remove("");
190
            Set<String> oldNameIdSet = handleOldNames(oldNamesSet);
191
            nameIdSet.addAll(oldNameIdSet);
185 192

  
186 193
			//occurrence map
187 194
			nameSpace = BerlinModelOccurrenceImport.NAMESPACE;
......
224 231
		return referenceIdSet;
225 232
	}
226 233

  
234
    private Set<String> handleOldNames(Set<String> oldNamesSet) {
235
        Set<String> oldNameIdSet = new HashSet<>();
236

  
237
        for(String oldName : oldNamesSet){
238
            if (isNotBlank(oldName)){
239
                Set<Integer> nameIds = nameCache2NameIdMap.get(oldName);
240
                for (Integer nameId : nameIds){
241
                    oldNameIdSet.add(String.valueOf(nameId));
242
                }
243
            }
244
        }
245
        return oldNameIdSet;
246
    }
247

  
227 248

  
228 249

  
229 250
	/**
......
233 254
	 * @return
234 255
	 */
235 256
	boolean isFirstTimeNoNameByService = true;
236
	private INonViralName getName(BerlinModelImportState state, String oldName, Integer oldNameFk) {
257
	private TaxonName getName(BerlinModelImportState state, String oldName, Integer oldNameFk, Integer occSourceId, Distribution distribution) {
237 258
		TaxonName taxonName = (TaxonName)state.getRelatedObject(BerlinModelTaxonNameImport.NAMESPACE, String.valueOf(oldNameFk));
238
		if (taxonName == null && oldName != null){
239
			if (isFirstTimeNoNameByService){
240
				logger.warn("oldName not checked against names in BerlinModel. Just take it as a string");
241
				isFirstTimeNoNameByService = false;
242
			}
243
			List<INonViralName> names = new ArrayList<>();
244
//			names = getNameService().getNamesByNameCache(oldName);
245
			if (names.isEmpty()){
246
				return null;
247
			}else {
248
				if (names.size()> 1){
249
					logger.info("There is more than one name matching oldName: " + oldName + ".");
250
				}
251
				return names.get(0);
252
				//taxonName = nameParser.parseSimpleName(oldName);
253
			}
259
		if (oldNameFk != null && taxonName == null){
260
		    logger.warn("OldNameFk "+oldNameFk+" exists but taxonName not found for occSource: " + occSourceId);
261
		}
262
		if (isNotBlank(oldName)){
263
		    if (taxonName == null){
264
		        if (isFirstTimeNoNameByService){
265
		            logger.warn("oldName not checked against names in BerlinModel. Just take it as a string");
266
		            isFirstTimeNoNameByService = false;
267
		        }
268
		        Set<TaxonName> names = getOldNames(state, oldName);
269
		        if (names.isEmpty()){
270
		            logger.warn("No name found for freetext oldName '"+oldName+"'; occSourceId: " + occSourceId);
271
		            //taxonName = nameParser.parseSimpleName(oldName);
272
		            return null;
273
		        }else {
274
		            if (names.size()> 1){
275
		                TaxonName synName = getFirstSynonymName(state, names, distribution, occSourceId);
276
		                if (synName == null){
277
		                    logger.warn("There is more than one matching oldName for '"+oldName+"' but none of them is a synonym of the accepted taxon. Take arbitrary one. OccSourceId: " + occSourceId);
278
		                    return names.iterator().next();
279
		                }else{
280
		                    return synName;
281
		                }
282
		            }else{
283
                        return names.iterator().next();
284
		            }
285
		        }
286
		    }else if (!oldName.equals(taxonName.getNameCache())){
287
		        logger.warn("Old name freetext and linked name nameCache are not equal: " + oldName + "/" + taxonName.getNameCache() + "; occSourceId: " +  occSourceId);
288
	            return taxonName;
289
		    }else{
290
		        return taxonName;
291
		    }
292
		}else{
293
		    return taxonName;
254 294
		}
255
		return taxonName;
256 295
	}
257 296

  
258 297
	/**
298
     * @param state
299
     * @param names
300
     * @param taxon
301
     * @return
302
     */
303
    private TaxonName getFirstSynonymName(BerlinModelImportState state, Set<TaxonName> names, Distribution distribution, Integer occSourceId) {
304
        Taxon taxon = CdmBase.deproxy(distribution.getInDescription(), TaxonDescription.class).getTaxon();
305
        Set<TaxonName> synonyms = taxon.getSynonymNames();
306
        TaxonName result = null;
307
        for (TaxonName name : names){
308
            if (synonyms.contains(name)){
309
                if (result != null){
310
                    logger.warn("There is more than 1 matching synonym for " + name.getNameCache() + "; occSourceId: " + occSourceId);
311
                }
312
                result = name;
313
            }
314
        }
315
        return result;
316
    }
317

  
318
    /**
319
     * @param state
320
     * @param oldName
321
     * @return
322
     */
323
    private Set<TaxonName> getOldNames(BerlinModelImportState state, String oldName) {
324
        Set<Integer> nameIds = nameCache2NameIdMap.get(oldName);
325
        Set<TaxonName> names = new HashSet<>(nameIds.size());
326
        for (Integer id : nameIds){
327
            TaxonName name = (TaxonName)state.getRelatedObject(BerlinModelTaxonNameImport.NAMESPACE, String.valueOf(id));
328
            names.add(name);
329
        }
330
        return names;
331
    }
332

  
333
    /**
259 334
	 * Creates a map which maps source numbers on references
260 335
	 * @param state
261 336
	 * @return
......
284 359
		return result;
285 360
	}
286 361

  
362
	   /**
363
     * Creates a map which maps nameCaches to nameIDs numbers on references
364
     * @param state
365
     * @return
366
     * @throws SQLException
367
     */
368
    private Map<String, Set<Integer>> makeNameCache2NameIdMap(BerlinModelImportState state) throws SQLException {
369
        Map<String, Set<Integer>> result = new HashMap<>();
370

  
371
        Source source = state.getConfig().getSource();
372
        String strQuery = " SELECT NameId, nameCache " +
373
                          " FROM Name " +
374
                          " WHERE (nameCache IS NOT NULL) AND (nameCache NOT LIKE '') ";
375

  
376
        ResultSet rs = source.getResultSet(strQuery) ;
377
        while (rs.next()){
378
            int nameId = rs.getInt("NameId");
379
            String nameCache = rs.getString("nameCache");
380
            if (isNotBlank(nameCache)){
381
                nameCache = nameCache.trim();
382
                Set<Integer> set = result.get(nameCache);
383
                if (set == null){
384
                    set = new HashSet<>();
385
                    result.put(nameCache, set);
386
                }
387
                set.add(nameId);
388
            }
389
        }
390
        return result;
391
    }
392

  
393

  
394

  
287 395
	@Override
288 396
	protected boolean doCheck(BerlinModelImportState state){
289 397
		IOValidator<BerlinModelImportState> validator = new BerlinModelOccurrenceSourceImportValidator();

Also available in: Unified diff