/**
 * Copyright (C) 2007 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */

package eu.etaxonomy.cdm.io.algaterra;

import java.net.URI;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.ParseException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.springframework.format.datetime.joda.DateTimeParser;
import org.springframework.stereotype.Component;

import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade.DerivedUnitType;
import eu.etaxonomy.cdm.io.algaterra.validation.AlgaTerraDnaImportValidator;
import eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportConfigurator;
import eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportState;
import eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelTaxonImport;
import eu.etaxonomy.cdm.io.common.IOValidator;
import eu.etaxonomy.cdm.io.common.ResultSetPartitioner;
import eu.etaxonomy.cdm.model.common.Annotation;
import eu.etaxonomy.cdm.model.common.AnnotationType;
import eu.etaxonomy.cdm.model.common.CdmBase;
import eu.etaxonomy.cdm.model.common.Language;
import eu.etaxonomy.cdm.model.description.IndividualsAssociation;
import eu.etaxonomy.cdm.model.description.TaxonDescription;
import eu.etaxonomy.cdm.model.molecular.DnaSample;
import eu.etaxonomy.cdm.model.molecular.GenBankAccession;
import eu.etaxonomy.cdm.model.molecular.Locus;
import eu.etaxonomy.cdm.model.molecular.Sequence;
import eu.etaxonomy.cdm.model.occurrence.DerivationEvent;
import eu.etaxonomy.cdm.model.occurrence.DerivationEventType;
import eu.etaxonomy.cdm.model.occurrence.DerivedUnitBase;
import eu.etaxonomy.cdm.model.occurrence.FieldObservation;
import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
import eu.etaxonomy.cdm.model.reference.Reference;
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
import eu.etaxonomy.cdm.model.taxon.Taxon;
import eu.etaxonomy.cdm.model.taxon.TaxonBase;

/**
 * @author a.mueller
 * @created 01.09.2012
 */
@Component
public class AlgaTerraDnaImport extends AlgaTerraSpecimenImportBase {

	private static final Logger logger = Logger.getLogger(AlgaTerraDnaImport.class);

	private static int modCount = 5000;
	private static final String pluralString = "dna facts";
	private static final String dbTableName = "DNAFact";  //??

	public AlgaTerraDnaImport(){
		super(dbTableName, pluralString);
	}

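	// Id query for the partitioner: selects the DNAFactIds of all DNAFact records that are linked
	// to a Fact of category 203 (apparently the AlgaTerra fact category for DNA facts); records
	// flagged as protected are excluded when the configurator's removeRestricted flag is set.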
	/* (non-Javadoc)
	 * @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getIdQuery()
	 */
	@Override
	protected String getIdQuery(BerlinModelImportState bmState) {
		AlgaTerraImportState state = (AlgaTerraImportState)bmState;
		String result = " SELECT df.DNAFactId " +
				" FROM DNAFact df " +
					" INNER JOIN Fact f ON f.ExtensionFk = df.DNAFactID " +
					" WHERE f.FactCategoryFk = 203 ";
		if (state.getAlgaTerraConfigurator().isRemoveRestricted()){
			result = result + " AND df.ProtectedFlag = 0 ";
		}
		result += " ORDER BY df.DNAFactID ";
		return result;
	}

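	// Record query for one id partition: returns all DNAFact columns together with the related
	// taxon id (via PTaxon), the FactId and restrictedFlag of the linking Fact and, where the
	// culture strain number matches, the id of the corresponding EcoFact record.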
	/* (non-Javadoc)
	 * @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getRecordQuery(eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportConfigurator)
	 */
	@Override
	protected String getRecordQuery(BerlinModelImportConfigurator config) {
		String strQuery =
	            " SELECT df.*, pt.RIdentifier as taxonId, f.FactId, f.restrictedFlag, ecoFact.ecoFactId as ecoFactId " +
	            " FROM DNAFact df INNER JOIN Fact f ON f.ExtensionFk = df.DNAFactID " +
	            	" LEFT OUTER JOIN PTaxon pt ON f.PTNameFk = pt.PTNameFk AND f.PTRefFk = pt.PTRefFk " +
	            	" LEFT OUTER JOIN EcoFact ecoFact ON ecoFact.CultureStrain = df.CultureStrainNo " +
	            " WHERE f.FactCategoryFk = 203 AND (df.DNAFactId IN (" + ID_LIST_TOKEN + ") ) " +
	            " ORDER BY DNAFactID ";
		return strQuery;
	}

	/* (non-Javadoc)
	 * @see eu.etaxonomy.cdm.io.berlinModel.in.IPartitionedIO#doPartition(eu.etaxonomy.cdm.io.berlinModel.in.ResultSetPartitioner, eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportState)
	 */
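	// Imports one partition of DNAFact records: each row becomes a DnaSample with a Sequence,
	// Locus, optional GenBank accession, annotation and preliminary title cache, is derived from
	// the matching EcoFact (if any) and attached to its taxon via an IndividualsAssociation;
	// the collected samples and taxa are saved at the end of the partition.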
	public boolean doPartition(ResultSetPartitioner partitioner, BerlinModelImportState bmState) {
		boolean success = true;

		AlgaTerraImportState state = (AlgaTerraImportState)bmState;
		try {
//			makeVocabulariesAndFeatures(state);
		} catch (Exception e1) {
			logger.warn("Exception occurred when trying to create Ecofact vocabularies: " + e1.getMessage());
			e1.printStackTrace();
		}
		Set<SpecimenOrObservationBase> samplesToSave = new HashSet<SpecimenOrObservationBase>();
		Set<TaxonBase> taxaToSave = new HashSet<TaxonBase>();

		Map<String, FieldObservation> ecoFactFieldObservationMap = (Map<String, FieldObservation>) partitioner.getObjectMap(ECO_FACT_FIELD_OBSERVATION_NAMESPACE);

		ResultSet rs = partitioner.getResultSet();

		Map<String, Reference> referenceMap = new HashMap<String, Reference>();

		try {

			int i = 0;

			//for each DNAFact record
			while (rs.next()){

				if ((i++ % modCount) == 0 && i != 1){ logger.info(pluralString + " handled: " + (i-1));}

				int dnaFactId = rs.getInt("DNAFactId");
				String keywordsStr = rs.getString("Keywords");
				String locusStr = rs.getString("Locus");
				String definitionStr = rs.getString("Definition");

				try {

					//source ref
					Reference<?> sourceRef = state.getTransactionalSourceReference();

					//import date
					DateTime importDateTime = makeImportDateTime(rs);

					//DNA Sample
					DnaSample dnaSample = DnaSample.NewInstance();
					dnaSample.setCreated(importDateTime);

					//ecoFactFk
					makeDerivationFromEcoFact(state, rs, dnaSample, samplesToSave, dnaFactId);

					//sequence
					Sequence sequence = makeSequence(rs, dnaSample, dnaFactId, importDateTime);

					//locus
					Locus locus = Locus.NewInstance(keywordsStr, definitionStr);
					locus.setCreated(importDateTime);
					sequence.setLocus(locus);

					//GenBank Accession
					makeGenBankAccession(rs, sequence, importDateTime, dnaFactId);

					//Comment
					String commentStr = rs.getString("Comment");
					if (isNotBlank(commentStr)){
						Annotation annotation = Annotation.NewInstance(commentStr, AnnotationType.EDITORIAL(), Language.DEFAULT());
						annotation.setCreated(importDateTime);
						sequence.addAnnotation(annotation);
					}

					//Indiv.Assoc.
					makeIndividualsAssociation(partitioner, rs, state, taxaToSave, dnaSample);

					//TODO titleCache
					//prelim implementation:
					String cultStrain = rs.getString("CultureStrainNo");
					String title = String.format("DNA Sample for %s at %s", cultStrain, keywordsStr);
					dnaSample.setTitleCache(title, true);

					//TODO preliminary implementation
					String referenceStr = rs.getString("FactReference");
					if (isNotBlank(referenceStr)){
						Reference<?> ref = referenceMap.get(referenceStr);
						if (ref == null){
							ref = ReferenceFactory.newGeneric();
							ref.setTitleCache(referenceStr, true);
							referenceMap.put(referenceStr, ref);
						}
						sequence.setPublishedIn(ref);
					}

					//save
					samplesToSave.add(dnaSample);

				} catch (Exception e) {
					logger.warn("Exception in DNAFact: dnaFactId " + dnaFactId + ". " + e.getMessage());
					e.printStackTrace();
				}
			}

			logger.warn("DNASamples or EcoFacts to save: " + samplesToSave.size());
			getOccurrenceService().saveOrUpdate(samplesToSave);
			logger.warn("Taxa to save: " + taxaToSave.size());
			getTaxonService().saveOrUpdate(taxaToSave);

			return success;
		} catch (SQLException e) {
			logger.error("SQLException:" + e);
			return false;
		}
	}

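	/**
	 * Links the given {@link DnaSample} to the EcoFact derived unit referenced by the record's
	 * ecoFactId (resolved from the related objects of this partition) via a DNA_EXTRACTION
	 * derivation event. Logs a warning if the referenced EcoFact cannot be resolved.
	 */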
	private void makeDerivationFromEcoFact(AlgaTerraImportState state, ResultSet rs, DnaSample dnaSample, Set<SpecimenOrObservationBase> samplesToSave, Integer dnaFactId) throws SQLException {
		Integer ecoFactFk = nullSafeInt(rs, "ecoFactId");
		if (ecoFactFk != null){

			DerivedUnitBase<?> ecoFact = (DerivedUnitBase<?>)state.getRelatedObject(ECO_FACT_DERIVED_UNIT_NAMESPACE, ecoFactFk.toString());
			if (ecoFact == null){
				logger.warn("EcoFact is null for ecoFactFk: " + ecoFactFk + ", DnaFactId: " + dnaFactId);
			}else{
				DerivationEvent.NewSimpleInstance(ecoFact, dnaSample, DerivationEventType.DNA_EXTRACTION());
				samplesToSave.add(ecoFact);
			}
		}
	}

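	/**
	 * Associates the {@link DnaSample} with the taxon of the underlying fact: the taxon is
	 * resolved from the partition's taxon map, a {@link TaxonDescription} for the source
	 * reference is obtained and an {@link IndividualsAssociation} pointing to the sample is
	 * added to it; the taxon is registered for saving.
	 */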
	private void makeIndividualsAssociation(ResultSetPartitioner partitioner, ResultSet rs, AlgaTerraImportState state, Set<TaxonBase> taxaToSave, DnaSample dnaSample) throws SQLException{
		Reference<?> sourceRef = state.getTransactionalSourceReference();
		Map<String, TaxonBase> taxonMap = (Map<String, TaxonBase>) partitioner.getObjectMap(BerlinModelTaxonImport.NAMESPACE);
		Integer taxonId = rs.getInt("taxonId");
		Integer factId = rs.getInt("factId");
		Taxon taxon = getTaxon(state, taxonId, taxonMap, factId);
		TaxonDescription desc = getTaxonDescription(state, taxon, sourceRef);
		IndividualsAssociation assoc = IndividualsAssociation.NewInstance(dnaSample);
		desc.addElement(assoc);
		taxaToSave.add(taxon);
	}

	/**
	 * Parses the first ten characters of the ImportDateTime column (pattern dd.MM.yyyy)
	 * into a {@link DateTime}; returns <code>null</code> if the column is blank.
	 *
	 * @param rs
	 * @return the import date or <code>null</code>
	 * @throws SQLException
	 * @throws ParseException
	 */
	private DateTime makeImportDateTime(ResultSet rs) throws SQLException, ParseException {
		DateTime importDateTime = null;
		String importDateTimeStr = rs.getString("ImportDateTime");
		if (isNotBlank(importDateTimeStr)){
			importDateTimeStr = importDateTimeStr.substring(0,10);
			DateTimeFormatter dayFormatter = DateTimeFormat.forPattern("dd.MM.yyyy");

//			DateTimeFormatter formatter = new DateTimeFormatterBuilder().
//					append;
			DateTimeParser p = new DateTimeParser(dayFormatter);
			importDateTime = p.parse(importDateTimeStr, Locale.GERMANY);
		}
		return importDateTime;
	}

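	/**
	 * Creates the {@link Sequence} for a DNAFact row from the PlainSequence column. The sequence
	 * string is truncated to 255 characters (with a warning); the length is taken from SeqLen,
	 * falling back to the length of the plain sequence, and a warning is logged if the two differ.
	 */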
	private Sequence makeSequence(ResultSet rs, DnaSample dnaSample, int dnaFactId, DateTime importDateTime) throws SQLException {
		String sequenceStr = rs.getString("PlainSequence");
		Integer originalLen = null;
		Integer seqLen = nullSafeInt(rs, "SeqLen");
		if (seqLen == null){
			if (sequenceStr != null){
				seqLen = sequenceStr.length();
			}
		}

		if (sequenceStr != null){
			originalLen = sequenceStr.length();
			if (originalLen > 255){
				logger.warn("Sequence truncated. Id: " + dnaFactId);
				sequenceStr = sequenceStr.substring(0, 255);
			}
		}else{
			logger.warn("PlainSequence is null. Id: " + dnaFactId);
		}
		Sequence sequence = Sequence.NewInstance(sequenceStr);
		sequence.setLength(seqLen);
		if (originalLen != null && ! originalLen.equals(seqLen)){
			logger.warn("SeqLen (" + seqLen + ") and OriginalLen (" + originalLen + ") differ for dnaFact: " + dnaFactId);
		}

		sequence.setCreated(importDateTime);
		dnaSample.addSequences(sequence);
		return sequence;
	}

	/**
	 * Creates a {@link GenBankAccession} for the sequence from the Accession, Version and Notes
	 * columns. The Notes column is interpreted as the GenBank URI if it starts with "http";
	 * the accession is only created if accession number and version look like GenBank syntax
	 * or a GenBank URI is present.
	 *
	 * @param rs
	 * @param sequence
	 * @param importDateTime
	 * @param dnaFactId
	 * @throws SQLException
	 */
	private void makeGenBankAccession(ResultSet rs, Sequence sequence, DateTime importDateTime, Integer dnaFactId) throws SQLException {
		String accessionStr = rs.getString("Accession");
		String notesStr = rs.getString("Notes");
		String versionStr = rs.getString("Version");

		URI genBankUri = null;
		if (StringUtils.isNotBlank(notesStr)){
			if (notesStr.startsWith("http")){
				genBankUri = URI.create(notesStr);
			}else{
				logger.warn("Notes do not start with URI: " + notesStr);
			}
		}

		if (isNotBlank(accessionStr) || genBankUri != null){
			if (accessionStr != null && accessionStr.trim().equals("")){
				accessionStr = null;
			}
			if (isGenBankAccessionNumber(accessionStr, versionStr, genBankUri, dnaFactId) || genBankUri != null){
				GenBankAccession accession = GenBankAccession.NewInstance(accessionStr);
				accession.setUri(genBankUri);
				accession.setCreated(importDateTime);
				sequence.addGenBankAccession(accession);
			}
		}
	}

	private boolean isGenBankAccessionNumber(String accessionStr, String versionStr, URI genBankUri, Integer dnaFactId) {
		boolean isGenBankAccessionNumber = accessionStr != null && accessionStr.matches("[A-Z]{2}\\d{6}");
		boolean versionHasGenBankPart = versionStr != null && versionStr.matches(".*GI:.*");
		if (isGenBankAccessionNumber && versionHasGenBankPart){
			return true;
		}else {
			if (genBankUri != null){
				logger.warn("GenBank Uri exists but accession or version have not been identified to use GenBank syntax. DNAFactID: " + dnaFactId);
			}
			if(isGenBankAccessionNumber || versionHasGenBankPart){
				logger.warn("Either accession (" + accessionStr + ") or version (" + versionStr + ") uses GenBank syntax but the other does not. DNAFactID: " + dnaFactId);
			}
			return false;
		}
	}

	protected String getDerivedUnitNameSpace(){
		return ECO_FACT_DERIVED_UNIT_NAMESPACE;
	}

	protected String getFieldObservationNameSpace(){
		return ECO_FACT_FIELD_OBSERVATION_NAMESPACE;
	}

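	/**
	 * Maps a record basis string (e.g. "PreservedSpecimen", "HumanObservation") to the
	 * corresponding {@link DerivedUnitType}; not yet supported values ("Literature",
	 * "MachineObservation") fall back to DerivedUnit or Observation with a warning, unknown
	 * values yield <code>null</code>.
	 */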
	private DerivedUnitType makeDerivedUnitType(String recordBasis) {
		DerivedUnitType result = null;
		if (StringUtils.isBlank(recordBasis)){
			result = DerivedUnitType.DerivedUnit;
		}else if (recordBasis.equalsIgnoreCase("FossileSpecimen")){
			result = DerivedUnitType.Fossil;
		}else if (recordBasis.equalsIgnoreCase("HumanObservation")){
			result = DerivedUnitType.Observation;
		}else if (recordBasis.equalsIgnoreCase("Literature")){
			logger.warn("Literature record basis not yet supported");
			result = DerivedUnitType.DerivedUnit;
		}else if (recordBasis.equalsIgnoreCase("LivingSpecimen")){
			result = DerivedUnitType.LivingBeing;
		}else if (recordBasis.equalsIgnoreCase("MachineObservation")){
			logger.warn("MachineObservation record basis not yet supported");
			result = DerivedUnitType.Observation;
		}else if (recordBasis.equalsIgnoreCase("PreservedSpecimen")){
			result = DerivedUnitType.Specimen;
		}
		return result;
	}

	/* (non-Javadoc)
	 * @see eu.etaxonomy.cdm.io.berlinModel.in.IPartitionedIO#getRelatedObjectsForPartition(java.sql.ResultSet)
	 */
	public Map<Object, Map<String, ? extends CdmBase>> getRelatedObjectsForPartition(ResultSet rs) {
		String nameSpace;
		Class cdmClass;
		Set<String> idSet;
		Map<Object, Map<String, ? extends CdmBase>> result = new HashMap<Object, Map<String, ? extends CdmBase>>();

		try{
			Set<String> taxonIdSet = new HashSet<String>();
			Set<String> ecoFactFkSet = new HashSet<String>();

			while (rs.next()){
				handleForeignKey(rs, taxonIdSet, "taxonId");
				handleForeignKey(rs, ecoFactFkSet, "ecoFactId");
			}

			//taxon map
			nameSpace = BerlinModelTaxonImport.NAMESPACE;
			cdmClass = TaxonBase.class;
			idSet = taxonIdSet;
			Map<String, TaxonBase> objectMap = (Map<String, TaxonBase>)getCommonService().getSourcedObjectsByIdInSource(cdmClass, idSet, nameSpace);
			result.put(nameSpace, objectMap);

			//eco fact derived unit map
			nameSpace = AlgaTerraFactEcologyImport.ECO_FACT_DERIVED_UNIT_NAMESPACE;
			cdmClass = DerivedUnitBase.class;
			idSet = ecoFactFkSet;
			Map<String, DerivedUnitBase> derivedUnitMap = (Map<String, DerivedUnitBase>)getCommonService().getSourcedObjectsByIdInSource(cdmClass, idSet, nameSpace);
			result.put(nameSpace, derivedUnitMap);

		} catch (SQLException e) {
			throw new RuntimeException(e);
		}
		return result;
	}

	/* (non-Javadoc)
	 * @see eu.etaxonomy.cdm.io.common.CdmIoBase#doCheck(eu.etaxonomy.cdm.io.common.IoStateBase)
	 */
	@Override
	protected boolean doCheck(BerlinModelImportState state){
		IOValidator<BerlinModelImportState> validator = new AlgaTerraDnaImportValidator();
		return validator.validate(state);
	}

	/* (non-Javadoc)
	 * @see eu.etaxonomy.cdm.io.common.CdmIoBase#isIgnore(eu.etaxonomy.cdm.io.common.IImportConfigurator)
	 */
	protected boolean isIgnore(BerlinModelImportState state){
		return ! ((AlgaTerraImportState)state).getAlgaTerraConfigurator().isDoDna();
	}

}