1
|
/**
|
2
|
* Copyright (C) 2007 EDIT
|
3
|
* European Distributed Institute of Taxonomy
|
4
|
* http://www.e-taxonomy.eu
|
5
|
*
|
6
|
* The contents of this file are subject to the Mozilla Public License Version 1.1
|
7
|
* See LICENSE.TXT at the top of this package for the full license terms.
|
8
|
*/
|
9
|
|
10
|
package eu.etaxonomy.cdm.io.algaterra;
|
11
|
|
12
|
import java.net.URI;
|
13
|
import java.sql.ResultSet;
|
14
|
import java.sql.SQLException;
|
15
|
import java.text.ParseException;
|
16
|
import java.util.HashMap;
|
17
|
import java.util.HashSet;
|
18
|
import java.util.Locale;
|
19
|
import java.util.Map;
|
20
|
import java.util.Set;
|
21
|
|
22
|
import org.apache.commons.lang.StringUtils;
|
23
|
import org.apache.log4j.Logger;
|
24
|
import org.joda.time.DateTime;
|
25
|
import org.joda.time.format.DateTimeFormat;
|
26
|
import org.joda.time.format.DateTimeFormatter;
|
27
|
import org.springframework.format.datetime.joda.DateTimeParser;
|
28
|
import org.springframework.stereotype.Component;
|
29
|
|
30
|
import eu.etaxonomy.cdm.api.facade.DerivedUnitFacade.DerivedUnitType;
|
31
|
import eu.etaxonomy.cdm.io.algaterra.validation.AlgaTerraDnaImportValidator;
|
32
|
import eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportConfigurator;
|
33
|
import eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportState;
|
34
|
import eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelTaxonImport;
|
35
|
import eu.etaxonomy.cdm.io.common.IOValidator;
|
36
|
import eu.etaxonomy.cdm.io.common.ResultSetPartitioner;
|
37
|
import eu.etaxonomy.cdm.model.common.Annotation;
|
38
|
import eu.etaxonomy.cdm.model.common.AnnotationType;
|
39
|
import eu.etaxonomy.cdm.model.common.CdmBase;
|
40
|
import eu.etaxonomy.cdm.model.common.Language;
|
41
|
import eu.etaxonomy.cdm.model.description.IndividualsAssociation;
|
42
|
import eu.etaxonomy.cdm.model.description.TaxonDescription;
|
43
|
import eu.etaxonomy.cdm.model.molecular.DnaSample;
|
44
|
import eu.etaxonomy.cdm.model.molecular.GenBankAccession;
|
45
|
import eu.etaxonomy.cdm.model.molecular.Locus;
|
46
|
import eu.etaxonomy.cdm.model.molecular.Sequence;
|
47
|
import eu.etaxonomy.cdm.model.occurrence.DerivationEvent;
|
48
|
import eu.etaxonomy.cdm.model.occurrence.DerivationEventType;
|
49
|
import eu.etaxonomy.cdm.model.occurrence.DerivedUnitBase;
|
50
|
import eu.etaxonomy.cdm.model.occurrence.FieldObservation;
|
51
|
import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
|
52
|
import eu.etaxonomy.cdm.model.reference.Reference;
|
53
|
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
|
54
|
import eu.etaxonomy.cdm.model.taxon.Taxon;
|
55
|
import eu.etaxonomy.cdm.model.taxon.TaxonBase;
|
56
|
|
57
|
|
58
|
/**
|
59
|
* @author a.mueller
|
60
|
* @created 01.09.2012
|
61
|
*/
|
62
|
@Component
|
63
|
public class AlgaTerraDnaImport extends AlgaTerraSpecimenImportBase {
|
64
|
	private static final Logger logger = Logger.getLogger(AlgaTerraDnaImport.class);

	// Number of handled records between two progress log messages.
	private static int modCount = 5000;
	// Human-readable name of the imported objects, used in log output.
	private static final String pluralString = "dna facts";
	// Source database table this import reads from.
	// NOTE(review): the original "//??" marker suggests the table name was unconfirmed — verify against the source schema.
	private static final String dbTableName = "DNAFact";

	/**
	 * Creates the import and registers table name and plural string
	 * with the base class (used for logging and partitioning).
	 */
	public AlgaTerraDnaImport(){
		super(dbTableName, pluralString);
	}
|
75
|
|
76
|
|
77
|
|
78
|
/* (non-Javadoc)
|
79
|
* @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getIdQuery()
|
80
|
*/
|
81
|
@Override
|
82
|
protected String getIdQuery(BerlinModelImportState bmState) {
|
83
|
AlgaTerraImportState state = (AlgaTerraImportState)bmState;
|
84
|
String result = " SELECT df.DNAFactId " +
|
85
|
" FROM DNAFact df " +
|
86
|
" INNER JOIN Fact f ON f.ExtensionFk = df.DNAFactID " +
|
87
|
" WHERE f.FactCategoryFk = 203 ";
|
88
|
if (state.getAlgaTerraConfigurator().isRemoveRestricted()){
|
89
|
result = result + " AND df.ProtectedFlag = 0 ";
|
90
|
}
|
91
|
result += " ORDER BY df.DNAFactID ";
|
92
|
return result;
|
93
|
}
|
94
|
|
95
|
/* (non-Javadoc)
|
96
|
* @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getRecordQuery(eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportConfigurator)
|
97
|
*/
|
98
|
@Override
|
99
|
protected String getRecordQuery(BerlinModelImportConfigurator config) {
|
100
|
String strQuery =
|
101
|
" SELECT df.*, pt.RIdentifier as taxonId, f.FactId, f.restrictedFlag, ecoFact.ecoFactId as ecoFactId " +
|
102
|
" FROM DNAFact df INNER JOIN Fact f ON f.ExtensionFk = df.DNAFactID " +
|
103
|
" LEFT OUTER JOIN PTaxon pt ON f.PTNameFk = pt.PTNameFk AND f.PTRefFk = pt.PTRefFk " +
|
104
|
" LEFT OUTER JOIN EcoFact ecoFact ON ecoFact.CultureStrain = df.CultureStrainNo " +
|
105
|
" WHERE f.FactCategoryFk = 203 AND (df.DNAFactId IN (" + ID_LIST_TOKEN + ") )"
|
106
|
+ " ORDER BY DNAFactID "
|
107
|
;
|
108
|
return strQuery;
|
109
|
}
|
110
|
|
111
|
/* (non-Javadoc)
|
112
|
* @see eu.etaxonomy.cdm.io.berlinModel.in.IPartitionedIO#doPartition(eu.etaxonomy.cdm.io.berlinModel.in.ResultSetPartitioner, eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportState)
|
113
|
*/
|
114
|
public boolean doPartition(ResultSetPartitioner partitioner, BerlinModelImportState bmState) {
|
115
|
boolean success = true;
|
116
|
|
117
|
AlgaTerraImportState state = (AlgaTerraImportState)bmState;
|
118
|
try {
|
119
|
// makeVocabulariesAndFeatures(state);
|
120
|
} catch (Exception e1) {
|
121
|
logger.warn("Exception occurred when trying to create Ecofact vocabularies: " + e1.getMessage());
|
122
|
e1.printStackTrace();
|
123
|
}
|
124
|
Set<SpecimenOrObservationBase> samplesToSave = new HashSet<SpecimenOrObservationBase>();
|
125
|
Set<TaxonBase> taxaToSave = new HashSet<TaxonBase>();
|
126
|
|
127
|
Map<String, FieldObservation> ecoFactFieldObservationMap = (Map<String, FieldObservation>) partitioner.getObjectMap(ECO_FACT_FIELD_OBSERVATION_NAMESPACE);
|
128
|
|
129
|
ResultSet rs = partitioner.getResultSet();
|
130
|
|
131
|
Map<String, Reference> referenceMap = new HashMap<String, Reference>();
|
132
|
|
133
|
|
134
|
try {
|
135
|
|
136
|
int i = 0;
|
137
|
|
138
|
//for each reference
|
139
|
while (rs.next()){
|
140
|
|
141
|
if ((i++ % modCount) == 0 && i!= 1 ){ logger.info(pluralString + " handled: " + (i-1));}
|
142
|
|
143
|
int dnaFactId = rs.getInt("DNAFactId");
|
144
|
String keywordsStr = rs.getString("Keywords");
|
145
|
String locusStr = rs.getString("Locus");
|
146
|
String definitionStr = rs.getString("Definition");
|
147
|
|
148
|
|
149
|
try {
|
150
|
|
151
|
//source ref
|
152
|
Reference<?> sourceRef = state.getTransactionalSourceReference();
|
153
|
|
154
|
//import date
|
155
|
DateTime importDateTime = makeImportDateTime(rs);
|
156
|
|
157
|
//DNA Sample
|
158
|
DnaSample dnaSample = DnaSample.NewInstance();
|
159
|
dnaSample.setCreated(importDateTime);
|
160
|
|
161
|
//ecoFactFk
|
162
|
makeDerivationFromEcoFact(state, rs, dnaSample, samplesToSave, dnaFactId);
|
163
|
|
164
|
//sequence
|
165
|
Sequence sequence = makeSequence(rs, dnaSample, dnaFactId, importDateTime);
|
166
|
|
167
|
//locus
|
168
|
Locus locus = Locus.NewInstance(keywordsStr, definitionStr);
|
169
|
locus.setCreated(importDateTime);
|
170
|
sequence.setLocus(locus);
|
171
|
|
172
|
//GenBank Accession
|
173
|
makeGenBankAccession(rs, sequence, importDateTime, dnaFactId);
|
174
|
|
175
|
//Comment
|
176
|
String commentStr = rs.getString("Comment");
|
177
|
if (isNotBlank(commentStr)){
|
178
|
Annotation annotation = Annotation.NewInstance(commentStr, AnnotationType.EDITORIAL(), Language.DEFAULT());
|
179
|
annotation.setCreated(importDateTime);
|
180
|
sequence.addAnnotation(annotation);
|
181
|
}
|
182
|
|
183
|
//Indiv.Assoc.
|
184
|
makeIndividualsAssociation(partitioner, rs, state, taxaToSave, dnaSample);
|
185
|
|
186
|
//TODO titleCache
|
187
|
//prelim implementation:
|
188
|
String cultStrain = rs.getString("CultureStrainNo");
|
189
|
String title = String.format("DNA Sample for %s at %s", cultStrain, keywordsStr);
|
190
|
dnaSample.setTitleCache(title, true);
|
191
|
|
192
|
//TODO preliminary implementation
|
193
|
String referenceStr = rs.getString("FactReference");
|
194
|
if (isNotBlank(referenceStr)){
|
195
|
Reference<?> ref = referenceMap.get(referenceStr);
|
196
|
if (ref == null){
|
197
|
ref = ReferenceFactory.newGeneric();
|
198
|
ref.setTitleCache(referenceStr, true);
|
199
|
referenceMap.put(referenceStr, ref);
|
200
|
}
|
201
|
sequence.setPublishedIn(ref);
|
202
|
}
|
203
|
|
204
|
//save
|
205
|
samplesToSave.add(dnaSample);
|
206
|
|
207
|
|
208
|
} catch (Exception e) {
|
209
|
logger.warn("Exception in ecoFact: ecoFactId " + dnaFactId + ". " + e.getMessage());
|
210
|
e.printStackTrace();
|
211
|
}
|
212
|
|
213
|
}
|
214
|
|
215
|
logger.warn("DNASample or EcoFacts to save: " + samplesToSave.size());
|
216
|
getOccurrenceService().saveOrUpdate(samplesToSave);
|
217
|
logger.warn("Taxa to save: " + samplesToSave.size());
|
218
|
getTaxonService().saveOrUpdate(taxaToSave);
|
219
|
|
220
|
return success;
|
221
|
} catch (SQLException e) {
|
222
|
logger.error("SQLException:" + e);
|
223
|
return false;
|
224
|
}
|
225
|
}
|
226
|
|
227
|
|
228
|
private void makeDerivationFromEcoFact(AlgaTerraImportState state, ResultSet rs, DnaSample dnaSample, Set<SpecimenOrObservationBase> samplesToSave, Integer dnaFactId) throws SQLException {
|
229
|
Integer ecoFactFk = nullSafeInt(rs, "ecoFactId");
|
230
|
if (ecoFactFk != null){
|
231
|
|
232
|
DerivedUnitBase<?> ecoFact = (DerivedUnitBase<?>)state.getRelatedObject(ECO_FACT_DERIVED_UNIT_NAMESPACE, ecoFactFk.toString());
|
233
|
if (ecoFact == null){
|
234
|
logger.warn("EcoFact is null for ecoFactFk: " + ecoFactFk + ", DnaFactId: " + dnaFactId);
|
235
|
}else{
|
236
|
DerivationEvent.NewSimpleInstance(ecoFact, dnaSample, DerivationEventType.DNA_EXTRACTION());
|
237
|
samplesToSave.add(ecoFact);
|
238
|
}
|
239
|
}
|
240
|
|
241
|
|
242
|
|
243
|
}
|
244
|
|
245
|
|
246
|
|
247
|
private void makeIndividualsAssociation(ResultSetPartitioner partitioner, ResultSet rs, AlgaTerraImportState state, Set<TaxonBase> taxaToSave, DnaSample dnaSample) throws SQLException{
|
248
|
Reference<?> sourceRef = state.getTransactionalSourceReference();
|
249
|
Map<String, TaxonBase> taxonMap = (Map<String, TaxonBase>) partitioner.getObjectMap(BerlinModelTaxonImport.NAMESPACE);
|
250
|
Integer taxonId = rs.getInt("taxonId");
|
251
|
Integer factId = rs.getInt("factId");
|
252
|
Taxon taxon = getTaxon(state, taxonId, taxonMap, factId);
|
253
|
TaxonDescription desc = getTaxonDescription(state, taxon, sourceRef);
|
254
|
IndividualsAssociation assoc = IndividualsAssociation.NewInstance(dnaSample);
|
255
|
desc.addElement(assoc);
|
256
|
taxaToSave.add(taxon);
|
257
|
}
|
258
|
|
259
|
|
260
|
/**
|
261
|
* @param rs
|
262
|
* @return
|
263
|
* @throws SQLException
|
264
|
* @throws ParseException
|
265
|
*/
|
266
|
private DateTime makeImportDateTime(ResultSet rs) throws SQLException,
|
267
|
ParseException {
|
268
|
DateTime importDateTime = null;
|
269
|
String importDateTimeStr = rs.getString("ImportDateTime");
|
270
|
if (isNotBlank(importDateTimeStr)){
|
271
|
importDateTimeStr = importDateTimeStr.substring(0,10);
|
272
|
DateTimeFormatter dayFormatter = DateTimeFormat.forPattern("dd.MM.yyyy");
|
273
|
|
274
|
// DateTimeFormatter formatter = new DateTimeFormatterBuilder().
|
275
|
// append;
|
276
|
DateTimeParser p = new DateTimeParser(dayFormatter);
|
277
|
importDateTime = p.parse(importDateTimeStr, Locale.GERMANY);
|
278
|
|
279
|
}
|
280
|
return importDateTime;
|
281
|
}
|
282
|
|
283
|
|
284
|
|
285
|
private Sequence makeSequence(ResultSet rs, DnaSample dnaSample, int dnaFactId, DateTime importDateTime) throws SQLException {
|
286
|
String sequenceStr = rs.getString("PlainSequence");
|
287
|
Integer originalLen = null;
|
288
|
Integer seqLen = nullSafeInt(rs, "SeqLen");
|
289
|
if (seqLen == null){
|
290
|
if (sequenceStr != null){
|
291
|
seqLen = sequenceStr.length();
|
292
|
}
|
293
|
}
|
294
|
|
295
|
if (sequenceStr != null){
|
296
|
originalLen = sequenceStr.length();
|
297
|
if (originalLen > 255){
|
298
|
logger.warn("Sequence truncated. Id: " + dnaFactId);
|
299
|
sequenceStr = sequenceStr.substring(0, 255);
|
300
|
}
|
301
|
}else{
|
302
|
logger.warn("PlainSequence is null. Id: " + dnaFactId);
|
303
|
}
|
304
|
Sequence sequence = Sequence.NewInstance(sequenceStr);
|
305
|
sequence.setLength(seqLen);
|
306
|
if (! originalLen.equals(seqLen)){
|
307
|
logger.warn("SeqLen (" + seqLen+ ") and OriginalLen ("+originalLen+") differ for dnaFact: " + dnaFactId);
|
308
|
}
|
309
|
|
310
|
sequence.setCreated(importDateTime);
|
311
|
dnaSample.addSequences(sequence);
|
312
|
return sequence;
|
313
|
}
|
314
|
|
315
|
|
316
|
|
317
|
/**
|
318
|
* @param sequence2
|
319
|
* @param rs
|
320
|
* @param accessionStr
|
321
|
* @param notesStr
|
322
|
* @param sequence
|
323
|
* @param importDateTime
|
324
|
* @return
|
325
|
* @throws SQLException
|
326
|
*/
|
327
|
private void makeGenBankAccession(ResultSet rs, Sequence sequence, DateTime importDateTime, Integer dnaFactId) throws SQLException {
|
328
|
String accessionStr = rs.getString("Accession");
|
329
|
String notesStr = rs.getString("Notes");
|
330
|
String versionStr = rs.getString("Version");
|
331
|
|
332
|
URI genBankUri = null;
|
333
|
if (StringUtils.isNotBlank(notesStr)){
|
334
|
if (notesStr.startsWith("http")){
|
335
|
genBankUri = URI.create(notesStr);
|
336
|
}else{
|
337
|
logger.warn("Notes do not start with URI: " + notesStr);
|
338
|
}
|
339
|
}
|
340
|
|
341
|
if (isNotBlank(accessionStr) || genBankUri != null){
|
342
|
if (accessionStr != null && accessionStr.trim().equals("")){
|
343
|
accessionStr = null;
|
344
|
}
|
345
|
if (isGenBankAccessionNumber(accessionStr, versionStr, genBankUri, dnaFactId) || genBankUri != null){
|
346
|
GenBankAccession accession = GenBankAccession.NewInstance(accessionStr);
|
347
|
accession.setUri(genBankUri);
|
348
|
accession.setCreated(importDateTime);
|
349
|
sequence.addGenBankAccession(accession);
|
350
|
}
|
351
|
}
|
352
|
}
|
353
|
|
354
|
private boolean isGenBankAccessionNumber(String accessionStr, String versionStr, URI genBankUri, Integer dnaFactId) {
|
355
|
boolean isGenBankAccessionNumber = accessionStr.matches("[A-Z]{2}\\d{6}");
|
356
|
boolean versionHasGenBankPart = versionStr.matches(".*GI:.*");
|
357
|
if (isGenBankAccessionNumber && versionHasGenBankPart){
|
358
|
return true;
|
359
|
}else {
|
360
|
if (genBankUri != null){
|
361
|
logger.warn("GenBank Uri exists but accession or version have been identified to use GenBank syntax. DNAFactID: " + dnaFactId);
|
362
|
}
|
363
|
if(isGenBankAccessionNumber || versionHasGenBankPart){
|
364
|
logger.warn("Either accession ("+ accessionStr +") or version ("+versionStr+") use GenBank syntax but the other does not. DNAFactID: " + dnaFactId);
|
365
|
}
|
366
|
return false;
|
367
|
}
|
368
|
}
|
369
|
|
370
|
|
371
|
|
372
|
	// Namespace under which derived units of the related EcoFact import are registered.
	protected String getDerivedUnitNameSpace(){
		return ECO_FACT_DERIVED_UNIT_NAMESPACE;
	}
|
375
|
|
376
|
	// Namespace under which field observations of the related EcoFact import are registered.
	protected String getFieldObservationNameSpace(){
		return ECO_FACT_FIELD_OBSERVATION_NAMESPACE;
	}
|
379
|
|
380
|
|
381
|
private DerivedUnitType makeDerivedUnitType(String recordBasis) {
|
382
|
DerivedUnitType result = null;
|
383
|
if (StringUtils.isBlank(recordBasis)){
|
384
|
result = DerivedUnitType.DerivedUnit;
|
385
|
} else if (recordBasis.equalsIgnoreCase("FossileSpecimen")){
|
386
|
result = DerivedUnitType.Fossil;
|
387
|
}else if (recordBasis.equalsIgnoreCase("HumanObservation")){
|
388
|
result = DerivedUnitType.Observation;
|
389
|
}else if (recordBasis.equalsIgnoreCase("Literature")){
|
390
|
logger.warn("Literature record basis not yet supported");
|
391
|
result = DerivedUnitType.DerivedUnit;
|
392
|
}else if (recordBasis.equalsIgnoreCase("LivingSpecimen")){
|
393
|
result = DerivedUnitType.LivingBeing;
|
394
|
}else if (recordBasis.equalsIgnoreCase("MachineObservation")){
|
395
|
logger.warn("MachineObservation record basis not yet supported");
|
396
|
result = DerivedUnitType.Observation;
|
397
|
}else if (recordBasis.equalsIgnoreCase("PreservedSpecimen")){
|
398
|
result = DerivedUnitType.Specimen;
|
399
|
}
|
400
|
return result;
|
401
|
}
|
402
|
|
403
|
/* (non-Javadoc)
|
404
|
* @see eu.etaxonomy.cdm.io.berlinModel.in.IPartitionedIO#getRelatedObjectsForPartition(java.sql.ResultSet)
|
405
|
*/
|
406
|
public Map<Object, Map<String, ? extends CdmBase>> getRelatedObjectsForPartition(ResultSet rs) {
|
407
|
String nameSpace;
|
408
|
Class cdmClass;
|
409
|
Set<String> idSet;
|
410
|
Map<Object, Map<String, ? extends CdmBase>> result = new HashMap<Object, Map<String, ? extends CdmBase>>();
|
411
|
|
412
|
try{
|
413
|
Set<String> taxonIdSet = new HashSet<String>();
|
414
|
|
415
|
Set<String> ecoFactFkSet = new HashSet<String>();
|
416
|
|
417
|
while (rs.next()){
|
418
|
handleForeignKey(rs, taxonIdSet, "taxonId");
|
419
|
handleForeignKey(rs, ecoFactFkSet, "ecoFactId");
|
420
|
|
421
|
}
|
422
|
|
423
|
//taxon map
|
424
|
nameSpace = BerlinModelTaxonImport.NAMESPACE;
|
425
|
cdmClass = TaxonBase.class;
|
426
|
idSet = taxonIdSet;
|
427
|
Map<String, TaxonBase> objectMap = (Map<String, TaxonBase>)getCommonService().getSourcedObjectsByIdInSource(cdmClass, idSet, nameSpace);
|
428
|
result.put(nameSpace, objectMap);
|
429
|
|
430
|
|
431
|
//eco fact derived unit map
|
432
|
nameSpace = AlgaTerraFactEcologyImport.ECO_FACT_DERIVED_UNIT_NAMESPACE;
|
433
|
cdmClass = DerivedUnitBase.class;
|
434
|
idSet = ecoFactFkSet;
|
435
|
Map<String, DerivedUnitBase> derivedUnitMap = (Map<String, DerivedUnitBase>)getCommonService().getSourcedObjectsByIdInSource(cdmClass, idSet, nameSpace);
|
436
|
result.put(nameSpace, derivedUnitMap);
|
437
|
|
438
|
} catch (SQLException e) {
|
439
|
throw new RuntimeException(e);
|
440
|
}
|
441
|
return result;
|
442
|
}
|
443
|
|
444
|
|
445
|
|
446
|
/* (non-Javadoc)
|
447
|
* @see eu.etaxonomy.cdm.io.common.CdmIoBase#doCheck(eu.etaxonomy.cdm.io.common.IoStateBase)
|
448
|
*/
|
449
|
@Override
|
450
|
protected boolean doCheck(BerlinModelImportState state){
|
451
|
IOValidator<BerlinModelImportState> validator = new AlgaTerraDnaImportValidator();
|
452
|
return validator.validate(state);
|
453
|
}
|
454
|
|
455
|
|
456
|
/* (non-Javadoc)
|
457
|
* @see eu.etaxonomy.cdm.io.common.CdmIoBase#isIgnore(eu.etaxonomy.cdm.io.common.IImportConfigurator)
|
458
|
*/
|
459
|
protected boolean isIgnore(BerlinModelImportState state){
|
460
|
return ! ((AlgaTerraImportState)state).getAlgaTerraConfigurator().isDoDna();
|
461
|
}
|
462
|
|
463
|
}
|