e8174f630237859d16a4f14b4dd03d74da8e3595
[cdmlib-apps.git] / app-import / src / main / java / eu / etaxonomy / cdm / io / algaterra / AlgaTerraDnaImport.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.io.algaterra;
11
12 import java.net.URI;
13 import java.sql.ResultSet;
14 import java.sql.SQLException;
15 import java.text.ParseException;
16 import java.util.HashMap;
17 import java.util.HashSet;
18 import java.util.Locale;
19 import java.util.Map;
20 import java.util.Set;
21
22 import org.apache.commons.lang.StringUtils;
23 import org.apache.log4j.Logger;
24 import org.joda.time.DateTime;
25 import org.joda.time.format.DateTimeFormat;
26 import org.joda.time.format.DateTimeFormatter;
27 import org.springframework.format.datetime.joda.DateTimeParser;
28 import org.springframework.stereotype.Component;
29
30 import eu.etaxonomy.cdm.io.algaterra.validation.AlgaTerraDnaImportValidator;
31 import eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportConfigurator;
32 import eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportState;
33 import eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelTaxonImport;
34 import eu.etaxonomy.cdm.io.common.IOValidator;
35 import eu.etaxonomy.cdm.io.common.ResultSetPartitioner;
36 import eu.etaxonomy.cdm.model.common.Annotation;
37 import eu.etaxonomy.cdm.model.common.AnnotationType;
38 import eu.etaxonomy.cdm.model.common.CdmBase;
39 import eu.etaxonomy.cdm.model.common.DefinedTerm;
40 import eu.etaxonomy.cdm.model.common.Language;
41 import eu.etaxonomy.cdm.model.description.IndividualsAssociation;
42 import eu.etaxonomy.cdm.model.description.TaxonDescription;
43 import eu.etaxonomy.cdm.model.molecular.DnaSample;
44 import eu.etaxonomy.cdm.model.molecular.Sequence;
45 import eu.etaxonomy.cdm.model.occurrence.DerivationEvent;
46 import eu.etaxonomy.cdm.model.occurrence.DerivationEventType;
47 import eu.etaxonomy.cdm.model.occurrence.DerivedUnit;
48 import eu.etaxonomy.cdm.model.occurrence.FieldUnit;
49 import eu.etaxonomy.cdm.model.occurrence.SpecimenOrObservationBase;
50 import eu.etaxonomy.cdm.model.reference.Reference;
51 import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
52 import eu.etaxonomy.cdm.model.taxon.Taxon;
53 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
54
55
56 /**
57 * @author a.mueller
58 * @created 01.09.2012
59 */
60 @Component
61 public class AlgaTerraDnaImport extends AlgaTerraSpecimenImportBase {
62 private static final Logger logger = Logger.getLogger(AlgaTerraDnaImport.class);
63
64
65 private static int modCount = 5000;
66 private static final String pluralString = "dna facts";
67 private static final String dbTableName = "DNAFact"; //??
68
69
/**
 * Creates the import and hands the table name ({@code dbTableName}) and the
 * plural record label ({@code pluralString}) to the superclass constructor.
 */
public AlgaTerraDnaImport() {
    super(dbTableName, pluralString);
}
73
74
75
76 /* (non-Javadoc)
77 * @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getIdQuery()
78 */
79 @Override
80 protected String getIdQuery(BerlinModelImportState bmState) {
81 AlgaTerraImportState state = (AlgaTerraImportState)bmState;
82 String result = " SELECT df.DNAFactId " +
83 " FROM DNAFact df " +
84 " INNER JOIN Fact f ON f.ExtensionFk = df.DNAFactID " +
85 " WHERE f.FactCategoryFk = 203 ";
86 if (state.getAlgaTerraConfigurator().isRemoveRestricted()){
87 result = result + " AND df.ProtectedFlag = 0 ";
88 }
89 result += " ORDER BY df.DNAFactID ";
90 return result;
91 }
92
93 /* (non-Javadoc)
94 * @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getRecordQuery(eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportConfigurator)
95 */
96 @Override
97 protected String getRecordQuery(BerlinModelImportConfigurator config) {
98 String strQuery =
99 " SELECT df.*, pt.RIdentifier as taxonId, f.FactId, f.restrictedFlag, ecoFact.ecoFactId as ecoFactId " +
100 " FROM DNAFact df INNER JOIN Fact f ON f.ExtensionFk = df.DNAFactID " +
101 " LEFT OUTER JOIN PTaxon pt ON f.PTNameFk = pt.PTNameFk AND f.PTRefFk = pt.PTRefFk " +
102 " LEFT OUTER JOIN EcoFact ecoFact ON ecoFact.CultureStrain = df.CultureStrainNo " +
103 " WHERE f.FactCategoryFk = 203 AND (df.DNAFactId IN (" + ID_LIST_TOKEN + ") )"
104 + " ORDER BY DNAFactID "
105 ;
106 return strQuery;
107 }
108
109 /* (non-Javadoc)
110 * @see eu.etaxonomy.cdm.io.berlinModel.in.IPartitionedIO#doPartition(eu.etaxonomy.cdm.io.berlinModel.in.ResultSetPartitioner, eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportState)
111 */
112 public boolean doPartition(ResultSetPartitioner partitioner, BerlinModelImportState bmState) {
113 boolean success = true;
114
115 AlgaTerraImportState state = (AlgaTerraImportState)bmState;
116 try {
117 // makeVocabulariesAndFeatures(state);
118 } catch (Exception e1) {
119 logger.warn("Exception occurred when trying to create Ecofact vocabularies: " + e1.getMessage());
120 e1.printStackTrace();
121 }
122 Set<SpecimenOrObservationBase> samplesToSave = new HashSet<SpecimenOrObservationBase>();
123 Set<TaxonBase> taxaToSave = new HashSet<TaxonBase>();
124
125 Map<String, FieldUnit> ecoFactFieldObservationMap = (Map<String, FieldUnit>) partitioner.getObjectMap(ECO_FACT_FIELD_OBSERVATION_NAMESPACE);
126
127 ResultSet rs = partitioner.getResultSet();
128
129 Map<String, Reference> referenceMap = new HashMap<String, Reference>();
130
131
132 try {
133
134 int i = 0;
135
136 //for each reference
137 while (rs.next()){
138
139 if ((i++ % modCount) == 0 && i!= 1 ){ logger.info(pluralString + " handled: " + (i-1));}
140
141 int dnaFactId = rs.getInt("DNAFactId");
142 String keywordsStr = rs.getString("Keywords");
143 String locusStr = rs.getString("Locus");
144 String definitionStr = rs.getString("Definition");
145
146
147 try {
148
149 //source ref
150 Reference<?> sourceRef = state.getTransactionalSourceReference();
151
152 //import date
153 DateTime importDateTime = makeImportDateTime(rs);
154
155 //DNA Sample
156 DnaSample dnaSample = DnaSample.NewInstance();
157 dnaSample.setCreated(importDateTime);
158
159 //ecoFactFk
160 makeDerivationFromEcoFact(state, rs, dnaSample, samplesToSave, dnaFactId);
161
162 //sequence
163 Sequence sequence = makeSequence(rs, dnaSample, dnaFactId, importDateTime);
164
165 //locus
166 //FIXME Deduplicate DnaMarker
167 DefinedTerm locus = DefinedTerm.NewDnaMarkerInstance(definitionStr, keywordsStr, null);
168 locus.setCreated(importDateTime);
169 this.getTermService().save(locus);
170
171 sequence.setDnaMarker(locus);
172
173 //GenBank Accession
174 makeGenBankAccession(rs, sequence, importDateTime, dnaFactId);
175
176 //Comment
177 String commentStr = rs.getString("Comment");
178 if (isNotBlank(commentStr)){
179 Annotation annotation = Annotation.NewInstance(commentStr, AnnotationType.EDITORIAL(), Language.DEFAULT());
180 annotation.setCreated(importDateTime);
181 sequence.addAnnotation(annotation);
182 }
183
184 //Indiv.Assoc.
185 makeIndividualsAssociation(partitioner, rs, state, taxaToSave, dnaSample);
186
187 //TODO titleCache
188 //prelim implementation:
189 String cultStrain = rs.getString("CultureStrainNo");
190 String title = String.format("DNA Sample for %s at %s", cultStrain, keywordsStr);
191 dnaSample.setTitleCache(title, true);
192
193 //TODO preliminary implementation
194 String referenceStr = rs.getString("FactReference");
195 if (isNotBlank(referenceStr)){
196 Reference<?> ref = referenceMap.get(referenceStr);
197 if (ref == null){
198 ref = ReferenceFactory.newGeneric();
199 ref.setTitleCache(referenceStr, true);
200 referenceMap.put(referenceStr, ref);
201 }
202 sequence.addCitation(ref);
203 }
204
205 //save
206 samplesToSave.add(dnaSample);
207
208
209 } catch (Exception e) {
210 logger.warn("Exception in ecoFact: ecoFactId " + dnaFactId + ". " + e.getMessage());
211 e.printStackTrace();
212 }
213
214 }
215
216 logger.warn("DNASample or EcoFacts to save: " + samplesToSave.size());
217 getOccurrenceService().saveOrUpdate(samplesToSave);
218 logger.warn("Taxa to save: " + samplesToSave.size());
219 getTaxonService().saveOrUpdate(taxaToSave);
220
221 return success;
222 } catch (SQLException e) {
223 logger.error("SQLException:" + e);
224 return false;
225 }
226 }
227
228
229 private void makeDerivationFromEcoFact(AlgaTerraImportState state, ResultSet rs, DnaSample dnaSample, Set<SpecimenOrObservationBase> samplesToSave, Integer dnaFactId) throws SQLException {
230 Integer ecoFactFk = nullSafeInt(rs, "ecoFactId");
231 if (ecoFactFk != null){
232
233 DerivedUnit ecoFact = (DerivedUnit)state.getRelatedObject(ECO_FACT_DERIVED_UNIT_NAMESPACE, ecoFactFk.toString());
234 if (ecoFact == null){
235 logger.warn("EcoFact is null for ecoFactFk: " + ecoFactFk + ", DnaFactId: " + dnaFactId);
236 }else{
237 DerivationEvent.NewSimpleInstance(ecoFact, dnaSample, DerivationEventType.DNA_EXTRACTION());
238 samplesToSave.add(ecoFact);
239 }
240 }
241
242
243
244 }
245
246
247
248 private void makeIndividualsAssociation(ResultSetPartitioner partitioner, ResultSet rs, AlgaTerraImportState state, Set<TaxonBase> taxaToSave, DnaSample dnaSample) throws SQLException{
249 Reference<?> sourceRef = state.getTransactionalSourceReference();
250 Map<String, TaxonBase> taxonMap = (Map<String, TaxonBase>) partitioner.getObjectMap(BerlinModelTaxonImport.NAMESPACE);
251 Integer taxonId = rs.getInt("taxonId");
252 Integer factId = rs.getInt("factId");
253 Taxon taxon = getTaxon(state, taxonId, taxonMap, factId);
254 TaxonDescription desc = getTaxonDescription(state, taxon, sourceRef);
255 IndividualsAssociation assoc = IndividualsAssociation.NewInstance(dnaSample);
256 desc.addElement(assoc);
257 taxaToSave.add(taxon);
258 }
259
260
261 /**
262 * @param rs
263 * @return
264 * @throws SQLException
265 * @throws ParseException
266 */
267 private DateTime makeImportDateTime(ResultSet rs) throws SQLException,
268 ParseException {
269 DateTime importDateTime = null;
270 String importDateTimeStr = rs.getString("ImportDateTime");
271 if (isNotBlank(importDateTimeStr)){
272 importDateTimeStr = importDateTimeStr.substring(0,10);
273 DateTimeFormatter dayFormatter = DateTimeFormat.forPattern("dd.MM.yyyy");
274
275 // DateTimeFormatter formatter = new DateTimeFormatterBuilder().
276 // append;
277 DateTimeParser p = new DateTimeParser(dayFormatter);
278 importDateTime = p.parse(importDateTimeStr, Locale.GERMANY);
279
280 }
281 return importDateTime;
282 }
283
284
285
286 private Sequence makeSequence(ResultSet rs, DnaSample dnaSample, int dnaFactId, DateTime importDateTime) throws SQLException {
287 String sequenceStr = rs.getString("PlainSequence");
288 Integer seqLen = nullSafeInt(rs, "SeqLen");
289
290 if (sequenceStr == null){
291 logger.warn("PlainSequence is null. Id: " + dnaFactId);
292 }else{
293 if (sequenceStr.length() != seqLen){
294 logger.warn("SeqLen (" + seqLen+ ") and OriginalLen ("+sequenceStr.length()+") differ for dnaFact: " + dnaFactId);
295 }
296 }
297
298 Sequence sequence = Sequence.NewInstance(sequenceStr, seqLen);
299 sequence.setCreated(importDateTime);
300 dnaSample.addSequence(sequence);
301 return sequence;
302 }
303
304
305
306 /**
307 * @param sequence2
308 * @param rs
309 * @param accessionStr
310 * @param notesStr
311 * @param sequence
312 * @param importDateTime
313 * @return
314 * @throws SQLException
315 */
316 private void makeGenBankAccession(ResultSet rs, Sequence sequence, DateTime importDateTime, Integer dnaFactId) throws SQLException {
317 String accessionStr = rs.getString("Accession");
318 String notesStr = rs.getString("Notes");
319 String versionStr = rs.getString("Version");
320
321 URI genBankUri = null;
322 if (StringUtils.isNotBlank(notesStr)){
323 if (notesStr.startsWith("http")){
324 genBankUri = URI.create(notesStr);
325 }else{
326 logger.warn("Notes do not start with URI: " + notesStr);
327 }
328 }
329
330 if (isNotBlank(accessionStr) || genBankUri != null){
331 if (accessionStr != null && accessionStr.trim().equals("")){
332 accessionStr = null;
333 }
334 if (isGenBankAccessionNumber(accessionStr, versionStr, genBankUri, dnaFactId) || genBankUri != null){
335 sequence.setGeneticAccessionNumber(accessionStr);
336 }
337 }
338 }
339
340 private boolean isGenBankAccessionNumber(String accessionStr, String versionStr, URI genBankUri, Integer dnaFactId) {
341 boolean isGenBankAccessionNumber = accessionStr.matches("[A-Z]{2}\\d{6}");
342 boolean versionHasGenBankPart = versionStr.matches(".*GI:.*");
343 if (isGenBankAccessionNumber && versionHasGenBankPart){
344 return true;
345 }else {
346 if (genBankUri != null){
347 logger.warn("GenBank Uri exists but accession or version have been identified to use GenBank syntax. DNAFactID: " + dnaFactId);
348 }
349 if(isGenBankAccessionNumber || versionHasGenBankPart){
350 logger.warn("Either accession ("+ accessionStr +") or version ("+versionStr+") use GenBank syntax but the other does not. DNAFactID: " + dnaFactId);
351 }
352 return false;
353 }
354 }
355
356
357
358 protected String getDerivedUnitNameSpace(){
359 return ECO_FACT_DERIVED_UNIT_NAMESPACE;
360 }
361
362 protected String getFieldObservationNameSpace(){
363 return ECO_FACT_FIELD_OBSERVATION_NAMESPACE;
364 }
365
366 @Override
367 public Map<Object, Map<String, ? extends CdmBase>> getRelatedObjectsForPartition(ResultSet rs, BerlinModelImportState state) {
368 String nameSpace;
369 Class<?> cdmClass;
370 Set<String> idSet;
371 Map<Object, Map<String, ? extends CdmBase>> result = new HashMap<Object, Map<String, ? extends CdmBase>>();
372
373 try{
374 Set<String> taxonIdSet = new HashSet<String>();
375
376 Set<String> ecoFactFkSet = new HashSet<String>();
377
378 while (rs.next()){
379 handleForeignKey(rs, taxonIdSet, "taxonId");
380 handleForeignKey(rs, ecoFactFkSet, "ecoFactId");
381
382 }
383
384 //taxon map
385 nameSpace = BerlinModelTaxonImport.NAMESPACE;
386 cdmClass = TaxonBase.class;
387 idSet = taxonIdSet;
388 Map<String, TaxonBase> objectMap = (Map<String, TaxonBase>)getCommonService().getSourcedObjectsByIdInSource(cdmClass, idSet, nameSpace);
389 result.put(nameSpace, objectMap);
390
391
392 //eco fact derived unit map
393 nameSpace = AlgaTerraFactEcologyImport.ECO_FACT_DERIVED_UNIT_NAMESPACE;
394 cdmClass = DerivedUnit.class;
395 idSet = ecoFactFkSet;
396 Map<String, DerivedUnit> derivedUnitMap = (Map<String, DerivedUnit>)getCommonService().getSourcedObjectsByIdInSource(cdmClass, idSet, nameSpace);
397 result.put(nameSpace, derivedUnitMap);
398
399 } catch (SQLException e) {
400 throw new RuntimeException(e);
401 }
402 return result;
403 }
404
405 @Override
406 protected boolean doCheck(BerlinModelImportState state){
407 IOValidator<BerlinModelImportState> validator = new AlgaTerraDnaImportValidator();
408 return validator.validate(state);
409 }
410
411 @Override
412 protected boolean isIgnore(BerlinModelImportState state){
413 return ! ((AlgaTerraImportState)state).getAlgaTerraConfigurator().isDoDna();
414 }
415
416 }