update factory methods for original sources #1549
[cdmlib.git] / cdmlib-io / src / main / java / eu / etaxonomy / cdm / io / berlinModel / in / BerlinModelOccurrenceImport.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.io.berlinModel.in;
11
12 import java.sql.ResultSet;
13 import java.sql.SQLException;
14 import java.util.ArrayList;
15 import java.util.HashMap;
16 import java.util.HashSet;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.Set;
20
21 import org.apache.commons.lang.StringUtils;
22 import org.apache.log4j.Logger;
23 import org.springframework.stereotype.Component;
24
25 import eu.etaxonomy.cdm.common.CdmUtils;
26 import eu.etaxonomy.cdm.hibernate.HibernateProxyHelper;
27 import eu.etaxonomy.cdm.io.berlinModel.BerlinModelTransformer;
28 import eu.etaxonomy.cdm.io.berlinModel.in.validation.BerlinModelOccurrenceImportValidator;
29 import eu.etaxonomy.cdm.io.common.IOValidator;
30 import eu.etaxonomy.cdm.io.common.ResultSetPartitioner;
31 import eu.etaxonomy.cdm.model.common.Annotation;
32 import eu.etaxonomy.cdm.model.common.AnnotationType;
33 import eu.etaxonomy.cdm.model.common.CdmBase;
34 import eu.etaxonomy.cdm.model.common.Marker;
35 import eu.etaxonomy.cdm.model.common.MarkerType;
36 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
37 import eu.etaxonomy.cdm.model.description.Distribution;
38 import eu.etaxonomy.cdm.model.description.PresenceAbsenceTermBase;
39 import eu.etaxonomy.cdm.model.description.TaxonDescription;
40 import eu.etaxonomy.cdm.model.location.NamedArea;
41 import eu.etaxonomy.cdm.model.location.TdwgArea;
42 import eu.etaxonomy.cdm.model.reference.Reference;
43 import eu.etaxonomy.cdm.model.taxon.Taxon;
44 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
45 import eu.etaxonomy.cdm.strategy.exceptions.UnknownCdmTypeException;
46
47
48 /**
49 * @author a.mueller
50 * @created 20.03.2008
51 * @version 1.0
52 */
53 @Component
54 public class BerlinModelOccurrenceImport extends BerlinModelImportBase {
55 private static final Logger logger = Logger.getLogger(BerlinModelOccurrenceImport.class);
56
57 public static final String NAMESPACE = "Occurrence";
58
59
60 private static int modCount = 5000;
61 private static final String pluralString = "occurrences";
62 private static final String dbTableName = "emOccurrence"; //??
63
64
65 public BerlinModelOccurrenceImport(){
66 super(dbTableName, pluralString);
67 }
68
69 /* (non-Javadoc)
70 * @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getIdQuery()
71 */
72 @Override
73 protected String getIdQuery(BerlinModelImportState state) {
74 String result = " SELECT occurrenceId FROM " + getTableName();
75 if (StringUtils.isNotBlank(state.getConfig().getOccurrenceFilter())){
76 result += " WHERE " + state.getConfig().getOccurrenceFilter();
77 }
78 return result;
79 }
80
81 /* (non-Javadoc)
82 * @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getRecordQuery(eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportConfigurator)
83 */
84 @Override
85 protected String getRecordQuery(BerlinModelImportConfigurator config) {
86 String emCode = config.isIncludesAreaEmCode()? ", emArea.EMCode" : "";
87 String strQuery = //DISTINCT because otherwise emOccurrenceSource creates multiple records for a single distribution
88 " SELECT DISTINCT PTaxon.RIdentifier AS taxonId, emOccurrence.OccurrenceId, emOccurrence.Native, emOccurrence.Introduced, " +
89 " emOccurrence.Cultivated, emOccurSumCat.emOccurSumCatId, emOccurSumCat.Short, emOccurSumCat.Description, " +
90 " emOccurSumCat.OutputCode, emArea.AreaId, emArea.TDWGCode " + emCode +
91 " FROM emOccurrence INNER JOIN " +
92 " emArea ON emOccurrence.AreaFk = emArea.AreaId INNER JOIN " +
93 " PTaxon ON emOccurrence.PTNameFk = PTaxon.PTNameFk AND emOccurrence.PTRefFk = PTaxon.PTRefFk LEFT OUTER JOIN " +
94 " emOccurSumCat ON emOccurrence.SummaryStatus = emOccurSumCat.emOccurSumCatId LEFT OUTER JOIN " +
95 " emOccurrenceSource ON emOccurrence.OccurrenceId = emOccurrenceSource.OccurrenceFk " +
96 " WHERE (emOccurrence.OccurrenceId IN (" + ID_LIST_TOKEN + ") )" +
97 " ORDER BY PTaxon.RIdentifier";
98 return strQuery;
99 }
100
101 /* (non-Javadoc)
102 * @see eu.etaxonomy.cdm.io.berlinModel.in.IPartitionedIO#doPartition(eu.etaxonomy.cdm.io.berlinModel.in.ResultSetPartitioner, eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportState)
103 */
104 @Override
105 public boolean doPartition(ResultSetPartitioner partitioner, BerlinModelImportState state) {
106 boolean success = true;
107 Set<TaxonBase> taxaToSave = new HashSet<TaxonBase>();
108
109 Map<String, TaxonBase<?>> taxonMap = (Map<String, TaxonBase<?>>) partitioner.getObjectMap(BerlinModelTaxonImport.NAMESPACE);
110
111 ResultSet rs = partitioner.getResultSet();
112
113 try {
114 //map to store the mapping of duplicate berlin model occurrences to their real distributions
115 //duplicated may occurr due to area mappings from BM areas to TDWG areas
116 Map<Integer, String> duplicateMap = new HashMap<Integer, String>();
117 int oldTaxonId = -1;
118 TaxonDescription oldDescription = null;
119 int i = 0;
120 int countDescriptions = 0;
121 int countDistributions = 0;
122 int countDuplicates = 0;
123 //for each reference
124 while (rs.next()){
125
126 if ((i++ % modCount) == 0 && i!= 1 ){ logger.info("Facts handled: " + (i-1));}
127
128 int occurrenceId = rs.getInt("OccurrenceId");
129 int newTaxonId = rs.getInt("taxonId");
130 String tdwgCodeString = rs.getString("TDWGCode");
131 String emCodeString = state.getConfig().isIncludesAreaEmCode() ? rs.getString("EMCode") : null;
132 Integer emStatusId = nullSafeInt(rs, "emOccurSumCatId");
133
134 try {
135 //status
136 PresenceAbsenceTermBase<?> status = null;
137 String alternativeStatusString = null;
138 if (emStatusId != null){
139 status = BerlinModelTransformer.occStatus2PresenceAbsence(emStatusId);
140 }else{
141 String[] stringArray = new String[]{rs.getString("Native"), rs.getString("Introduced"), rs.getString("Cultivated")};
142 alternativeStatusString = CdmUtils.concat(",", stringArray);
143 }
144
145 //Create area list
146 List<NamedArea> areas = new ArrayList<NamedArea>();
147 if (tdwgCodeString != null){
148
149 String[] tdwgCodes = new String[]{tdwgCodeString};
150 if (state.getConfig().isSplitTdwgCodes()){
151 tdwgCodes = tdwgCodeString.split(";");
152 }
153
154 for (String tdwgCode : tdwgCodes){
155 NamedArea area = TdwgArea.getAreaByTdwgAbbreviation(tdwgCode.trim());
156 if (area == null){
157 area = getOtherAreas(state, emCodeString, tdwgCodeString);
158 }
159 if (area != null){
160 areas.add(area);
161 }
162 }
163 }
164
165 Reference<?> sourceRef = state.getTransactionalSourceReference();
166 //create description(elements)
167 TaxonDescription taxonDescription = getTaxonDescription(newTaxonId, oldTaxonId, oldDescription, taxonMap, occurrenceId, sourceRef);
168 if (areas.size()== 0){
169 NamedArea area = getOtherAreas(state, emCodeString, tdwgCodeString);
170 if (area != null){
171 areas.add(area);
172 }
173 }
174 if (areas.size() == 0){
175 String areaId = rs.getString("AreaId");
176 logger.warn("No areas defined for occurrence " + occurrenceId + ". EMCode: " + CdmUtils.Nz(emCodeString).trim() + ". AreaId: " + areaId );
177 }
178 for (NamedArea area : areas){
179 Distribution distribution = Distribution.NewInstance(area, status);
180 if (status == null){
181 AnnotationType annotationType = AnnotationType.EDITORIAL();
182 Annotation annotation = Annotation.NewInstance(alternativeStatusString, annotationType, null);
183 distribution.addAnnotation(annotation);
184 distribution.addMarker(Marker.NewInstance(MarkerType.PUBLISH(), false));
185 }
186 // distribution.setCitation(sourceRef);
187 if (taxonDescription != null) {
188 Distribution duplicate = checkIsNoDuplicate(taxonDescription, distribution, duplicateMap , occurrenceId);
189 if (duplicate == null){
190 taxonDescription.addElement(distribution);
191 distribution.addImportSource(String.valueOf(occurrenceId), NAMESPACE, state.getTransactionalSourceReference(), null);
192 countDistributions++;
193 if (taxonDescription != oldDescription){
194 taxaToSave.add(taxonDescription.getTaxon());
195 oldDescription = taxonDescription;
196 countDescriptions++;
197 }
198 }else{
199 countDuplicates++;
200 duplicate.addImportSource(String.valueOf(occurrenceId), NAMESPACE, state.getTransactionalSourceReference(), null);
201 logger.info("Distribution is duplicate"); }
202 } else {
203 logger.warn("Distribution " + area.getLabel() + " ignored. OccurrenceId = " + occurrenceId);
204 success = false;
205 }
206 }
207
208 } catch (UnknownCdmTypeException e) {
209 logger.error("Unknown presenceAbsence status id: " + emStatusId);
210 e.printStackTrace();
211 success = false;
212 }
213
214 }
215
216 logger.info("Distributions: " + countDistributions + ", Descriptions: " + countDescriptions );
217 logger.info("Duplicate occurrences: " + (countDuplicates));
218
219 logger.info("Taxa to save: " + taxaToSave.size());
220 getTaxonService().save(taxaToSave);
221
222 return success;
223 } catch (SQLException e) {
224 logger.error("SQLException:" + e);
225 return false;
226 }
227 }
228
229
230
231 /* (non-Javadoc)
232 * @see eu.etaxonomy.cdm.io.berlinModel.in.IPartitionedIO#getRelatedObjectsForPartition(java.sql.ResultSet)
233 */
234 public Map<Object, Map<String, ? extends CdmBase>> getRelatedObjectsForPartition(ResultSet rs) {
235 String nameSpace;
236 Class cdmClass;
237 Set<String> idSet;
238 Map<Object, Map<String, ? extends CdmBase>> result = new HashMap<Object, Map<String, ? extends CdmBase>>();
239
240 try{
241 Set<String> taxonIdSet = new HashSet<String>();
242 while (rs.next()){
243 handleForeignKey(rs, taxonIdSet, "taxonId");
244 }
245
246 //taxon map
247 nameSpace = BerlinModelTaxonImport.NAMESPACE;
248 cdmClass = TaxonBase.class;
249 idSet = taxonIdSet;
250 Map<String, TaxonBase> objectMap = (Map<String, TaxonBase>)getCommonService().getSourcedObjectsByIdInSource(cdmClass, idSet, nameSpace);
251 result.put(nameSpace, objectMap);
252
253 } catch (SQLException e) {
254 throw new RuntimeException(e);
255 }
256 return result;
257 }
258
259
260
261 /**
262 * Tests if a distribution with the same tdwgArea and the same status already exists in the description.
263 * If so the old distribution is returned
264 * @param description
265 * @param tdwgArea
266 * @return false, if dupplicate exists. True otherwise.
267 */
268 private Distribution checkIsNoDuplicate(TaxonDescription description, Distribution distribution, Map<Integer, String> duplicateMap, Integer bmDistributionId){
269 for (DescriptionElementBase descElBase : description.getElements()){
270 if (descElBase.isInstanceOf(Distribution.class)){
271 Distribution oldDistr = HibernateProxyHelper.deproxy(descElBase, Distribution.class);
272 NamedArea oldArea = oldDistr.getArea();
273 if (oldArea != null && oldArea.equals(distribution.getArea())){
274 PresenceAbsenceTermBase<?> oldStatus = oldDistr.getStatus();
275 if (oldStatus != null && oldStatus.equals(distribution.getStatus())){
276 duplicateMap.put(bmDistributionId, oldDistr.getSources().iterator().next().getIdInSource());
277 return oldDistr;
278 }
279 }
280 }
281 }
282 return null;
283 }
284
285 /**
286 * Use same TaxonDescription if two records belong to the same taxon
287 * @param newTaxonId
288 * @param oldTaxonId
289 * @param oldDescription
290 * @param taxonMap
291 * @return
292 */
293 private TaxonDescription getTaxonDescription(int newTaxonId, int oldTaxonId, TaxonDescription oldDescription, Map<String, TaxonBase<?>> taxonMap, int occurrenceId, Reference<?> sourceSec){
294 TaxonDescription result = null;
295 if (oldDescription == null || newTaxonId != oldTaxonId){
296 TaxonBase<?> taxonBase = taxonMap.get(String.valueOf(newTaxonId));
297 //TODO for testing
298 //TaxonBase taxonBase = Taxon.NewInstance(BotanicalName.NewInstance(Rank.SPECIES()), null);
299 Taxon taxon;
300 if ( taxonBase instanceof Taxon ) {
301 taxon = (Taxon) taxonBase;
302 } else if (taxonBase != null) {
303 logger.warn("TaxonBase for Occurrence " + occurrenceId + " was not of type Taxon but: " + taxonBase.getClass().getSimpleName());
304 return null;
305 } else {
306 logger.warn("TaxonBase for Occurrence " + occurrenceId + " is null.");
307 return null;
308 }
309 Set<TaxonDescription> descriptionSet= taxon.getDescriptions();
310 if (descriptionSet.size() > 0) {
311 result = descriptionSet.iterator().next();
312 }else{
313 result = TaxonDescription.NewInstance();
314 result.setTitleCache(sourceSec.getTitleCache(), true);
315 taxon.addDescription(result);
316 }
317 }else{
318 result = oldDescription;
319 }
320 return result;
321 }
322
323
324 /* (non-Javadoc)
325 * @see eu.etaxonomy.cdm.io.common.CdmIoBase#doCheck(eu.etaxonomy.cdm.io.common.IoStateBase)
326 */
327 @Override
328 protected boolean doCheck(BerlinModelImportState state){
329 IOValidator<BerlinModelImportState> validator = new BerlinModelOccurrenceImportValidator();
330 return validator.validate(state);
331 }
332
333
334 /* (non-Javadoc)
335 * @see eu.etaxonomy.cdm.io.common.CdmIoBase#isIgnore(eu.etaxonomy.cdm.io.common.IImportConfigurator)
336 */
337 protected boolean isIgnore(BerlinModelImportState state){
338 if (! state.getConfig().isDoOccurrence()){
339 return true;
340 }else{
341 if (!this.checkSqlServerColumnExists(state.getConfig().getSource(), "emOccurrence", "OccurrenceId")){
342 logger.error("emOccurrence table or emOccurrenceId does not exist. Must ignore occurrence import");
343 return true;
344 }else{
345 return false;
346 }
347 }
348 }
349
350 }