bugfix for wrong http anchor replacement
[cdmlib.git] / cdmlib-io / src / main / java / eu / etaxonomy / cdm / io / berlinModel / in / BerlinModelOccurrenceImport.java
1 /**
2 * Copyright (C) 2007 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.io.berlinModel.in;
11
12 import java.sql.ResultSet;
13 import java.sql.SQLException;
14 import java.util.ArrayList;
15 import java.util.HashMap;
16 import java.util.HashSet;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.Set;
20
21 import org.apache.commons.lang.StringUtils;
22 import org.apache.log4j.Logger;
23 import org.springframework.stereotype.Component;
24
25 import eu.etaxonomy.cdm.common.CdmUtils;
26 import eu.etaxonomy.cdm.hibernate.HibernateProxyHelper;
27 import eu.etaxonomy.cdm.io.berlinModel.BerlinModelTransformer;
28 import eu.etaxonomy.cdm.io.berlinModel.in.validation.BerlinModelOccurrenceImportValidator;
29 import eu.etaxonomy.cdm.io.common.IOValidator;
30 import eu.etaxonomy.cdm.io.common.ResultSetPartitioner;
31 import eu.etaxonomy.cdm.model.common.Annotation;
32 import eu.etaxonomy.cdm.model.common.AnnotationType;
33 import eu.etaxonomy.cdm.model.common.CdmBase;
34 import eu.etaxonomy.cdm.model.common.Marker;
35 import eu.etaxonomy.cdm.model.common.MarkerType;
36 import eu.etaxonomy.cdm.model.description.DescriptionElementBase;
37 import eu.etaxonomy.cdm.model.description.Distribution;
38 import eu.etaxonomy.cdm.model.description.PresenceAbsenceTermBase;
39 import eu.etaxonomy.cdm.model.description.TaxonDescription;
40 import eu.etaxonomy.cdm.model.location.NamedArea;
41 import eu.etaxonomy.cdm.model.location.TdwgArea;
42 import eu.etaxonomy.cdm.model.reference.Reference;
43 import eu.etaxonomy.cdm.model.taxon.Taxon;
44 import eu.etaxonomy.cdm.model.taxon.TaxonBase;
45 import eu.etaxonomy.cdm.strategy.exceptions.UnknownCdmTypeException;
46
47
48 /**
49 * @author a.mueller
50 * @created 20.03.2008
51 * @version 1.0
52 */
53 @Component
54 public class BerlinModelOccurrenceImport extends BerlinModelImportBase {
55 private static final Logger logger = Logger.getLogger(BerlinModelOccurrenceImport.class);
56
57 public static final String NAMESPACE = "Occurrence";
58
59
60 private static int modCount = 5000;
61 private static final String pluralString = "occurrences";
62 private static final String dbTableName = "emOccurrence"; //??
63
64
65 public BerlinModelOccurrenceImport(){
66 super();
67 }
68
69 /* (non-Javadoc)
70 * @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getIdQuery()
71 */
72 @Override
73 protected String getIdQuery(BerlinModelImportState state) {
74 String result = " SELECT occurrenceId FROM " + getTableName();
75 if (StringUtils.isNotBlank(state.getConfig().getOccurrenceFilter())){
76 result += " WHERE " + state.getConfig().getOccurrenceFilter();
77 }
78 return result;
79 }
80
81 /* (non-Javadoc)
82 * @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getRecordQuery(eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportConfigurator)
83 */
84 @Override
85 protected String getRecordQuery(BerlinModelImportConfigurator config) {
86 String strQuery = //DISTINCT because otherwise emOccurrenceSource creates multiple records for a single distribution
87 " SELECT DISTINCT PTaxon.RIdentifier AS taxonId, emOccurrence.OccurrenceId, emOccurrence.Native, emOccurrence.Introduced, " +
88 " emOccurrence.Cultivated, emOccurSumCat.emOccurSumCatId, emOccurSumCat.Short, emOccurSumCat.Description, " +
89 " emOccurSumCat.OutputCode, emArea.AreaId, emArea.TDWGCode, emArea.EMCode " +
90 " FROM emOccurrence INNER JOIN " +
91 " emArea ON emOccurrence.AreaFk = emArea.AreaId INNER JOIN " +
92 " PTaxon ON emOccurrence.PTNameFk = PTaxon.PTNameFk AND emOccurrence.PTRefFk = PTaxon.PTRefFk LEFT OUTER JOIN " +
93 " emOccurSumCat ON emOccurrence.SummaryStatus = emOccurSumCat.emOccurSumCatId LEFT OUTER JOIN " +
94 " emOccurrenceSource ON emOccurrence.OccurrenceId = emOccurrenceSource.OccurrenceFk " +
95 " WHERE (emOccurrence.OccurrenceId IN (" + ID_LIST_TOKEN + ") )" +
96 " ORDER BY PTaxon.RIdentifier";
97 return strQuery;
98 }
99
100 /* (non-Javadoc)
101 * @see eu.etaxonomy.cdm.io.berlinModel.in.IPartitionedIO#doPartition(eu.etaxonomy.cdm.io.berlinModel.in.ResultSetPartitioner, eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportState)
102 */
103 public boolean doPartition(ResultSetPartitioner partitioner, BerlinModelImportState state) {
104 boolean success = true;
105 Set<TaxonBase> taxaToSave = new HashSet<TaxonBase>();
106
107 Map<String, TaxonBase> taxonMap = (Map<String, TaxonBase>) partitioner.getObjectMap(BerlinModelTaxonImport.NAMESPACE);
108
109 ResultSet rs = partitioner.getResultSet();
110
111 try {
112 //map to store the mapping of duplicate berlin model occurrences to their real distributions
113 //duplicated may occurr due to area mappings from BM areas to TDWG areas
114 Map<Integer, String> duplicateMap = new HashMap<Integer, String>();
115 int oldTaxonId = -1;
116 TaxonDescription oldDescription = null;
117 int i = 0;
118 int countDescriptions = 0;
119 int countDistributions = 0;
120 int countDuplicates = 0;
121 //for each reference
122 while (rs.next()){
123
124 if ((i++ % modCount) == 0 && i!= 1 ){ logger.info("Facts handled: " + (i-1));}
125
126 int occurrenceId = rs.getInt("OccurrenceId");
127 int newTaxonId = rs.getInt("taxonId");
128 String tdwgCodeString = rs.getString("TDWGCode");
129 String emCodeString = rs.getString("EMCode");
130 Integer emStatusId = (Integer)rs.getObject("emOccurSumCatId");
131
132 try {
133 //status
134 PresenceAbsenceTermBase<?> status = null;
135 String alternativeStatusString = null;
136 if (emStatusId != null){
137 status = BerlinModelTransformer.occStatus2PresenceAbsence(emStatusId);
138 }else{
139 String[] stringArray = new String[]{rs.getString("Native"), rs.getString("Introduced"), rs.getString("Cultivated")};
140 alternativeStatusString = CdmUtils.concat(",", stringArray);
141 }
142
143 //Create area list
144 List<NamedArea> areas = new ArrayList<NamedArea>();
145 if (tdwgCodeString != null){
146
147 String[] tdwgCodes = new String[]{tdwgCodeString};
148 if (state.getConfig().isSplitTdwgCodes()){
149 tdwgCodes = tdwgCodeString.split(";");
150 }
151
152 for (String tdwgCode : tdwgCodes){
153 NamedArea area = TdwgArea.getAreaByTdwgAbbreviation(tdwgCode.trim());
154 if (area == null){
155 area = getOtherAreas(state, emCodeString, tdwgCodeString);
156 }
157 if (area != null){
158 areas.add(area);
159 }
160 }
161 }
162
163 Reference<?> sourceRef = state.getTransactionalSourceReference();
164 //create description(elements)
165 TaxonDescription taxonDescription = getTaxonDescription(newTaxonId, oldTaxonId, oldDescription, taxonMap, occurrenceId, sourceRef);
166 if (areas.size()== 0){
167 NamedArea area = getOtherAreas(state, emCodeString, tdwgCodeString);
168 if (area != null){
169 areas.add(area);
170 }
171 }
172 if (areas.size() == 0){
173 String areaId = rs.getString("AreaId");
174 logger.warn("No areas defined for occurrence " + occurrenceId + ". EMCode: " + CdmUtils.Nz(emCodeString).trim() + ". AreaId: " + areaId );
175 }
176 for (NamedArea area : areas){
177 Distribution distribution = Distribution.NewInstance(area, status);
178 if (status == null){
179 AnnotationType annotationType = AnnotationType.EDITORIAL();
180 Annotation annotation = Annotation.NewInstance(alternativeStatusString, annotationType, null);
181 distribution.addAnnotation(annotation);
182 distribution.addMarker(Marker.NewInstance(MarkerType.PUBLISH(), false));
183 }
184 // distribution.setCitation(sourceRef);
185 if (taxonDescription != null) {
186 Distribution duplicate = checkIsNoDuplicate(taxonDescription, distribution, duplicateMap , occurrenceId);
187 if (duplicate == null){
188 taxonDescription.addElement(distribution);
189 distribution.addSource(String.valueOf(occurrenceId), NAMESPACE, state.getTransactionalSourceReference(), null);
190 countDistributions++;
191 if (taxonDescription != oldDescription){
192 taxaToSave.add(taxonDescription.getTaxon());
193 oldDescription = taxonDescription;
194 countDescriptions++;
195 }
196 }else{
197 countDuplicates++;
198 duplicate.addSource(String.valueOf(occurrenceId), NAMESPACE, state.getTransactionalSourceReference(), null);
199 logger.info("Distribution is duplicate"); }
200 } else {
201 logger.warn("Distribution " + area.getLabel() + " ignored. OccurrenceId = " + occurrenceId);
202 success = false;
203 }
204 }
205
206 } catch (UnknownCdmTypeException e) {
207 logger.error("Unknown presenceAbsence status id: " + emStatusId);
208 e.printStackTrace();
209 success = false;
210 }
211
212 }
213
214 logger.info("Distributions: " + countDistributions + ", Descriptions: " + countDescriptions );
215 logger.info("Duplicate occurrences: " + (countDuplicates));
216
217 logger.info("Taxa to save: " + taxaToSave.size());
218 getTaxonService().save(taxaToSave);
219
220 return success;
221 } catch (SQLException e) {
222 logger.error("SQLException:" + e);
223 return false;
224 }
225 }
226
227
228
229 /* (non-Javadoc)
230 * @see eu.etaxonomy.cdm.io.berlinModel.in.IPartitionedIO#getRelatedObjectsForPartition(java.sql.ResultSet)
231 */
232 public Map<Object, Map<String, ? extends CdmBase>> getRelatedObjectsForPartition(ResultSet rs) {
233 String nameSpace;
234 Class cdmClass;
235 Set<String> idSet;
236 Map<Object, Map<String, ? extends CdmBase>> result = new HashMap<Object, Map<String, ? extends CdmBase>>();
237
238 try{
239 Set<String> taxonIdSet = new HashSet<String>();
240 while (rs.next()){
241 handleForeignKey(rs, taxonIdSet, "taxonId");
242 }
243
244 //taxon map
245 nameSpace = BerlinModelTaxonImport.NAMESPACE;
246 cdmClass = TaxonBase.class;
247 idSet = taxonIdSet;
248 Map<String, TaxonBase> objectMap = (Map<String, TaxonBase>)getCommonService().getSourcedObjectsByIdInSource(cdmClass, idSet, nameSpace);
249 result.put(nameSpace, objectMap);
250
251 } catch (SQLException e) {
252 throw new RuntimeException(e);
253 }
254 return result;
255 }
256
257
258
259 /**
260 * Tests if a distribution with the same tdwgArea and the same status already exists in the description.
261 * If so the old distribution is returned
262 * @param description
263 * @param tdwgArea
264 * @return false, if dupplicate exists. True otherwise.
265 */
266 private Distribution checkIsNoDuplicate(TaxonDescription description, Distribution distribution, Map<Integer, String> duplicateMap, Integer bmDistributionId){
267 for (DescriptionElementBase descElBase : description.getElements()){
268 if (descElBase.isInstanceOf(Distribution.class)){
269 Distribution oldDistr = HibernateProxyHelper.deproxy(descElBase, Distribution.class);
270 NamedArea oldArea = oldDistr.getArea();
271 if (oldArea != null && oldArea.equals(distribution.getArea())){
272 PresenceAbsenceTermBase<?> oldStatus = oldDistr.getStatus();
273 if (oldStatus != null && oldStatus.equals(distribution.getStatus())){
274 duplicateMap.put(bmDistributionId, oldDistr.getSources().iterator().next().getIdInSource());
275 return oldDistr;
276 }
277 }
278 }
279 }
280 return null;
281 }
282
283 /**
284 * Use same TaxonDescription if two records belong to the same taxon
285 * @param newTaxonId
286 * @param oldTaxonId
287 * @param oldDescription
288 * @param taxonMap
289 * @return
290 */
291 private TaxonDescription getTaxonDescription(int newTaxonId, int oldTaxonId, TaxonDescription oldDescription, Map<String, TaxonBase> taxonMap, int occurrenceId, Reference<?> sourceSec){
292 TaxonDescription result = null;
293 if (oldDescription == null || newTaxonId != oldTaxonId){
294 TaxonBase taxonBase = taxonMap.get(String.valueOf(newTaxonId));
295 //TODO for testing
296 //TaxonBase taxonBase = Taxon.NewInstance(BotanicalName.NewInstance(Rank.SPECIES()), null);
297 Taxon taxon;
298 if ( taxonBase instanceof Taxon ) {
299 taxon = (Taxon) taxonBase;
300 } else if (taxonBase != null) {
301 logger.warn("TaxonBase for Occurrence " + occurrenceId + " was not of type Taxon but: " + taxonBase.getClass().getSimpleName());
302 return null;
303 } else {
304 logger.warn("TaxonBase for Occurrence " + occurrenceId + " is null.");
305 return null;
306 }
307 Set<TaxonDescription> descriptionSet= taxon.getDescriptions();
308 if (descriptionSet.size() > 0) {
309 result = descriptionSet.iterator().next();
310 }else{
311 result = TaxonDescription.NewInstance();
312 result.setTitleCache(sourceSec.getTitleCache(), true);
313 taxon.addDescription(result);
314 }
315 }else{
316 result = oldDescription;
317 }
318 return result;
319 }
320
321
322 /* (non-Javadoc)
323 * @see eu.etaxonomy.cdm.io.common.CdmIoBase#doCheck(eu.etaxonomy.cdm.io.common.IoStateBase)
324 */
325 @Override
326 protected boolean doCheck(BerlinModelImportState state){
327 IOValidator<BerlinModelImportState> validator = new BerlinModelOccurrenceImportValidator();
328 return validator.validate(state);
329 }
330
331 /* (non-Javadoc)
332 * @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getTableName()
333 */
334 @Override
335 protected String getTableName() {
336 return dbTableName;
337 }
338
339 /* (non-Javadoc)
340 * @see eu.etaxonomy.cdm.io.berlinModel.in.BerlinModelImportBase#getPluralString()
341 */
342 @Override
343 public String getPluralString() {
344 return pluralString;
345 }
346
347 /* (non-Javadoc)
348 * @see eu.etaxonomy.cdm.io.common.CdmIoBase#isIgnore(eu.etaxonomy.cdm.io.common.IImportConfigurator)
349 */
350 protected boolean isIgnore(BerlinModelImportState state){
351 return ! state.getConfig().isDoOccurrence();
352 }
353
354 }