1
|
/**
|
2
|
* Copyright (C) 2009 EDIT
|
3
|
* European Distributed Institute of Taxonomy
|
4
|
* http://www.e-taxonomy.eu
|
5
|
*
|
6
|
* The contents of this file are subject to the Mozilla Public License Version 1.1
|
7
|
* See LICENSE.TXT at the top of this package for the full license terms.
|
8
|
*/
|
9
|
package eu.etaxonomy.cdm.io.dwca.in;
|
10
|
|
11
|
import java.io.File;
|
12
|
import java.io.IOException;
|
13
|
import java.io.InputStream;
|
14
|
import java.io.InputStreamReader;
|
15
|
import java.io.Reader;
|
16
|
import java.io.UnsupportedEncodingException;
|
17
|
import java.net.URI;
|
18
|
import java.util.ArrayList;
|
19
|
import java.util.Arrays;
|
20
|
import java.util.HashMap;
|
21
|
import java.util.List;
|
22
|
import java.util.Map;
|
23
|
import java.util.zip.ZipEntry;
|
24
|
import java.util.zip.ZipFile;
|
25
|
|
26
|
import javax.xml.bind.JAXBContext;
|
27
|
import javax.xml.bind.JAXBException;
|
28
|
import javax.xml.bind.Unmarshaller;
|
29
|
|
30
|
import org.apache.commons.lang.StringUtils;
|
31
|
import org.apache.log4j.Logger;
|
32
|
|
33
|
import au.com.bytecode.opencsv.CSVParser;
|
34
|
import au.com.bytecode.opencsv.CSVReader;
|
35
|
import au.com.bytecode.opencsv.CSVWriter;
|
36
|
import eu.etaxonomy.cdm.io.dwca.jaxb.Archive;
|
37
|
import eu.etaxonomy.cdm.io.dwca.jaxb.ArchiveEntryBase;
|
38
|
import eu.etaxonomy.cdm.io.dwca.jaxb.Extension;
|
39
|
import eu.etaxonomy.cdm.io.dwca.out.DwcaMetaDataRecord;
|
40
|
import eu.etaxonomy.cdm.io.stream.CsvStream;
|
41
|
import eu.etaxonomy.cdm.io.stream.IReader;
|
42
|
import eu.etaxonomy.cdm.io.stream.ListReader;
|
43
|
import eu.etaxonomy.cdm.io.stream.terms.TermUri;
|
44
|
|
45
|
/**
|
46
|
* This class transforms a Darwin Core Archive zip file into a set of CSVReaderInputStreams.
|
47
|
* For each data file included in the zip it creates one stream by evaluating the meta file.
|
48
|
* Ecological metadata handling is still unclear.
|
49
|
* @author a.mueller
|
50
|
 * @since 17.10.2011
|
51
|
*
|
52
|
*/
|
53
|
public class DwcaZipToStreamConverter<STATE extends DwcaImportState> {
|
54
|
private static Logger logger = Logger.getLogger(DwcaZipToStreamConverter.class);
|
55
|
|
56
|
private final String META_XML = "meta.xml";
|
57
|
protected static final boolean IS_CORE = true;
|
58
|
|
59
|
private final List<TermUri> extensionList = Arrays.asList(
|
60
|
TermUri.EOL_AGENT,
|
61
|
TermUri.DWC_RESOURCE_RELATIONSHIP,
|
62
|
TermUri.GBIF_TYPES_AND_SPECIMEN,
|
63
|
TermUri.GBIF_VERNACULAR_NAMES,
|
64
|
TermUri.GBIF_IDENTIFIER,
|
65
|
TermUri.GBIF_SPECIES_PROFILE,
|
66
|
TermUri.GBIF_REFERENCE,
|
67
|
TermUri.GBIF_DESCRIPTION,
|
68
|
TermUri.GBIF_DISTRIBUTION,
|
69
|
TermUri.GBIF_IMAGE
|
70
|
|
71
|
);
|
72
|
|
73
|
private final URI dwcaZip;
|
74
|
private final Map<String, DwcaMetaDataRecord> metaRecords = new HashMap<>();
|
75
|
private Archive archive;
|
76
|
|
77
|
/// ******************** FACTORY ********************************/
|
78
|
|
79
|
public static DwcaZipToStreamConverter<DwcaImportState> NewInstance(URI dwcaZip){
|
80
|
return new DwcaZipToStreamConverter(dwcaZip);
|
81
|
}
|
82
|
|
83
|
|
84
|
//************************ CONSTRUCTOR *********************************/
|
85
|
|
86
|
/**
 * Creates a converter for the given archive and immediately loads
 * the archive description via initArchive().
 *
 * @param dwcaZip location of the Darwin Core Archive zip file
 * @throws RuntimeException if the meta file can not be read, unmarshalled or validated
 */
public DwcaZipToStreamConverter(URI dwcaZip) {
    this.dwcaZip = dwcaZip;
    this.initArchive();
}
|
94
|
|
95
|
|
96
|
protected Archive getArchive(){
|
97
|
return this.archive;
|
98
|
}
|
99
|
|
100
|
public CsvStream getCoreStream(STATE state) throws IOException{
|
101
|
initArchive();
|
102
|
ArchiveEntryBase core = archive.getCore();
|
103
|
return makeStream(core, state);
|
104
|
}
|
105
|
|
106
|
public CsvStream getStream(String rowType, STATE state) throws IOException{
|
107
|
initArchive();
|
108
|
|
109
|
ArchiveEntryBase archiveEntry = null;
|
110
|
List<Extension> extensions = archive.getExtension();
|
111
|
for (Extension extension : extensions){
|
112
|
if (rowType.equalsIgnoreCase(extension.getRowType())){
|
113
|
archiveEntry = extension;
|
114
|
break;
|
115
|
}
|
116
|
}
|
117
|
return makeStream(archiveEntry, state);
|
118
|
}
|
119
|
|
120
|
public CsvStream getStream(TermUri rowType, STATE state) throws IOException{
|
121
|
return getStream(rowType.getUriString(), state);
|
122
|
}
|
123
|
|
124
|
public IReader<CsvStream> getEntriesStream(STATE state){
|
125
|
//core
|
126
|
List<CsvStream> streamList = new ArrayList<>();
|
127
|
try {
|
128
|
if (state.getConfig().isDoTaxa()){
|
129
|
streamList.add(getCoreStream(state)); //for taxa and names
|
130
|
}
|
131
|
} catch (IOException e) {
|
132
|
String message = "Core stream not available for %s: %s";
|
133
|
//FIXME fire event (also in following code)
|
134
|
logger.warn(String.format(message, "taxa", e.getMessage()));
|
135
|
state.setSuccess(false);
|
136
|
}
|
137
|
//core relationships
|
138
|
try {
|
139
|
if (state.getConfig().isDoTaxonRelationships()){
|
140
|
streamList.add(getCoreStream(state)); //for taxon and name relations
|
141
|
}
|
142
|
} catch (IOException e) {
|
143
|
String message = "Core stream not available for %s: %s";
|
144
|
logger.warn(String.format(message, "taxon relations", e.getMessage()));
|
145
|
state.setSuccess(false);
|
146
|
}
|
147
|
//extensions
|
148
|
for (TermUri extension : extensionList){
|
149
|
CsvStream extensionStream;
|
150
|
try {
|
151
|
if (state.getConfig().isDoExtensions()){
|
152
|
extensionStream = getStream(extension, state);
|
153
|
if (extensionStream != null){
|
154
|
streamList.add(extensionStream);
|
155
|
}
|
156
|
}
|
157
|
} catch (IOException e) {
|
158
|
String message = "Extension stream not available for extension %s: %s";
|
159
|
logger.warn(String.format(message, extension.getUriString(), e.getMessage()));
|
160
|
state.setSuccess(false);
|
161
|
}
|
162
|
}
|
163
|
IReader<CsvStream> result = new ListReader<>(streamList);
|
164
|
return result;
|
165
|
}
|
166
|
|
167
|
|
168
|
/**
|
169
|
* Creates the CsvStream for an archive entry. Returns null if archive entry is null.
|
170
|
* @param archiveEntry
|
171
|
* @param state
|
172
|
* @return
|
173
|
* @throws IOException
|
174
|
* @throws UnsupportedEncodingException
|
175
|
*/
|
176
|
private CsvStream makeStream(ArchiveEntryBase archiveEntry, STATE state) throws IOException, UnsupportedEncodingException {
|
177
|
if (archiveEntry == null){
|
178
|
return null;
|
179
|
}
|
180
|
|
181
|
char fieldTerminatedBy = StringUtils.isEmpty(archiveEntry.getFieldsTerminatedBy()) ? CSVParser.DEFAULT_SEPARATOR : archiveEntry.getFieldsTerminatedBy().charAt(0);
|
182
|
// default is a kind of 'null' quote, which tells opencsv to ignore the enclosing quotes
|
183
|
char fieldsEnclosedBy = CSVWriter.NO_QUOTE_CHARACTER;
|
184
|
if(state == null || !state.getConfig().isNoQuotes()) {
|
185
|
fieldsEnclosedBy= StringUtils.isEmpty(archiveEntry.getFieldsEnclosedBy()) ? CSVParser.DEFAULT_QUOTE_CHARACTER: archiveEntry.getFieldsEnclosedBy().charAt(0);
|
186
|
}
|
187
|
boolean ignoreHeader = archiveEntry.getIgnoreHeaderLines();
|
188
|
String linesTerminatedBy = archiveEntry.getLinesTerminatedBy();
|
189
|
String encoding = archiveEntry.getEncoding();
|
190
|
int skipLines = ignoreHeader? 1 : 0;
|
191
|
|
192
|
String fileLocation = archiveEntry.getFiles().getLocation();
|
193
|
InputStream coreCsvInputStream = makeInputStream(fileLocation);
|
194
|
Reader coreReader = new InputStreamReader(coreCsvInputStream, encoding);
|
195
|
CSVReader csvReader = new CSVReader(coreReader, fieldTerminatedBy,fieldsEnclosedBy, skipLines);
|
196
|
CsvStream csvStream = new CsvStream(csvReader, archiveEntry, skipLines);
|
197
|
|
198
|
// InputStream s;
|
199
|
// s.
|
200
|
|
201
|
return csvStream;
|
202
|
}
|
203
|
|
204
|
|
205
|
private void initArchive() {
|
206
|
if (archive == null){
|
207
|
try {
|
208
|
InputStream metaInputStream = makeInputStream(META_XML);
|
209
|
|
210
|
JAXBContext jaxbContext = JAXBContext.newInstance("eu.etaxonomy.cdm.io.dwca.jaxb");
|
211
|
Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
|
212
|
archive = (Archive)unmarshaller.unmarshal(metaInputStream);
|
213
|
|
214
|
validateArchive(archive);
|
215
|
} catch (IOException e) {
|
216
|
throw new RuntimeException(e);
|
217
|
} catch (JAXBException e) {
|
218
|
throw new RuntimeException(e);
|
219
|
}
|
220
|
}
|
221
|
}
|
222
|
|
223
|
|
224
|
private void validateArchive(Archive archive) {
|
225
|
if (archive.getCore().getFieldsTerminatedBy() != null && archive.getCore().getFieldsTerminatedBy().length() > 1){
|
226
|
if (archive.getCore().getFieldsTerminatedBy().equals("\\t") ){
|
227
|
//TODO handle, TODO also handle other \xxx delimiter
|
228
|
}else{
|
229
|
throw new IllegalStateException("CsvReader does not allow field delimiters with more than 1 character. ");
|
230
|
}
|
231
|
}
|
232
|
if (archive.getCore().getFieldsEnclosedBy() != null && archive.getCore().getFieldsEnclosedBy().length() > 1){
|
233
|
throw new IllegalStateException("CsvReader does not allow field delimiters with more than 1 character");
|
234
|
}
|
235
|
|
236
|
}
|
237
|
|
238
|
//
|
239
|
// /**
|
240
|
// * @return
|
241
|
// * @throws IOException
|
242
|
// */
|
243
|
// private InputStream makeInputStream(String name) throws IOException {
|
244
|
//
|
245
|
// ZipInputStream zin = new ZipInputStream(dwcaZip.toURL().openStream());
|
246
|
// ZipEntry ze = zin.getNextEntry();
|
247
|
// while (!ze.getName().equals(name)) {
|
248
|
// zin.closeEntry(); // not sure whether this is necessary
|
249
|
// ze = zin.getNextEntry();
|
250
|
// }
|
251
|
//
|
252
|
// CheckedInputStream cis = new CheckedInputStream(in, cksum)
|
253
|
//
|
254
|
// InputStream metaInputStream = zip.getInputStream(ze);
|
255
|
// return metaInputStream;
|
256
|
//
|
257
|
// InputStream metaInputStream = zip.getInputStream(metaEntry);
|
258
|
// return metaInputStream;
|
259
|
// }
|
260
|
//
|
261
|
|
262
|
/**
|
263
|
* @return
|
264
|
* @throws IOException
|
265
|
*/
|
266
|
private InputStream makeInputStream(String name) throws IOException {
|
267
|
File file = new File(dwcaZip);
|
268
|
if (! file.isFile() || ! file.exists()){
|
269
|
String message = "URI is not a file: %s";
|
270
|
throw new IOException(String.format(message, dwcaZip.toString()));
|
271
|
}
|
272
|
ZipFile zip = new ZipFile(file, ZipFile.OPEN_READ);
|
273
|
|
274
|
//Enumeration<? extends ZipEntry> zipEntries = zip.entries();
|
275
|
//ze = new ZipEntry(name);
|
276
|
ZipEntry metaEntry = zip.getEntry(name);
|
277
|
|
278
|
//Lorna added this to deal with Scratchpads dwca.zip files which when unzipped have a directory dwca/ which contains the files
|
279
|
if (metaEntry == null) {
|
280
|
metaEntry = zip.getEntry("dwca/" + name);
|
281
|
}
|
282
|
if (metaEntry == null){
|
283
|
String message = "Zip entry for %s not available";
|
284
|
throw new IOException(String.format(message, name));
|
285
|
}
|
286
|
InputStream metaInputStream = zip.getInputStream(metaEntry);
|
287
|
return metaInputStream;
|
288
|
}
|
289
|
|
290
|
|
291
|
|
292
|
|
293
|
}
|