Project

General

Profile

Download (9.19 KB) Statistics
| Branch: | Tag: | Revision:
1
/**
2
* Copyright (C) 2009 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.io.dwca.in;
10

    
11
import java.io.Closeable;
import java.io.File;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;

import au.com.bytecode.opencsv.CSVParser;
import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import eu.etaxonomy.cdm.io.dwca.jaxb.Archive;
import eu.etaxonomy.cdm.io.dwca.jaxb.ArchiveEntryBase;
import eu.etaxonomy.cdm.io.dwca.jaxb.Extension;
import eu.etaxonomy.cdm.io.dwca.out.DwcaMetaDataRecord;
import eu.etaxonomy.cdm.io.stream.CsvStream;
import eu.etaxonomy.cdm.io.stream.IReader;
import eu.etaxonomy.cdm.io.stream.ListReader;
import eu.etaxonomy.cdm.io.stream.terms.TermUri;
44

    
45
/**
46
 * This class transforms a Darwin Core Archive zip file into a set of CSVReaderInputStreams.
47
 * For each data file included in the zip it creates one stream by evaluating the meta file.
48
 * Ecological metadata handling is still unclear.
49
 * @author a.mueller
50
 \* @since 17.10.2011
51
 *
52
 */
53
public class DwcaZipToStreamConverter<STATE extends DwcaImportState> {
54
	private static Logger logger = Logger.getLogger(DwcaZipToStreamConverter.class);
55

    
56
	private final String META_XML = "meta.xml";
57
	protected static final boolean IS_CORE = true;
58

    
59
	private final List<TermUri> extensionList = Arrays.asList(
60
			TermUri.EOL_AGENT,
61
			TermUri.DWC_RESOURCE_RELATIONSHIP,
62
			TermUri.GBIF_TYPES_AND_SPECIMEN,
63
			TermUri.GBIF_VERNACULAR_NAMES,
64
			TermUri.GBIF_IDENTIFIER,
65
			TermUri.GBIF_SPECIES_PROFILE,
66
			TermUri.GBIF_REFERENCE,
67
			TermUri.GBIF_DESCRIPTION,
68
			TermUri.GBIF_DISTRIBUTION,
69
			TermUri.GBIF_IMAGE
70

    
71
	);
72

    
73
	private final URI dwcaZip;
74
	private final Map<String, DwcaMetaDataRecord> metaRecords = new HashMap<>();
75
	private Archive archive;
76

    
77
/// ******************** FACTORY ********************************/
78

    
79
	public static DwcaZipToStreamConverter<DwcaImportState> NewInstance(URI dwcaZip){
80
		return new DwcaZipToStreamConverter(dwcaZip);
81
	}
82

    
83

    
84
//************************ CONSTRUCTOR *********************************/
85

    
86
	/**
87
	 * Constructor
88
	 * @param dwcaZip
89
	 */
90
	public DwcaZipToStreamConverter(URI dwcaZip) {
91
		this.dwcaZip = dwcaZip;
92
		initArchive();
93
	}
94

    
95

    
96
	protected Archive getArchive(){
97
			return this.archive;
98
	}
99

    
100
	public CsvStream getCoreStream(STATE state) throws IOException{
101
		initArchive();
102
		ArchiveEntryBase core = archive.getCore();
103
		return makeStream(core, state);
104
	}
105

    
106
	public CsvStream getStream(String rowType, STATE state) throws IOException{
107
		initArchive();
108

    
109
		ArchiveEntryBase archiveEntry = null;
110
		List<Extension> extensions = archive.getExtension();
111
		for (Extension extension : extensions){
112
			if (rowType.equalsIgnoreCase(extension.getRowType())){
113
				archiveEntry = extension;
114
				break;
115
			}
116
		}
117
		return makeStream(archiveEntry, state);
118
	}
119

    
120
	public CsvStream getStream(TermUri rowType, STATE state) throws IOException{
121
		return getStream(rowType.getUriString(), state);
122
	}
123

    
124
	public IReader<CsvStream> getEntriesStream(STATE state){
125
		//core
126
		List<CsvStream> streamList = new ArrayList<>();
127
		try {
128
			if (state.getConfig().isDoTaxa()){
129
			    streamList.add(getCoreStream(state)); //for taxa and names
130
			}
131
		} catch (IOException e) {
132
			String message = "Core stream not available for %s: %s";
133
			//FIXME fire event (also in following code)
134
			logger.warn(String.format(message, "taxa", e.getMessage()));
135
			state.setSuccess(false);
136
		}
137
		//core relationships
138
		try {
139
			if (state.getConfig().isDoTaxonRelationships()){
140
			    streamList.add(getCoreStream(state)); //for taxon and name relations
141
			}
142
		} catch (IOException e) {
143
			String message = "Core stream not available for %s: %s";
144
			logger.warn(String.format(message, "taxon relations", e.getMessage()));
145
			state.setSuccess(false);
146
		}
147
		//extensions
148
		for (TermUri extension : extensionList){
149
			CsvStream extensionStream;
150
			try {
151
	            if (state.getConfig().isDoExtensions()){
152
	                extensionStream = getStream(extension, state);
153
	                if (extensionStream != null){
154
	                    streamList.add(extensionStream);
155
	                }
156
	            }
157
			} catch (IOException e) {
158
				String message = "Extension stream not available for extension %s: %s";
159
				logger.warn(String.format(message, extension.getUriString(), e.getMessage()));
160
				state.setSuccess(false);
161
			}
162
		}
163
		IReader<CsvStream> result = new ListReader<>(streamList);
164
		return result;
165
	}
166

    
167

    
168
	/**
169
	 * Creates the CsvStream for an archive entry. Returns null if archive entry is null.
170
	 * @param archiveEntry
171
	 * @param state
172
	 * @return
173
	 * @throws IOException
174
	 * @throws UnsupportedEncodingException
175
	 */
176
	private CsvStream makeStream(ArchiveEntryBase archiveEntry, STATE state) throws IOException, UnsupportedEncodingException {
177
		if (archiveEntry == null){
178
			return null;
179
		}
180

    
181
		char fieldTerminatedBy = StringUtils.isEmpty(archiveEntry.getFieldsTerminatedBy()) ? CSVParser.DEFAULT_SEPARATOR : archiveEntry.getFieldsTerminatedBy().charAt(0);
182
		// default is a kind of 'null' quote, which tells opencsv to ignore the enclosing quotes
183
		char fieldsEnclosedBy = CSVWriter.NO_QUOTE_CHARACTER;
184
		if(state == null || !state.getConfig().isNoQuotes()) {
185
		        fieldsEnclosedBy= StringUtils.isEmpty(archiveEntry.getFieldsEnclosedBy()) ? CSVParser.DEFAULT_QUOTE_CHARACTER: archiveEntry.getFieldsEnclosedBy().charAt(0);
186
		}
187
		boolean ignoreHeader = archiveEntry.getIgnoreHeaderLines();
188
		String linesTerminatedBy = archiveEntry.getLinesTerminatedBy();
189
		String encoding = archiveEntry.getEncoding();
190
		int skipLines = ignoreHeader? 1 : 0;
191

    
192
		String fileLocation = archiveEntry.getFiles().getLocation();
193
		InputStream coreCsvInputStream = makeInputStream(fileLocation);
194
		Reader coreReader = new InputStreamReader(coreCsvInputStream, encoding);
195
		CSVReader csvReader = new CSVReader(coreReader, fieldTerminatedBy,fieldsEnclosedBy, skipLines);
196
		CsvStream csvStream = new CsvStream(csvReader, archiveEntry, skipLines);
197

    
198
		//		InputStream s;
199
//		s.
200

    
201
		return csvStream;
202
	}
203

    
204

    
205
	private void initArchive() {
206
		if (archive == null){
207
			try {
208
				InputStream metaInputStream = makeInputStream(META_XML);
209

    
210
				JAXBContext jaxbContext = JAXBContext.newInstance("eu.etaxonomy.cdm.io.dwca.jaxb");
211
				Unmarshaller unmarshaller =  jaxbContext.createUnmarshaller();
212
				archive = (Archive)unmarshaller.unmarshal(metaInputStream);
213

    
214
				validateArchive(archive);
215
			} catch (IOException e) {
216
				throw new RuntimeException(e);
217
			} catch (JAXBException e) {
218
				throw new RuntimeException(e);
219
			}
220
		}
221
	}
222

    
223

    
224
	private void validateArchive(Archive archive) {
225
		if (archive.getCore().getFieldsTerminatedBy() != null && archive.getCore().getFieldsTerminatedBy().length() > 1){
226
			if (archive.getCore().getFieldsTerminatedBy().equals("\\t") ){
227
				//TODO handle, TODO also handle other \xxx delimiter
228
			}else{
229
				throw new IllegalStateException("CsvReader does not allow field delimiters with more than 1 character. ");
230
			}
231
		}
232
		if (archive.getCore().getFieldsEnclosedBy() != null && archive.getCore().getFieldsEnclosedBy().length() > 1){
233
			throw new IllegalStateException("CsvReader does not allow field delimiters with more than 1 character");
234
		}
235

    
236
	}
237

    
238
//
239
//	/**
240
//	 * @return
241
//	 * @throws IOException
242
//	 */
243
//	private InputStream makeInputStream(String name) throws IOException {
244
//
245
//		ZipInputStream zin = new ZipInputStream(dwcaZip.toURL().openStream());
246
//		ZipEntry ze = zin.getNextEntry();
247
//		while (!ze.getName().equals(name)) {
248
//		    zin.closeEntry(); // not sure whether this is necessary
249
//		    ze = zin.getNextEntry();
250
//		}
251
//
252
//		CheckedInputStream cis = new CheckedInputStream(in, cksum)
253
//
254
//		InputStream metaInputStream = zip.getInputStream(ze);
255
//		return metaInputStream;
256
//
257
//		InputStream metaInputStream = zip.getInputStream(metaEntry);
258
//		return metaInputStream;
259
//	}
260
//
261

    
262
	/**
263
	 * @return
264
	 * @throws IOException
265
	 */
266
	private InputStream makeInputStream(String name) throws IOException {
267
		File file = new File(dwcaZip);
268
		if (! file.isFile() || ! file.exists()){
269
			String message = "URI is not a file: %s";
270
			throw new IOException(String.format(message, dwcaZip.toString()));
271
		}
272
		ZipFile zip = new ZipFile(file, ZipFile.OPEN_READ);
273

    
274
		//Enumeration<? extends ZipEntry> zipEntries = zip.entries();
275
		//ze = new ZipEntry(name);
276
		ZipEntry metaEntry = zip.getEntry(name);
277

    
278
		//Lorna added this to deal with Scratchpads dwca.zip files which when unzipped have a directory dwca/ which contains the files
279
		if (metaEntry == null) {
280
			metaEntry = zip.getEntry("dwca/" + name);
281
		}
282
		if (metaEntry == null){
283
			String message = "Zip entry for %s not available";
284
			throw new IOException(String.format(message, name));
285
		}
286
		InputStream metaInputStream = zip.getInputStream(metaEntry);
287
		return metaInputStream;
288
	}
289

    
290

    
291

    
292

    
293
}
(10-10/17)