Project

General

Profile

Download (9.12 KB) Statistics
| Branch: | Tag: | Revision:
1
/**
2
* Copyright (C) 2009 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.io.dwca.in;
10

    
11
import java.io.File;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;

import au.com.bytecode.opencsv.CSVParser;
import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import eu.etaxonomy.cdm.common.URI;
import eu.etaxonomy.cdm.io.dwca.jaxb.Archive;
import eu.etaxonomy.cdm.io.dwca.jaxb.ArchiveEntryBase;
import eu.etaxonomy.cdm.io.dwca.jaxb.Extension;
import eu.etaxonomy.cdm.io.dwca.out.DwcaMetaDataRecord;
import eu.etaxonomy.cdm.io.stream.CsvStream;
import eu.etaxonomy.cdm.io.stream.IReader;
import eu.etaxonomy.cdm.io.stream.ListReader;
import eu.etaxonomy.cdm.io.stream.terms.TermUri;
44

    
45
/**
 * This class transforms a Darwin Core Archive zip file into a set of CSV streams
 * ({@link CsvStream}). For each data file included in the zip it creates one stream
 * by evaluating the meta file (meta.xml).
 * Handling of ecological metadata is still unclear.
 *
 * @author a.mueller
 * @since 17.10.2011
 */
public class DwcaZipToStreamConverter<STATE extends DwcaImportState> {
54
	private static Logger logger = Logger.getLogger(DwcaZipToStreamConverter.class);
55

    
56
	private final String META_XML = "meta.xml";
57
	protected static final boolean IS_CORE = true;
58

    
59
	private final List<TermUri> extensionList = Arrays.asList(
60
			TermUri.EOL_AGENT,
61
			TermUri.DWC_RESOURCE_RELATIONSHIP,
62
			TermUri.GBIF_TYPES_AND_SPECIMEN,
63
			TermUri.GBIF_VERNACULAR_NAMES,
64
			TermUri.GBIF_IDENTIFIER,
65
			TermUri.GBIF_SPECIES_PROFILE,
66
			TermUri.GBIF_REFERENCE,
67
			TermUri.GBIF_DESCRIPTION,
68
			TermUri.GBIF_DISTRIBUTION,
69
			TermUri.GBIF_IMAGE
70
	);
71

    
72
	private final URI dwcaZip;
73
	private final Map<String, DwcaMetaDataRecord> metaRecords = new HashMap<>();
74
	private Archive archive;
75

    
76
/// ******************** FACTORY ********************************/
77

    
78
	public static DwcaZipToStreamConverter<DwcaImportState> NewInstance(URI dwcaZip){
79
		return new DwcaZipToStreamConverter(dwcaZip);
80
	}
81

    
82
//************************ CONSTRUCTOR *********************************/
83

    
84
	/**
	 * Creates a converter for the given archive and immediately loads its meta.xml
	 * via initArchive().
	 *
	 * @param dwcaZip location of the Darwin Core Archive zip file, must not be null
	 * @throws NullPointerException if dwcaZip is null
	 * @throws RuntimeException if meta.xml cannot be read or unmarshalled
	 * @throws IllegalStateException if the validation of meta.xml fails
	 */
	public DwcaZipToStreamConverter(URI dwcaZip) {
		if (dwcaZip == null){
			//fail here with a clear message instead of deep inside the zip handling
			throw new NullPointerException("dwcaZip must not be null");
		}
		this.dwcaZip = dwcaZip;
		initArchive();
	}
88

    
89
	protected Archive getArchive(){
90
			return this.archive;
91
	}
92

    
93
	public CsvStream getCoreStream(STATE state) throws IOException{
94
		initArchive();
95
		ArchiveEntryBase core = archive.getCore();
96
		return makeStream(core, state);
97
	}
98

    
99
	public CsvStream getStream(String rowType, STATE state) throws IOException{
100
		initArchive();
101

    
102
		ArchiveEntryBase archiveEntry = null;
103
		List<Extension> extensions = archive.getExtension();
104
		for (Extension extension : extensions){
105
			if (rowType.equalsIgnoreCase(extension.getRowType())){
106
				archiveEntry = extension;
107
				break;
108
			}
109
		}
110
		return makeStream(archiveEntry, state);
111
	}
112

    
113
	public CsvStream getStream(TermUri rowType, STATE state) throws IOException{
114
		return getStream(rowType.getUriString(), state);
115
	}
116

    
117
	public IReader<CsvStream> getEntriesStream(STATE state){
118
		//core
119
		List<CsvStream> streamList = new ArrayList<>();
120
		try {
121
			if (state.getConfig().isDoTaxa()){
122
			    streamList.add(getCoreStream(state)); //for taxa and names
123
			}
124
		} catch (IOException e) {
125
			String message = "Core stream not available for %s: %s";
126
			//FIXME fire event (also in following code)
127
			logger.warn(String.format(message, "taxa", e.getMessage()));
128
			state.setSuccess(false);
129
		}
130
		//core relationships
131
		try {
132
			if (state.getConfig().isDoTaxonRelationships()){
133
			    streamList.add(getCoreStream(state)); //for taxon and name relations
134
			}
135
		} catch (IOException e) {
136
			String message = "Core stream not available for %s: %s";
137
			logger.warn(String.format(message, "taxon relations", e.getMessage()));
138
			state.setSuccess(false);
139
		}
140
		//extensions
141
		for (TermUri extension : extensionList){
142
			CsvStream extensionStream;
143
			try {
144
	            if (state.getConfig().isDoExtensions()){
145
	                extensionStream = getStream(extension, state);
146
	                if (extensionStream != null){
147
	                    streamList.add(extensionStream);
148
	                }
149
	            }
150
			} catch (IOException e) {
151
				String message = "Extension stream not available for extension %s: %s";
152
				logger.warn(String.format(message, extension.getUriString(), e.getMessage()));
153
				state.setSuccess(false);
154
			}
155
		}
156
		IReader<CsvStream> result = new ListReader<>(streamList);
157
		return result;
158
	}
159

    
160

    
161
	/**
162
	 * Creates the CsvStream for an archive entry. Returns null if archive entry is null.
163
	 * @param archiveEntry
164
	 * @param state
165
	 * @return
166
	 * @throws IOException
167
	 * @throws UnsupportedEncodingException
168
	 */
169
	private CsvStream makeStream(ArchiveEntryBase archiveEntry, STATE state) throws IOException, UnsupportedEncodingException {
170
		if (archiveEntry == null){
171
			return null;
172
		}
173

    
174
		char fieldTerminatedBy = StringUtils.isEmpty(archiveEntry.getFieldsTerminatedBy()) ? CSVParser.DEFAULT_SEPARATOR : archiveEntry.getFieldsTerminatedBy().charAt(0);
175
		// default is a kind of 'null' quote, which tells opencsv to ignore the enclosing quotes
176
		char fieldsEnclosedBy = CSVWriter.NO_QUOTE_CHARACTER;
177
		if(state == null || !state.getConfig().isNoQuotes()) {
178
		        fieldsEnclosedBy= StringUtils.isEmpty(archiveEntry.getFieldsEnclosedBy()) ? CSVParser.DEFAULT_QUOTE_CHARACTER: archiveEntry.getFieldsEnclosedBy().charAt(0);
179
		}
180
		boolean ignoreHeader = archiveEntry.getIgnoreHeaderLines();
181
		String linesTerminatedBy = archiveEntry.getLinesTerminatedBy();
182
		String encoding = archiveEntry.getEncoding();
183
		int skipLines = ignoreHeader? 1 : 0;
184

    
185
		String fileLocation = archiveEntry.getFiles().getLocation();
186
		InputStream coreCsvInputStream = makeInputStream(fileLocation);
187
		Reader coreReader = new InputStreamReader(coreCsvInputStream, encoding);
188
		CSVReader csvReader = new CSVReader(coreReader, fieldTerminatedBy,fieldsEnclosedBy, skipLines);
189
		CsvStream csvStream = new CsvStream(csvReader, archiveEntry, skipLines);
190

    
191
		//		InputStream s;
192
//		s.
193

    
194
		return csvStream;
195
	}
196

    
197

    
198
	private void initArchive() {
199
		if (archive == null){
200
			try {
201
				InputStream metaInputStream = makeInputStream(META_XML);
202

    
203
				JAXBContext jaxbContext = JAXBContext.newInstance("eu.etaxonomy.cdm.io.dwca.jaxb");
204
				Unmarshaller unmarshaller =  jaxbContext.createUnmarshaller();
205
				archive = (Archive)unmarshaller.unmarshal(metaInputStream);
206

    
207
				validateArchive(archive);
208
			} catch (IOException e) {
209
				throw new RuntimeException(e);
210
			} catch (JAXBException e) {
211
				throw new RuntimeException(e);
212
			}
213
		}
214
	}
215

    
216

    
217
	private void validateArchive(Archive archive) {
218
		if (archive.getCore().getFieldsTerminatedBy() != null && archive.getCore().getFieldsTerminatedBy().length() > 1){
219
			if (archive.getCore().getFieldsTerminatedBy().equals("\\t") ){
220
				//TODO handle, TODO also handle other \xxx delimiter
221
			}else{
222
				throw new IllegalStateException("CsvReader does not allow field delimiters with more than 1 character. ");
223
			}
224
		}
225
		if (archive.getCore().getFieldsEnclosedBy() != null && archive.getCore().getFieldsEnclosedBy().length() > 1){
226
			throw new IllegalStateException("CsvReader does not allow field delimiters with more than 1 character");
227
		}
228

    
229
	}
230

    
231
//
232
//	/**
233
//	 * @return
234
//	 * @throws IOException
235
//	 */
236
//	private InputStream makeInputStream(String name) throws IOException {
237
//
238
//		ZipInputStream zin = new ZipInputStream(dwcaZip.toURL().openStream());
239
//		ZipEntry ze = zin.getNextEntry();
240
//		while (!ze.getName().equals(name)) {
241
//		    zin.closeEntry(); // not sure whether this is necessary
242
//		    ze = zin.getNextEntry();
243
//		}
244
//
245
//		CheckedInputStream cis = new CheckedInputStream(in, cksum)
246
//
247
//		InputStream metaInputStream = zip.getInputStream(ze);
248
//		return metaInputStream;
249
//
250
//		InputStream metaInputStream = zip.getInputStream(metaEntry);
251
//		return metaInputStream;
252
//	}
253
//
254

    
255
	private InputStream makeInputStream(String name) throws IOException {
256
		File file = new File(dwcaZip.getJavaUri());
257
		if (! file.isFile() || ! file.exists()){
258
			String message = "URI is not a file: %s";
259
			throw new IOException(String.format(message, dwcaZip.toString()));
260
		}
261
		ZipFile zip = new ZipFile(file, ZipFile.OPEN_READ);
262

    
263
		//Enumeration<? extends ZipEntry> zipEntries = zip.entries();
264
		//ze = new ZipEntry(name);
265
		ZipEntry metaEntry = zip.getEntry(name);
266

    
267
		//Lorna added this to deal with Scratchpads dwca.zip files which when unzipped have a directory dwca/ which contains the files
268
		if (metaEntry == null) {
269
			metaEntry = zip.getEntry("dwca/" + name);
270
		}
271
		if (metaEntry == null){
272
			String message = "Zip entry for %s not available";
273
			throw new IOException(String.format(message, name));
274
		}
275
		InputStream metaInputStream = zip.getInputStream(metaEntry);
276
		return metaInputStream;
277
	}
278
}
(10-10/17)