Project

General

Profile

Download (9.08 KB) Statistics
| Branch: | Tag: | Revision:
1
/**
2
* Copyright (C) 2009 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.io.dwca.in;
10

    
11
import java.io.File;
12
import java.io.IOException;
13
import java.io.InputStream;
14
import java.io.InputStreamReader;
15
import java.io.Reader;
16
import java.io.UnsupportedEncodingException;
17
import java.net.URI;
18
import java.util.ArrayList;
19
import java.util.Arrays;
20
import java.util.HashMap;
21
import java.util.List;
22
import java.util.Map;
23
import java.util.zip.ZipEntry;
24
import java.util.zip.ZipFile;
25

    
26
import javax.xml.bind.JAXBContext;
27
import javax.xml.bind.JAXBException;
28
import javax.xml.bind.Unmarshaller;
29

    
30
import org.apache.commons.lang.StringUtils;
31
import org.apache.log4j.Logger;
32

    
33
import au.com.bytecode.opencsv.CSVParser;
34
import au.com.bytecode.opencsv.CSVReader;
35
import au.com.bytecode.opencsv.CSVWriter;
36
import eu.etaxonomy.cdm.io.dwca.TermUri;
37
import eu.etaxonomy.cdm.io.dwca.jaxb.Archive;
38
import eu.etaxonomy.cdm.io.dwca.jaxb.ArchiveEntryBase;
39
import eu.etaxonomy.cdm.io.dwca.jaxb.Extension;
40
import eu.etaxonomy.cdm.io.dwca.out.DwcaMetaDataRecord;
41

    
42
/**
43
 * This class transforms a Darwin Core Archive zip file into a set of CSVReaderInputStreams.
44
 * For each data file included in the zip it creates one stream by evaluating the meta file.
45
 * Ecological metadata handling is still unclear.
46
 * @author a.mueller
47
 * @date 17.10.2011
48
 *
49
 */
50
public class DwcaZipToStreamConverter<STATE extends DwcaImportState> {
51
	private static Logger logger = Logger.getLogger(DwcaZipToStreamConverter.class);
52

    
53
	private final String META_XML = "meta.xml";
54
	protected static final boolean IS_CORE = true;
55

    
56
	private final List<TermUri> extensionList = Arrays.asList(
57
			TermUri.EOL_AGENT,
58
			TermUri.DWC_RESOURCE_RELATIONSHIP,
59
			TermUri.GBIF_TYPES_AND_SPECIMEN,
60
			TermUri.GBIF_VERNACULAR_NAMES,
61
			TermUri.GBIF_IDENTIFIER,
62
			TermUri.GBIF_SPECIES_PROFILE,
63
			TermUri.GBIF_REFERENCE,
64
			TermUri.GBIF_DESCRIPTION,
65
			TermUri.GBIF_DISTRIBUTION,
66
			TermUri.GBIF_IMAGE
67

    
68
	);
69

    
70
	private final URI dwcaZip;
71
	private final Map<String, DwcaMetaDataRecord> metaRecords = new HashMap<String, DwcaMetaDataRecord>();
72
	private Archive archive;
73

    
74
/// ******************** FACTORY ********************************/
75

    
76
	public static DwcaZipToStreamConverter NewInstance(URI dwcaZip){
77
		return new DwcaZipToStreamConverter(dwcaZip);
78
	}
79

    
80

    
81
//************************ CONSTRUCTOR *********************************/
82

    
83
	/**
	 * Creates a converter for the given zip file and immediately reads and
	 * validates its meta file. An I/O or XML binding failure while doing so
	 * is rethrown wrapped in a {@link RuntimeException}.
	 *
	 * @param dwcaZip the URI of the Darwin Core Archive zip file
	 */
	public DwcaZipToStreamConverter(URI dwcaZip) {
		this.dwcaZip = dwcaZip;
		//requires this.dwcaZip to be set, therefore it must stay the last statement
		initArchive();
	}
91

    
92

    
93
	protected Archive getArchive(){
94
			return this.archive;
95
	}
96

    
97
	public CsvStream getCoreStream(STATE state) throws IOException{
98
		initArchive();
99
		ArchiveEntryBase core = archive.getCore();
100
		return makeStream(core, state);
101
	}
102

    
103
	public CsvStream getStream(String rowType, STATE state) throws IOException{
104
		initArchive();
105

    
106
		ArchiveEntryBase archiveEntry = null;
107
		List<Extension> extensions = archive.getExtension();
108
		for (Extension extension : extensions){
109
			if (rowType.equalsIgnoreCase(extension.getRowType())){
110
				archiveEntry = extension;
111
				break;
112
			}
113
		}
114
		return makeStream(archiveEntry, state);
115
	}
116

    
117
	public CsvStream getStream(TermUri rowType, STATE state) throws IOException{
118
		return getStream(rowType.getUriString(), state);
119
	}
120

    
121
	public IReader<CsvStream> getEntriesStream(STATE state){
122
		//core
123
		List<CsvStream> streamList = new ArrayList<CsvStream>();
124
		try {
125
			if (state.getConfig().isDoTaxa()){
126
			    streamList.add(getCoreStream(state)); //for taxa and names
127
			}
128
		} catch (IOException e) {
129
			String message = "Core stream not available for %s: %s";
130
			//FIXME fire event (also in following code)
131
			logger.warn(String.format(message, "taxa", e.getMessage()));
132
			state.setSuccess(false);
133
		}
134
		//core relationships
135
		try {
136
			if (state.getConfig().isDoTaxonRelationships()){
137
			    streamList.add(getCoreStream(state)); //for taxon and name relations
138
			}
139
		} catch (IOException e) {
140
			String message = "Core stream not available for %s: %s";
141
			logger.warn(String.format(message, "taxon relations", e.getMessage()));
142
			state.setSuccess(false);
143
		}
144
		//extensions
145
		for (TermUri extension : extensionList){
146
			CsvStream extensionStream;
147
			try {
148
	            if (state.getConfig().isDoExtensions()){
149
	                extensionStream = getStream(extension, state);
150
	                if (extensionStream != null){
151
	                    streamList.add(extensionStream);
152
	                }
153
	            }
154
			} catch (IOException e) {
155
				String message = "Extension stream not available for extension %s: %s";
156
				logger.warn(String.format(message, extension.getUriString(), e.getMessage()));
157
				state.setSuccess(false);
158
			}
159
		}
160
		IReader<CsvStream> result = new ListReader<CsvStream>(streamList);
161
		return result;
162
	}
163

    
164

    
165
	/**
166
	 * Creates the CsvStream for an archive entry. Returns null if archive entry is null.
167
	 * @param archiveEntry
168
	 * @param state
169
	 * @return
170
	 * @throws IOException
171
	 * @throws UnsupportedEncodingException
172
	 */
173
	private CsvStream makeStream(ArchiveEntryBase archiveEntry, STATE state) throws IOException, UnsupportedEncodingException {
174
		if (archiveEntry == null){
175
			return null;
176
		}
177

    
178
		char fieldTerminatedBy = StringUtils.isEmpty(archiveEntry.getFieldsTerminatedBy()) ? CSVParser.DEFAULT_SEPARATOR : archiveEntry.getFieldsTerminatedBy().charAt(0);
179
		// default is a kind of 'null' quote, which tells opencsv to ignore the enclosing quotes
180
		char fieldsEnclosedBy = CSVWriter.NO_QUOTE_CHARACTER;
181
		if(state == null || !state.getConfig().isNoQuotes()) {
182
		        fieldsEnclosedBy= StringUtils.isEmpty(archiveEntry.getFieldsEnclosedBy()) ? CSVParser.DEFAULT_QUOTE_CHARACTER: archiveEntry.getFieldsEnclosedBy().charAt(0);
183
		}
184
		boolean ignoreHeader = archiveEntry.getIgnoreHeaderLines();
185
		String linesTerminatedBy = archiveEntry.getLinesTerminatedBy();
186
		String encoding = archiveEntry.getEncoding();
187
		int skipLines = ignoreHeader? 1 : 0;
188

    
189
		String fileLocation = archiveEntry.getFiles().getLocation();
190
		InputStream coreCsvInputStream = makeInputStream(fileLocation);
191
		Reader coreReader = new InputStreamReader(coreCsvInputStream, encoding);
192
		CSVReader csvReader = new CSVReader(coreReader, fieldTerminatedBy,fieldsEnclosedBy, skipLines);
193
		CsvStream csvStream = new CsvStream(csvReader, archiveEntry, skipLines);
194

    
195
		//		InputStream s;
196
//		s.
197

    
198
		return csvStream;
199
	}
200

    
201

    
202
	private void initArchive() {
203
		if (archive == null){
204
			try {
205
				InputStream metaInputStream = makeInputStream(META_XML);
206

    
207
				JAXBContext jaxbContext = JAXBContext.newInstance("eu.etaxonomy.cdm.io.dwca.jaxb");
208
				Unmarshaller unmarshaller =  jaxbContext.createUnmarshaller();
209
				archive = (Archive)unmarshaller.unmarshal(metaInputStream);
210

    
211
				validateArchive(archive);
212
			} catch (IOException e) {
213
				throw new RuntimeException(e);
214
			} catch (JAXBException e) {
215
				throw new RuntimeException(e);
216
			}
217
		}
218
	}
219

    
220

    
221
	private void validateArchive(Archive archive) {
222
		if (archive.getCore().getFieldsTerminatedBy() != null && archive.getCore().getFieldsTerminatedBy().length() > 1){
223
			if (archive.getCore().getFieldsTerminatedBy().equals("\\t") ){
224
				//TODO handle, TODO also handle other \xxx delimiter
225
			}else{
226
				throw new IllegalStateException("CsvReader does not allow field delimiters with more than 1 character. ");
227
			}
228
		}
229
		if (archive.getCore().getFieldsEnclosedBy() != null && archive.getCore().getFieldsEnclosedBy().length() > 1){
230
			throw new IllegalStateException("CsvReader does not allow field delimiters with more than 1 character");
231
		}
232

    
233
	}
234

    
235
//
236
//	/**
237
//	 * @return
238
//	 * @throws IOException
239
//	 */
240
//	private InputStream makeInputStream(String name) throws IOException {
241
//
242
//		ZipInputStream zin = new ZipInputStream(dwcaZip.toURL().openStream());
243
//		ZipEntry ze = zin.getNextEntry();
244
//		while (!ze.getName().equals(name)) {
245
//		    zin.closeEntry(); // not sure whether this is necessary
246
//		    ze = zin.getNextEntry();
247
//		}
248
//
249
//		CheckedInputStream cis = new CheckedInputStream(in, cksum)
250
//
251
//		InputStream metaInputStream = zip.getInputStream(ze);
252
//		return metaInputStream;
253
//
254
//		InputStream metaInputStream = zip.getInputStream(metaEntry);
255
//		return metaInputStream;
256
//	}
257
//
258

    
259
	/**
260
	 * @return
261
	 * @throws IOException
262
	 */
263
	private InputStream makeInputStream(String name) throws IOException {
264
		File file = new File(dwcaZip);
265
		if (! file.isFile() || ! file.exists()){
266
			String message = "URI is not a file: %s";
267
			throw new IOException(String.format(message, dwcaZip.toString()));
268
		}
269
		ZipFile zip = new ZipFile(file, ZipFile.OPEN_READ);
270

    
271
		//Enumeration<? extends ZipEntry> zipEntries = zip.entries();
272
		//ze = new ZipEntry(name);
273
		ZipEntry metaEntry = zip.getEntry(name);
274

    
275
		//Lorna added this to deal with Scratchpads dwca.zip files which when unzipped have a directory dwca/ which contains the files
276
		if (metaEntry == null) {
277
			metaEntry = zip.getEntry("dwca/" + name);
278
		}
279
		if (metaEntry == null){
280
			String message = "Zip entry for %s not available";
281
			throw new IOException(String.format(message, name));
282
		}
283
		InputStream metaInputStream = zip.getInputStream(metaEntry);
284
		return metaInputStream;
285
	}
286

    
287

    
288

    
289

    
290
}
(13-13/37)