Project

General

Profile

Download (9.21 KB) Statistics
| Branch: | Tag: | Revision:
// $Id$
/**
* Copyright (C) 2009 EDIT
* European Distributed Institute of Taxonomy 
* http://www.e-taxonomy.eu
* 
* The contents of this file are subject to the Mozilla Public License Version 1.1
* See LICENSE.TXT at the top of this package for the full license terms.
*/
10
package eu.etaxonomy.cdm.io.dwca.in;
11

    
12
import java.io.File;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;

import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;

import au.com.bytecode.opencsv.CSVParser;
import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import eu.etaxonomy.cdm.io.dwca.TermUri;
import eu.etaxonomy.cdm.io.dwca.jaxb.Archive;
import eu.etaxonomy.cdm.io.dwca.jaxb.ArchiveEntryBase;
import eu.etaxonomy.cdm.io.dwca.jaxb.Extension;
import eu.etaxonomy.cdm.io.dwca.out.DwcaMetaDataRecord;
43

    
44
/**
45
 * This class transforms a Darwin Core Archive zip file into a set of CSVReaderInputStreams.
46
 * For each data file included in the zip it creates one stream by evaluating the meta file.
47
 * Ecological metadata handling is still unclear.
48
 * @author a.mueller
49
 * @date 17.10.2011
50
 *
51
 */
52
public class DwcaZipToStreamConverter<STATE extends DwcaImportState> {
53
	private static Logger logger = Logger.getLogger(DwcaZipToStreamConverter.class);
54

    
55
	private final String META_XML = "meta.xml";
56
	protected static final boolean IS_CORE = true;
57
	
58
	private List<TermUri> extensionList = Arrays.asList(
59
			TermUri.EOL_AGENT,
60
			TermUri.DWC_RESOURCE_RELATIONSHIP,
61
			TermUri.GBIF_TYPES_AND_SPECIMEN,
62
			TermUri.GBIF_VERNACULAR_NAMES,
63
			TermUri.GBIF_IDENTIFIER,
64
			TermUri.GBIF_SPECIES_PROFILE,
65
			TermUri.GBIF_REFERENCE,
66
			TermUri.GBIF_DESCRIPTION,
67
			TermUri.GBIF_DISTRIBUTION,
68
			TermUri.GBIF_IMAGE
69
			
70
	);
71
			
72
	
73
	private URI dwcaZip;
74
	private Map<String, DwcaMetaDataRecord> metaRecords = new HashMap<String, DwcaMetaDataRecord>(); 
75
	private Archive archive;
76
	
77
/// ******************** FACTORY ********************************/	
78
	
79
	public static DwcaZipToStreamConverter NewInstance(URI dwcaZip){
80
		return new DwcaZipToStreamConverter(dwcaZip);
81
	}
82
	
83

    
84
//************************ CONSTRUCTOR *********************************/
85
	
86
	/**
87
	 * Constructor
88
	 * @param dwcaZip
89
	 */
90
	public DwcaZipToStreamConverter(URI dwcaZip) {
91
		this.dwcaZip = dwcaZip;
92
		initArchive();
93
	}
94
	
95

    
96
	protected Archive getArchive(){
97
			return this.archive;
98
	}
99
	
100
	public CsvStream getCoreStream(STATE state) throws IOException{
101
		initArchive();
102
		ArchiveEntryBase core = archive.getCore();
103
		return makeStream(core, state);
104
	}
105
	
106
	public CsvStream getStream(String rowType, STATE state) throws IOException{
107
		initArchive();
108
		
109
		ArchiveEntryBase archiveEntry = null; 
110
		List<Extension> extensions = archive.getExtension();
111
		for (Extension extension : extensions){
112
			if (rowType.equalsIgnoreCase(extension.getRowType())){
113
				archiveEntry = extension;
114
				break;
115
			}
116
		}
117
		return makeStream(archiveEntry, state);
118
	}
119
	
120
	public CsvStream getStream(TermUri rowType, STATE state) throws IOException{
121
		return getStream(rowType.getUriString(), state);
122
	}
123

    
124
	public IReader<CsvStream> getEntriesStream(STATE state){
125
		//core
126
		List<CsvStream> streamList = new ArrayList<CsvStream>();
127
		try {
128
			streamList.add(getCoreStream(state)); //for taxa and names
129
		} catch (IOException e) {
130
			String message = "Core stream not available for %s: %s";
131
			//FIXME fire event (also in following code)
132
			logger.warn(String.format(message, "taxa", e.getMessage()));
133
			state.setSuccess(false);
134
		} 
135
		//core relationships
136
		try {
137
			streamList.add(getCoreStream(state));//for taxon and name relations
138
		} catch (IOException e) {
139
			String message = "Core stream not available for %s: %s";
140
			logger.warn(String.format(message, "taxon relations", e.getMessage()));
141
			state.setSuccess(false);
142
		} 
143
		//extensions
144
		for (TermUri extension : extensionList){
145
			CsvStream extensionStream;
146
			try {
147
				extensionStream = getStream(extension, state);
148
				if (extensionStream != null){
149
					streamList.add(extensionStream);
150
				}
151
			} catch (IOException e) {
152
				String message = "Extension stream not available for extension %s: %s";
153
				logger.warn(String.format(message, extension.getUriString(), e.getMessage()));
154
				state.setSuccess(false);
155
			}
156
		}
157
		IReader<CsvStream> result = new ListReader<CsvStream>(streamList);
158
		return result;
159
	}
160

    
161

    
162
	/**
163
	 * Creates the CsvStream for an archive entry. Returns null if archive entry is null.
164
	 * @param archiveEntry
165
	 * @param state 
166
	 * @return
167
	 * @throws IOException
168
	 * @throws UnsupportedEncodingException
169
	 */
170
	private CsvStream makeStream(ArchiveEntryBase archiveEntry, STATE state) throws IOException, UnsupportedEncodingException {
171
		if (archiveEntry == null){
172
			return null;
173
		}
174
				 
175
		char fieldTerminatedBy = StringUtils.isEmpty(archiveEntry.getFieldsTerminatedBy()) ? CSVParser.DEFAULT_SEPARATOR : archiveEntry.getFieldsTerminatedBy().charAt(0);
176
		// default is a kind of 'null' quote, which tells opencsv to ignore the enclosing quotes
177
		char fieldsEnclosedBy = CSVWriter.NO_QUOTE_CHARACTER;
178
		if(state == null || !state.getConfig().isNoQuotes()) {
179
		        fieldsEnclosedBy= StringUtils.isEmpty(archiveEntry.getFieldsEnclosedBy()) ? CSVParser.DEFAULT_QUOTE_CHARACTER: archiveEntry.getFieldsEnclosedBy().charAt(0);
180
		}
181
		boolean ignoreHeader = archiveEntry.getIgnoreHeaderLines();
182
		String linesTerminatedBy = archiveEntry.getLinesTerminatedBy();
183
		String encoding = archiveEntry.getEncoding();
184
		int skipLines = ignoreHeader? 1 : 0;
185
		
186
		String fileLocation = archiveEntry.getFiles().getLocation();
187
		InputStream coreCsvInputStream = makeInputStream(fileLocation);
188
		Reader coreReader = new InputStreamReader(coreCsvInputStream, encoding); 
189
		CSVReader csvReader = new CSVReader(coreReader, fieldTerminatedBy,fieldsEnclosedBy, skipLines);
190
		CsvStream csvStream = new CsvStream(csvReader, archiveEntry, skipLines);
191
		
192
		//		InputStream s;
193
//		s.
194
		
195
		return csvStream;
196
	}
197

    
198

    
199
	private void initArchive() {
200
		if (archive == null){
201
			try {
202
				InputStream metaInputStream = makeInputStream(META_XML);
203
				
204
				JAXBContext jaxbContext = JAXBContext.newInstance("eu.etaxonomy.cdm.io.dwca.jaxb");
205
				Unmarshaller unmarshaller =  jaxbContext.createUnmarshaller();
206
				archive = (Archive)unmarshaller.unmarshal(metaInputStream);
207
	
208
				validateArchive(archive);
209
			} catch (IOException e) {
210
				throw new RuntimeException(e);
211
			} catch (JAXBException e) {
212
				throw new RuntimeException(e);
213
			}
214
		}
215
	}
216

    
217

    
218
	private void validateArchive(Archive archive) {
219
		if (archive.getCore().getFieldsTerminatedBy() != null && archive.getCore().getFieldsTerminatedBy().length() > 1){
220
			if (archive.getCore().getFieldsTerminatedBy().equals("\\t") ){
221
				//TODO handle, TODO also handle other \xxx delimiter
222
			}else{
223
				throw new IllegalStateException("CsvReader does not allow field delimiters with more than 1 character. ");
224
			}
225
		}
226
		if (archive.getCore().getFieldsEnclosedBy() != null && archive.getCore().getFieldsEnclosedBy().length() > 1){
227
			throw new IllegalStateException("CsvReader does not allow field delimiters with more than 1 character");
228
		}
229
		
230
	}
231

    
232
//
233
//	/**
234
//	 * @return
235
//	 * @throws IOException
236
//	 */
237
//	private InputStream makeInputStream(String name) throws IOException {
238
//		
239
//		ZipInputStream zin = new ZipInputStream(dwcaZip.toURL().openStream());
240
//		ZipEntry ze = zin.getNextEntry();
241
//		while (!ze.getName().equals(name)) {
242
//		    zin.closeEntry(); // not sure whether this is necessary
243
//		    ze = zin.getNextEntry();
244
//		}
245
//		
246
//		CheckedInputStream cis = new CheckedInputStream(in, cksum)
247
//		
248
//		InputStream metaInputStream = zip.getInputStream(ze);
249
//		return metaInputStream;
250
//	
251
//		InputStream metaInputStream = zip.getInputStream(metaEntry);
252
//		return metaInputStream;
253
//	}
254
//	
255

    
256
	/**
257
	 * @return
258
	 * @throws IOException
259
	 */
260
	private InputStream makeInputStream(String name) throws IOException {
261
		File file = new File(dwcaZip);
262
		if (! file.isFile() || ! file.exists()){
263
			String message = "URI is not a file: %s";
264
			throw new IOException(String.format(message, dwcaZip.toString()));
265
		}
266
		ZipFile zip = new ZipFile(file, ZipFile.OPEN_READ);
267
		
268
		//Enumeration<? extends ZipEntry> zipEntries = zip.entries();
269
		//ze = new ZipEntry(name);
270
		ZipEntry metaEntry = zip.getEntry(name);
271
		
272
		//Lorna added this to deal with Scratchpads dwca.zip files which when unzipped have a directory dwca/ which contains the files
273
		if (metaEntry == null) {
274
			metaEntry = zip.getEntry("dwca/" + name);
275
		}
276
		if (metaEntry == null){
277
			String message = "Zip entry for %s not available";
278
			throw new IOException(String.format(message, name));
279
		}
280
		InputStream metaInputStream = zip.getInputStream(metaEntry);
281
		return metaInputStream;
282
	}
283

    
284

    
285
	
286
	
287
}
(13-13/35)