Project

General

Profile

Download (6.12 KB) Statistics
| Branch: | Tag: | Revision:
1
/**
 * Copyright (C) 2009 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
9
package eu.etaxonomy.cdm.ext.scratchpads;
10

    
11
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Iterator;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.http.HttpException;
import org.apache.http.HttpResponse;
import org.apache.log4j.Logger;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.etaxonomy.cdm.common.UriUtils;
32

    
33

    
34
/**
35
 * @author l.morris
36
 * @date Jul 2, 2013
37
 *
38
 */
39
public class ScratchpadsService {
40

    
41
	private static final Logger logger = Logger.getLogger(ScratchpadsService.class);
42

    
43
	public static final String SCRATCHPADS_JSON_ENDPOINT = "http://scratchpads.eu/explore/sites-list/json";
44

    
45
	private static final char[] ILLEGAL_CHARACTERS = { '/', '\n', '\r', '\t', '\0', '\f', '`', '?', '*', '\\', '<', '>', '|', '\"', ':', '.' };
46

    
47
	private static final String dir = "C:\\Users\\l.morris\\Downloads\\dwca_scratchpads\\";
48

    
49
	public void harvest(){
50

    
51
		InputStream inputStream = null;
52

    
53
		try {
54
			URL url = new URL(SCRATCHPADS_JSON_ENDPOINT);
55
			boolean isAvailable = UriUtils.isServiceAvailable(url.toURI());
56

    
57
			if (isAvailable) {
58
				inputStream = UriUtils.getInputStream(url.toURI());
59
			}
60

    
61
			BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "iso-8859-1"), 8);
62
			StringBuilder sb = new StringBuilder();
63
			String line = null;
64
			while ((line = reader.readLine()) != null) {
65
				sb.append(line + "\n");
66
				logger.debug(line);
67

    
68
			}
69

    
70
			ObjectMapper m = new ObjectMapper();
71

    
72
			JsonNode rootNode = m.readTree(sb.toString());
73
			logger.debug(rootNode.toString());
74
			logger.debug(rootNode.isArray());
75

    
76
			int num = 0;
77
			if (rootNode.isArray()) {
78
				Iterator<JsonNode> arrayElements = rootNode.elements();
79
				while (arrayElements.hasNext()) {
80
					JsonNode element = arrayElements.next();
81
					JsonNode website = element.path("field_website");
82
					//logger.debug(website.getValueAsText());
83
					String fieldWebsite = website.asText();
84

    
85
					if (fieldWebsite.startsWith("http")) {
86

    
87
						url = new URL(fieldWebsite + "/dwca.zip");
88
						URI uri = url.toURI();
89
						isAvailable = UriUtils.isServiceAvailable(uri);
90
						logger.debug("Is " + fieldWebsite + " available :" + isAvailable);
91

    
92
						String websiteName = "";
93
						//websiteName = (fieldWebsite.toString().split("//")[1]).split(".*")[0];
94
						websiteName = websiteName + fieldWebsite.split("//")[1];
95
						//if (websiteName.contains(".")){
96
							//websiteName = websiteName.substring(0, websiteName.indexOf("."));
97
						websiteName = websiteName.replaceAll("\\.", "_");
98
							//websiteName = websiteName.substring(0, websiteName.indexOf("."));
99

    
100
						//}
101

    
102
						//logger.debug("the website name " + websiteName);
103

    
104
						for (int j = 0; j < ILLEGAL_CHARACTERS.length; j++) {
105

    
106
							char ch = '_';
107
							websiteName.replace(ILLEGAL_CHARACTERS[j], ch);
108
						}
109

    
110
						websiteName = websiteName.substring(0, websiteName.length());
111

    
112
						if (isAvailable) {
113

    
114
							HttpResponse response = UriUtils.getResponse(uri, null);
115
							if (UriUtils.isOk(response)) {
116

    
117

    
118
								logger.debug("There is a dwca " + websiteName);
119

    
120
								try {
121
									inputStream = UriUtils.getInputStream(url.toURI());
122

    
123
									num++;
124

    
125
									if (inputStream != null) {
126

    
127
										copyDwcaZip(inputStream, websiteName);
128
										//createDwcaZip(inputStream);
129
									}
130

    
131
								} catch (HttpException e) {
132
									// TODO Auto-generated catch block
133
									logger.error("Failed to get dwca for " + websiteName + " as there was an error " + e);
134
								}
135

    
136
							}
137

    
138
						}
139
					}
140
				}
141
			}
142

    
143
			inputStream.close();
144

    
145

    
146
		} catch (URISyntaxException e) {
147
			throw new RuntimeException(e);
148
		} catch (IOException ie) {
149
            throw new RuntimeException(ie);
150
		} catch (HttpException e) {
151
            throw new RuntimeException(e);
152
		}
153
	}
154

    
155
	/**
156
	 * FIXME
157
	 * This is a hack as dwca.zip files from Scratchpads sites have an extra directory when unzipped. i.e. all the text
158
	 * and meta.xml are in the sub-directory dwca, but the should be directly in the top-level unzipped directory
159
	 */
160
	private void createDwcaZip (InputStream inputStream, String websiteName) {
161

    
162
		 ZipInputStream zis = new ZipInputStream(inputStream);
163

    
164
	         byte[] buffer = new byte[4096];
165
	         ZipEntry ze;
166
	         try {
167
				while ((ze = zis.getNextEntry()) != null)
168
				 {
169
				    System.out.println("Extracting: " + ze);
170

    
171
				    FileOutputStream fos = new FileOutputStream(ze.getName());
172
				    {
173
				       int numBytes;
174
				       while ((numBytes = zis.read(buffer, 0, buffer.length)) != -1) {
175
                        fos.write(buffer, 0, numBytes);
176
                    }
177
				    }
178
				    zis.closeEntry();
179
				 }
180
			} catch (FileNotFoundException e) {
181
				// TODO Auto-generated catch block
182
				e.printStackTrace();
183
			} catch (IOException e) {
184
				// TODO Auto-generated catch block
185
				e.printStackTrace();
186
			}
187

    
188
	}
189

    
190
	/*
191
	 * Use this method instead of createDwcaZip, once the dwca.zip structure is fixed in Scratchpads
192
	 */
193
	private void copyDwcaZip (InputStream inputStream, String websiteName) {
194

    
195
		FileOutputStream outputStream;
196
		try {
197
			outputStream = new FileOutputStream("dwca_" + websiteName + ".zip");//dir +
198

    
199
			byte[] b = new byte[1024];
200
			int count;
201
			while ((count = inputStream.read(b)) >= 0) {
202
				outputStream.write(b, 0, count);
203
			}
204
			outputStream.flush();
205
			outputStream.close();
206
			inputStream.close();
207
		} catch (FileNotFoundException e) {
208
			// TODO Auto-generated catch block
209
			e.printStackTrace();
210
		} catch (IOException ie) {
211
			ie.printStackTrace();
212
		}
213
	}
214

    
215
	/**
216
	 * @param args
217
	 */
218
	public static void main(String[] args) {
219

    
220
		ScratchpadsService spService = new ScratchpadsService();
221
		spService.harvest();
222
		// TODO Auto-generated method stub
223

    
224
	}
225

    
226
}
    (1-1/1)