Project

General

Profile

Download (6.4 KB) Statistics
| Branch: | Tag: | Revision:
1
// $Id$
2
/**
3
 * Copyright (C) 2009 EDIT
4
 * European Distributed Institute of Taxonomy 
5
 * http://www.e-taxonomy.eu
6
 * 
7
 * The contents of this file are subject to the Mozilla Public License Version 1.1
8
 * See LICENSE.TXT at the top of this package for the full license terms.
9
 */
10
package eu.etaxonomy.cdm.ext.scratchpads;
11

    
12
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Iterator;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.http.HttpException;
import org.apache.http.HttpResponse;
import org.apache.log4j.Logger;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

import eu.etaxonomy.cdm.common.UriUtils;
32

    
33

    
34
/**
35
 * @author l.morris
36
 * @date Jul 2, 2013
37
 *
38
 */
39
public class ScratchpadsService {
40

    
41
	private static final Logger logger = Logger.getLogger(ScratchpadsService.class);
42

    
43
	public static final String SCRATCHPADS_JSON_ENDPOINT = "http://scratchpads.eu/explore/sites-list/json";
44

    
45
	private static final char[] ILLEGAL_CHARACTERS = { '/', '\n', '\r', '\t', '\0', '\f', '`', '?', '*', '\\', '<', '>', '|', '\"', ':', '.' };
46

    
47
	private static final String dir = "C:\\Users\\l.morris\\Downloads\\dwca_scratchpads\\";
48
	
49
	public void harvest(){
50

    
51
		InputStream inputStream = null;
52
		
53
		try {
54
			URL url = new URL(SCRATCHPADS_JSON_ENDPOINT);
55
			boolean isAvailable = UriUtils.isServiceAvailable(url.toURI());
56

    
57
			if (isAvailable) {
58
				inputStream = UriUtils.getInputStream(url.toURI());
59
			}
60

    
61
			BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "iso-8859-1"), 8);
62
			StringBuilder sb = new StringBuilder();
63
			String line = null;
64
			while ((line = reader.readLine()) != null) {
65
				sb.append(line + "\n");
66
				logger.debug(line);
67
				
68
			}
69

    
70
			ObjectMapper m = new ObjectMapper();			
71
			
72
			JsonNode rootNode = m.readTree(sb.toString());
73
			logger.debug(rootNode.toString());
74
			logger.debug(rootNode.isArray());
75
			
76
			int num = 0;
77
			if (rootNode.isArray()) {
78
				Iterator<JsonNode> arrayElements = rootNode.getElements();
79
				while (arrayElements.hasNext()) {
80
					JsonNode element = arrayElements.next();
81
					JsonNode website = element.path("field_website");
82
					//logger.debug(website.getValueAsText());
83
					String fieldWebsite = website.getValueAsText();
84
					
85
					if (fieldWebsite.startsWith("http")) {
86

    
87
						url = new URL(fieldWebsite + "/dwca.zip");
88
						URI uri = url.toURI();
89
						isAvailable = UriUtils.isServiceAvailable(uri);
90
						logger.debug("Is " + fieldWebsite + " available :" + isAvailable);
91

    
92
						String websiteName = "";
93
						//websiteName = (fieldWebsite.toString().split("//")[1]).split(".*")[0];
94
						websiteName = websiteName + fieldWebsite.split("//")[1];
95
						//if (websiteName.contains(".")){
96
							//websiteName = websiteName.substring(0, websiteName.indexOf("."));
97
						websiteName = websiteName.replaceAll("\\.", "_");
98
							//websiteName = websiteName.substring(0, websiteName.indexOf("."));
99

    
100
						//} 
101

    
102
						//logger.debug("the website name " + websiteName);
103

    
104
						for (int j = 0; j < ILLEGAL_CHARACTERS.length; j++) {
105

    
106
							char ch = '_';
107
							websiteName.replace(ILLEGAL_CHARACTERS[j], ch);
108
						}
109

    
110
						websiteName = websiteName.substring(0, websiteName.length());
111

    
112
						if (isAvailable) {
113

    
114
							HttpResponse response = UriUtils.getResponse(uri, null);
115
							if (UriUtils.isOk(response)) {
116

    
117

    
118
								logger.debug("There is a dwca " + websiteName);
119

    
120
								try {
121
									inputStream = UriUtils.getInputStream(url.toURI());
122

    
123
									num++;
124

    
125
									if (inputStream != null) {
126

    
127
										copyDwcaZip(inputStream, websiteName);
128
										//createDwcaZip(inputStream);
129
									}
130
									
131
								} catch (HttpException e) {
132
									// TODO Auto-generated catch block
133
									logger.error("Failed to get dwca for " + websiteName + " as there was an error " + e);
134
								} 
135

    
136
							}
137

    
138
						}
139
					}
140
				}
141
			}
142

    
143
			inputStream.close();
144

    
145

    
146
		} catch (URISyntaxException e) {
147
			// TODO Auto-generated catch block
148
			e.printStackTrace();
149
		} catch (IOException ie) {
150
			ie.printStackTrace();
151
		} catch (HttpException e) {
152
			// TODO Auto-generated catch block
153
			e.printStackTrace();
154
		} 
155
	}
156
	
157
	/**
158
	 * FIXME
159
	 * This is a hack as dwca.zip files from Scratchpads sites have an extra directory when unzipped. i.e. all the text
160
	 * and meta.xml are in the sub-directory dwca, but the should be directly in the top-level unzipped directory
161
	 */
162
	private void createDwcaZip (InputStream inputStream, String websiteName) {
163
		
164
		 ZipInputStream zis = new ZipInputStream(inputStream);
165
	    
166
	         byte[] buffer = new byte[4096];
167
	         ZipEntry ze;
168
	         try {
169
				while ((ze = zis.getNextEntry()) != null)
170
				 {
171
				    System.out.println("Extracting: " + ze);
172
				    
173
				    FileOutputStream fos = new FileOutputStream(ze.getName());
174
				    {
175
				       int numBytes;
176
				       while ((numBytes = zis.read(buffer, 0, buffer.length)) != -1)
177
				          fos.write(buffer, 0, numBytes);
178
				    }
179
				    zis.closeEntry();
180
				 }
181
			} catch (FileNotFoundException e) {
182
				// TODO Auto-generated catch block
183
				e.printStackTrace();
184
			} catch (IOException e) {
185
				// TODO Auto-generated catch block
186
				e.printStackTrace();
187
			}		
188
		
189
	}
190
	
191
	/*
192
	 * Use this method instead of createDwcaZip, once the dwca.zip structure is fixed in Scratchpads
193
	 */
194
	private void copyDwcaZip (InputStream inputStream, String websiteName) {
195
		
196
		FileOutputStream outputStream;
197
		try {
198
			outputStream = new FileOutputStream("dwca_" + websiteName + ".zip");//dir + 
199

    
200
			byte[] b = new byte[1024];
201
			int count;
202
			while ((count = inputStream.read(b)) >= 0) {
203
				outputStream.write(b, 0, count);
204
			}
205
			outputStream.flush(); 
206
			outputStream.close(); 
207
			inputStream.close();   
208
		} catch (FileNotFoundException e) {
209
			// TODO Auto-generated catch block
210
			e.printStackTrace();
211
		} catch (IOException ie) {
212
			ie.printStackTrace();
213
		} 
214
	}
215

    
216
	/**
217
	 * @param args
218
	 */
219
	public static void main(String[] args) {
220

    
221
		ScratchpadsService spService = new ScratchpadsService();
222
		spService.harvest();
223
		// TODO Auto-generated method stub
224

    
225
	}
226

    
227
}
    (1-1/1)