1
|
// $Id$
|
2
|
/**
|
3
|
* Copyright (C) 2009 EDIT
|
4
|
* European Distributed Institute of Taxonomy
|
5
|
* http://www.e-taxonomy.eu
|
6
|
*
|
7
|
* The contents of this file are subject to the Mozilla Public License Version 1.1
|
8
|
* See LICENSE.TXT at the top of this package for the full license terms.
|
9
|
*/
|
10
|
package eu.etaxonomy.cdm.ext.scratchpads;
|
11
|
|
12
|
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Iterator;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.http.HttpException;
import org.apache.http.HttpResponse;
import org.apache.log4j.Logger;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.etaxonomy.cdm.common.UriUtils;
|
33
|
|
34
|
|
35
|
/**
|
36
|
* @author l.morris
|
37
|
* @date Jul 2, 2013
|
38
|
*
|
39
|
*/
|
40
|
public class ScratchpadsService {
|
41
|
|
42
|
private static final Logger logger = Logger.getLogger(ScratchpadsService.class);
|
43
|
|
44
|
public static final String SCRATCHPADS_JSON_ENDPOINT = "http://scratchpads.eu/explore/sites-list/json";
|
45
|
|
46
|
private static final char[] ILLEGAL_CHARACTERS = { '/', '\n', '\r', '\t', '\0', '\f', '`', '?', '*', '\\', '<', '>', '|', '\"', ':', '.' };
|
47
|
|
48
|
private static final String dir = "C:\\Users\\l.morris\\Downloads\\dwca_scratchpads\\";
|
49
|
|
50
|
public void harvest(){
|
51
|
|
52
|
InputStream inputStream = null;
|
53
|
|
54
|
try {
|
55
|
URL url = new URL(SCRATCHPADS_JSON_ENDPOINT);
|
56
|
boolean isAvailable = UriUtils.isServiceAvailable(url.toURI());
|
57
|
|
58
|
if (isAvailable) {
|
59
|
inputStream = UriUtils.getInputStream(url.toURI());
|
60
|
}
|
61
|
|
62
|
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "iso-8859-1"), 8);
|
63
|
StringBuilder sb = new StringBuilder();
|
64
|
String line = null;
|
65
|
while ((line = reader.readLine()) != null) {
|
66
|
sb.append(line + "\n");
|
67
|
logger.debug(line);
|
68
|
|
69
|
}
|
70
|
|
71
|
ObjectMapper m = new ObjectMapper();
|
72
|
|
73
|
JsonNode rootNode = m.readTree(sb.toString());
|
74
|
logger.debug(rootNode.toString());
|
75
|
logger.debug(rootNode.isArray());
|
76
|
|
77
|
int num = 0;
|
78
|
if (rootNode.isArray()) {
|
79
|
Iterator<JsonNode> arrayElements = rootNode.elements();
|
80
|
while (arrayElements.hasNext()) {
|
81
|
JsonNode element = arrayElements.next();
|
82
|
JsonNode website = element.path("field_website");
|
83
|
//logger.debug(website.getValueAsText());
|
84
|
String fieldWebsite = website.asText();
|
85
|
|
86
|
if (fieldWebsite.startsWith("http")) {
|
87
|
|
88
|
url = new URL(fieldWebsite + "/dwca.zip");
|
89
|
URI uri = url.toURI();
|
90
|
isAvailable = UriUtils.isServiceAvailable(uri);
|
91
|
logger.debug("Is " + fieldWebsite + " available :" + isAvailable);
|
92
|
|
93
|
String websiteName = "";
|
94
|
//websiteName = (fieldWebsite.toString().split("//")[1]).split(".*")[0];
|
95
|
websiteName = websiteName + fieldWebsite.split("//")[1];
|
96
|
//if (websiteName.contains(".")){
|
97
|
//websiteName = websiteName.substring(0, websiteName.indexOf("."));
|
98
|
websiteName = websiteName.replaceAll("\\.", "_");
|
99
|
//websiteName = websiteName.substring(0, websiteName.indexOf("."));
|
100
|
|
101
|
//}
|
102
|
|
103
|
//logger.debug("the website name " + websiteName);
|
104
|
|
105
|
for (int j = 0; j < ILLEGAL_CHARACTERS.length; j++) {
|
106
|
|
107
|
char ch = '_';
|
108
|
websiteName.replace(ILLEGAL_CHARACTERS[j], ch);
|
109
|
}
|
110
|
|
111
|
websiteName = websiteName.substring(0, websiteName.length());
|
112
|
|
113
|
if (isAvailable) {
|
114
|
|
115
|
HttpResponse response = UriUtils.getResponse(uri, null);
|
116
|
if (UriUtils.isOk(response)) {
|
117
|
|
118
|
|
119
|
logger.debug("There is a dwca " + websiteName);
|
120
|
|
121
|
try {
|
122
|
inputStream = UriUtils.getInputStream(url.toURI());
|
123
|
|
124
|
num++;
|
125
|
|
126
|
if (inputStream != null) {
|
127
|
|
128
|
copyDwcaZip(inputStream, websiteName);
|
129
|
//createDwcaZip(inputStream);
|
130
|
}
|
131
|
|
132
|
} catch (HttpException e) {
|
133
|
// TODO Auto-generated catch block
|
134
|
logger.error("Failed to get dwca for " + websiteName + " as there was an error " + e);
|
135
|
}
|
136
|
|
137
|
}
|
138
|
|
139
|
}
|
140
|
}
|
141
|
}
|
142
|
}
|
143
|
|
144
|
inputStream.close();
|
145
|
|
146
|
|
147
|
} catch (URISyntaxException e) {
|
148
|
throw new RuntimeException(e);
|
149
|
} catch (IOException ie) {
|
150
|
throw new RuntimeException(ie);
|
151
|
} catch (HttpException e) {
|
152
|
throw new RuntimeException(e);
|
153
|
}
|
154
|
}
|
155
|
|
156
|
/**
|
157
|
* FIXME
|
158
|
* This is a hack as dwca.zip files from Scratchpads sites have an extra directory when unzipped. i.e. all the text
|
159
|
* and meta.xml are in the sub-directory dwca, but the should be directly in the top-level unzipped directory
|
160
|
*/
|
161
|
private void createDwcaZip (InputStream inputStream, String websiteName) {
|
162
|
|
163
|
ZipInputStream zis = new ZipInputStream(inputStream);
|
164
|
|
165
|
byte[] buffer = new byte[4096];
|
166
|
ZipEntry ze;
|
167
|
try {
|
168
|
while ((ze = zis.getNextEntry()) != null)
|
169
|
{
|
170
|
System.out.println("Extracting: " + ze);
|
171
|
|
172
|
FileOutputStream fos = new FileOutputStream(ze.getName());
|
173
|
{
|
174
|
int numBytes;
|
175
|
while ((numBytes = zis.read(buffer, 0, buffer.length)) != -1) {
|
176
|
fos.write(buffer, 0, numBytes);
|
177
|
}
|
178
|
}
|
179
|
zis.closeEntry();
|
180
|
}
|
181
|
} catch (FileNotFoundException e) {
|
182
|
// TODO Auto-generated catch block
|
183
|
e.printStackTrace();
|
184
|
} catch (IOException e) {
|
185
|
// TODO Auto-generated catch block
|
186
|
e.printStackTrace();
|
187
|
}
|
188
|
|
189
|
}
|
190
|
|
191
|
/*
|
192
|
* Use this method instead of createDwcaZip, once the dwca.zip structure is fixed in Scratchpads
|
193
|
*/
|
194
|
private void copyDwcaZip (InputStream inputStream, String websiteName) {
|
195
|
|
196
|
FileOutputStream outputStream;
|
197
|
try {
|
198
|
outputStream = new FileOutputStream("dwca_" + websiteName + ".zip");//dir +
|
199
|
|
200
|
byte[] b = new byte[1024];
|
201
|
int count;
|
202
|
while ((count = inputStream.read(b)) >= 0) {
|
203
|
outputStream.write(b, 0, count);
|
204
|
}
|
205
|
outputStream.flush();
|
206
|
outputStream.close();
|
207
|
inputStream.close();
|
208
|
} catch (FileNotFoundException e) {
|
209
|
// TODO Auto-generated catch block
|
210
|
e.printStackTrace();
|
211
|
} catch (IOException ie) {
|
212
|
ie.printStackTrace();
|
213
|
}
|
214
|
}
|
215
|
|
216
|
/**
|
217
|
* @param args
|
218
|
*/
|
219
|
public static void main(String[] args) {
|
220
|
|
221
|
ScratchpadsService spService = new ScratchpadsService();
|
222
|
spService.harvest();
|
223
|
// TODO Auto-generated method stub
|
224
|
|
225
|
}
|
226
|
|
227
|
}
|