/**
 * Copyright (C) 2009 EDIT
 * European Distributed Institute of Taxonomy
 * http://www.e-taxonomy.eu
 *
 * The contents of this file are subject to the Mozilla Public License Version 1.1
 * See LICENSE.TXT at the top of this package for the full license terms.
 */
package eu.etaxonomy.cdm.ext.scratchpads;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Iterator;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.http.HttpException;
import org.apache.http.HttpResponse;
import org.apache.log4j.Logger;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.etaxonomy.cdm.common.UriUtils;

/**
|
35
|
* @author l.morris
|
36
|
* @date Jul 2, 2013
|
37
|
*
|
38
|
*/
|
39
|
public class ScratchpadsService {
|
40
|
|
41
|
private static final Logger logger = Logger.getLogger(ScratchpadsService.class);
|
42
|
|
43
|
public static final String SCRATCHPADS_JSON_ENDPOINT = "http://scratchpads.eu/explore/sites-list/json";
|
44
|
|
45
|
private static final char[] ILLEGAL_CHARACTERS = { '/', '\n', '\r', '\t', '\0', '\f', '`', '?', '*', '\\', '<', '>', '|', '\"', ':', '.' };
|
46
|
|
47
|
private static final String dir = "C:\\Users\\l.morris\\Downloads\\dwca_scratchpads\\";
|
48
|
|
49
|
public void harvest(){
|
50
|
|
51
|
InputStream inputStream = null;
|
52
|
|
53
|
try {
|
54
|
URL url = new URL(SCRATCHPADS_JSON_ENDPOINT);
|
55
|
boolean isAvailable = UriUtils.isServiceAvailable(url.toURI());
|
56
|
|
57
|
if (isAvailable) {
|
58
|
inputStream = UriUtils.getInputStream(url.toURI());
|
59
|
}
|
60
|
|
61
|
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "iso-8859-1"), 8);
|
62
|
StringBuilder sb = new StringBuilder();
|
63
|
String line = null;
|
64
|
while ((line = reader.readLine()) != null) {
|
65
|
sb.append(line + "\n");
|
66
|
logger.debug(line);
|
67
|
|
68
|
}
|
69
|
|
70
|
ObjectMapper m = new ObjectMapper();
|
71
|
|
72
|
JsonNode rootNode = m.readTree(sb.toString());
|
73
|
logger.debug(rootNode.toString());
|
74
|
logger.debug(rootNode.isArray());
|
75
|
|
76
|
int num = 0;
|
77
|
if (rootNode.isArray()) {
|
78
|
Iterator<JsonNode> arrayElements = rootNode.elements();
|
79
|
while (arrayElements.hasNext()) {
|
80
|
JsonNode element = arrayElements.next();
|
81
|
JsonNode website = element.path("field_website");
|
82
|
//logger.debug(website.getValueAsText());
|
83
|
String fieldWebsite = website.asText();
|
84
|
|
85
|
if (fieldWebsite.startsWith("http")) {
|
86
|
|
87
|
url = new URL(fieldWebsite + "/dwca.zip");
|
88
|
URI uri = url.toURI();
|
89
|
isAvailable = UriUtils.isServiceAvailable(uri);
|
90
|
logger.debug("Is " + fieldWebsite + " available :" + isAvailable);
|
91
|
|
92
|
String websiteName = "";
|
93
|
//websiteName = (fieldWebsite.toString().split("//")[1]).split(".*")[0];
|
94
|
websiteName = websiteName + fieldWebsite.split("//")[1];
|
95
|
//if (websiteName.contains(".")){
|
96
|
//websiteName = websiteName.substring(0, websiteName.indexOf("."));
|
97
|
websiteName = websiteName.replaceAll("\\.", "_");
|
98
|
//websiteName = websiteName.substring(0, websiteName.indexOf("."));
|
99
|
|
100
|
//}
|
101
|
|
102
|
//logger.debug("the website name " + websiteName);
|
103
|
|
104
|
for (int j = 0; j < ILLEGAL_CHARACTERS.length; j++) {
|
105
|
|
106
|
char ch = '_';
|
107
|
websiteName.replace(ILLEGAL_CHARACTERS[j], ch);
|
108
|
}
|
109
|
|
110
|
websiteName = websiteName.substring(0, websiteName.length());
|
111
|
|
112
|
if (isAvailable) {
|
113
|
|
114
|
HttpResponse response = UriUtils.getResponse(uri, null);
|
115
|
if (UriUtils.isOk(response)) {
|
116
|
|
117
|
|
118
|
logger.debug("There is a dwca " + websiteName);
|
119
|
|
120
|
try {
|
121
|
inputStream = UriUtils.getInputStream(url.toURI());
|
122
|
|
123
|
num++;
|
124
|
|
125
|
if (inputStream != null) {
|
126
|
|
127
|
copyDwcaZip(inputStream, websiteName);
|
128
|
//createDwcaZip(inputStream);
|
129
|
}
|
130
|
|
131
|
} catch (HttpException e) {
|
132
|
// TODO Auto-generated catch block
|
133
|
logger.error("Failed to get dwca for " + websiteName + " as there was an error " + e);
|
134
|
}
|
135
|
|
136
|
}
|
137
|
|
138
|
}
|
139
|
}
|
140
|
}
|
141
|
}
|
142
|
|
143
|
inputStream.close();
|
144
|
|
145
|
|
146
|
} catch (URISyntaxException e) {
|
147
|
throw new RuntimeException(e);
|
148
|
} catch (IOException ie) {
|
149
|
throw new RuntimeException(ie);
|
150
|
} catch (HttpException e) {
|
151
|
throw new RuntimeException(e);
|
152
|
}
|
153
|
}
|
154
|
|
155
|
/**
|
156
|
* FIXME
|
157
|
* This is a hack as dwca.zip files from Scratchpads sites have an extra directory when unzipped. i.e. all the text
|
158
|
* and meta.xml are in the sub-directory dwca, but the should be directly in the top-level unzipped directory
|
159
|
*/
|
160
|
private void createDwcaZip (InputStream inputStream, String websiteName) {
|
161
|
|
162
|
ZipInputStream zis = new ZipInputStream(inputStream);
|
163
|
|
164
|
byte[] buffer = new byte[4096];
|
165
|
ZipEntry ze;
|
166
|
try {
|
167
|
while ((ze = zis.getNextEntry()) != null)
|
168
|
{
|
169
|
System.out.println("Extracting: " + ze);
|
170
|
|
171
|
FileOutputStream fos = new FileOutputStream(ze.getName());
|
172
|
{
|
173
|
int numBytes;
|
174
|
while ((numBytes = zis.read(buffer, 0, buffer.length)) != -1) {
|
175
|
fos.write(buffer, 0, numBytes);
|
176
|
}
|
177
|
}
|
178
|
zis.closeEntry();
|
179
|
}
|
180
|
} catch (FileNotFoundException e) {
|
181
|
// TODO Auto-generated catch block
|
182
|
e.printStackTrace();
|
183
|
} catch (IOException e) {
|
184
|
// TODO Auto-generated catch block
|
185
|
e.printStackTrace();
|
186
|
}
|
187
|
|
188
|
}
|
189
|
|
190
|
/*
|
191
|
* Use this method instead of createDwcaZip, once the dwca.zip structure is fixed in Scratchpads
|
192
|
*/
|
193
|
private void copyDwcaZip (InputStream inputStream, String websiteName) {
|
194
|
|
195
|
FileOutputStream outputStream;
|
196
|
try {
|
197
|
outputStream = new FileOutputStream("dwca_" + websiteName + ".zip");//dir +
|
198
|
|
199
|
byte[] b = new byte[1024];
|
200
|
int count;
|
201
|
while ((count = inputStream.read(b)) >= 0) {
|
202
|
outputStream.write(b, 0, count);
|
203
|
}
|
204
|
outputStream.flush();
|
205
|
outputStream.close();
|
206
|
inputStream.close();
|
207
|
} catch (FileNotFoundException e) {
|
208
|
// TODO Auto-generated catch block
|
209
|
e.printStackTrace();
|
210
|
} catch (IOException ie) {
|
211
|
ie.printStackTrace();
|
212
|
}
|
213
|
}
|
214
|
|
215
|
/**
|
216
|
* @param args
|
217
|
*/
|
218
|
public static void main(String[] args) {
|
219
|
|
220
|
ScratchpadsService spService = new ScratchpadsService();
|
221
|
spService.harvest();
|
222
|
// TODO Auto-generated method stub
|
223
|
|
224
|
}
|
225
|
|
226
|
}
|