1
|
package eu.etaxonomy.cdm.io.taxonx2013;
|
2
|
|
3
|
import java.net.URI;
|
4
|
import java.net.URISyntaxException;
|
5
|
import java.util.ArrayList;
|
6
|
import java.util.HashMap;
|
7
|
import java.util.List;
|
8
|
import java.util.Map;
|
9
|
import java.util.UUID;
|
10
|
|
11
|
import org.apache.commons.lang.StringUtils;
|
12
|
import org.apache.log4j.Logger;
|
13
|
import org.w3c.dom.NamedNodeMap;
|
14
|
import org.w3c.dom.Node;
|
15
|
import org.w3c.dom.NodeList;
|
16
|
|
17
|
import eu.etaxonomy.cdm.common.DOI;
|
18
|
import eu.etaxonomy.cdm.model.agent.Person;
|
19
|
import eu.etaxonomy.cdm.model.agent.Team;
|
20
|
import eu.etaxonomy.cdm.model.common.TimePeriod;
|
21
|
import eu.etaxonomy.cdm.model.reference.Reference;
|
22
|
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
|
23
|
import eu.etaxonomy.cdm.model.reference.ReferenceType;
|
24
|
import eu.etaxonomy.cdm.strategy.parser.TimePeriodParser;
|
25
|
|
26
|
public class TaxonXModsExtractor extends TaxonXExtractor{
|
27
|
|
28
|
private final Map<String,UUID> personMap = new HashMap<String, UUID>();
|
29
|
|
30
|
private final Logger logger = Logger.getLogger(getClass());
|
31
|
|
32
|
|
33
|
private final String AUTHOR = "author";
|
34
|
private final String EDITOR = "editor";
|
35
|
|
36
|
/**
 * Creates an extractor for MODS metadata that works on behalf of the given import.
 *
 * @param importer the running TaxonX import; it is stored in the {@code importer} field
 *        (not declared in this class) and used to reach the agent and reference services
 */
public TaxonXModsExtractor(TaxonXImport importer) {
    super();
    this.importer = importer;
}
|
42
|
|
43
|
public Reference<?> extractMods(Node node){
|
44
|
|
45
|
//TODO needed? currently only filled but never read
|
46
|
Map<String, String> modsMap = new HashMap<String, String>();
|
47
|
NodeList children = node.getChildNodes();
|
48
|
List<String> roleList = new ArrayList<String>();
|
49
|
String content="";
|
50
|
|
51
|
Reference<?> ref = tryMakeReferenceByClassification(children);
|
52
|
|
53
|
if (ref == null){
|
54
|
// int reftype = askQuestion("What kind of reference is it?\n 1: Generic\n 2: Book\n 3: Article\n" +
|
55
|
// " 4 : BookSection\n 5 : Journal\n 6 : Printseries\n 7: Thesis ");
|
56
|
int reftype=1;
|
57
|
|
58
|
ref = getReferenceWithType(reftype);
|
59
|
}
|
60
|
handleModsNames(children, ref);
|
61
|
|
62
|
for (int i=0; i<children.getLength();i++){
|
63
|
Node modsChildNode = children.item(i);
|
64
|
String modsChildNodeName = modsChildNode.getNodeName();
|
65
|
if (modsChildNodeName.equalsIgnoreCase("mods:titleinfo")){
|
66
|
NodeList tmp = modsChildNode.getChildNodes();
|
67
|
for (int j=0;j<tmp.getLength();j++){
|
68
|
if (tmp.item(j).getNodeName().equalsIgnoreCase("mods:title")) {
|
69
|
content=tmp.item(j).getTextContent().trim();
|
70
|
if (!content.isEmpty()) {
|
71
|
modsMap.put("mainTitle",content);
|
72
|
// ref.setTitleCache(content,true);
|
73
|
ref.setTitle(content);
|
74
|
// ref.generateTitle();
|
75
|
}
|
76
|
}
|
77
|
}
|
78
|
}else if (modsChildNodeName.equalsIgnoreCase("mods:name")){
|
79
|
//handled separately
|
80
|
}else if (modsChildNodeName.equalsIgnoreCase("mods:typeofresource")){
|
81
|
content = modsChildNode.getTextContent().trim();
|
82
|
if (!content.isEmpty()) {
|
83
|
modsMap.put("typeofresource",content);
|
84
|
}
|
85
|
}else if (modsChildNodeName.equalsIgnoreCase("mods:identifier")){
|
86
|
content = modsChildNode.getTextContent().trim();
|
87
|
if (!content.isEmpty()) {
|
88
|
modsMap.put(modsChildNode.getAttributes().getNamedItem("type").getNodeValue(),content);
|
89
|
if (modsChildNode.getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("isbn")) {
|
90
|
ref.setIsbn(content);
|
91
|
}else if (modsChildNode.getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("issn")) {
|
92
|
ref.setIssn(content);
|
93
|
}else if (modsChildNode.getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("DOI")) {
|
94
|
try {
|
95
|
ref.setDoi(DOI.fromString(content));
|
96
|
} catch (IllegalArgumentException e) {
|
97
|
logger.warn(content + " is not a vaild DOI");
|
98
|
}
|
99
|
}else if (modsChildNode.getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("GenericHash")) {
|
100
|
ref.setIssn("GenericHash: "+content);
|
101
|
try {
|
102
|
ref.setUri(new URI("http://plazi.cs.umb.edu/GgServer/search?MODS.ModsDocID="+content));
|
103
|
} catch (URISyntaxException e) {
|
104
|
// TODO Auto-generated catch block
|
105
|
e.printStackTrace();
|
106
|
}
|
107
|
}else{
|
108
|
logger.info("identifier " + modsChildNode.getAttributes().getNamedItem("type").getNodeValue() + " not yet handled.");
|
109
|
}
|
110
|
}
|
111
|
}else if (modsChildNodeName.equalsIgnoreCase("mods:location")){
|
112
|
NodeList tmp = modsChildNode.getChildNodes();
|
113
|
for (int j=0;j<tmp.getLength();j++){
|
114
|
// System.out.println("Child of mods:location: "+tmp.item(j).getNodeName());
|
115
|
if (tmp.item(j).getNodeName().equalsIgnoreCase("mods:url")) {
|
116
|
content = tmp.item(j).getTextContent().trim();
|
117
|
if (!content.isEmpty() && (content != "http://un.availab.le")) {
|
118
|
modsMap.put("url",content);
|
119
|
ref.setUri(URI.create(content));
|
120
|
}
|
121
|
}
|
122
|
}
|
123
|
}
|
124
|
else if (modsChildNodeName.equalsIgnoreCase("mods:relatedItem")){
|
125
|
addRelatedMods(modsChildNode, modsMap, ref);
|
126
|
}else if (modsChildNodeName.equalsIgnoreCase("mods:classification")){
|
127
|
//already handled before
|
128
|
}else if (modsChildNodeName.equalsIgnoreCase("#text") && modsChildNode.getTextContent().matches("\\s*")){
|
129
|
//already handled before
|
130
|
}else{
|
131
|
logger.warn("mods item not recognized yet: " + modsChildNodeName);
|
132
|
}
|
133
|
|
134
|
|
135
|
}
|
136
|
modsMap.put("people",StringUtils.join(roleList.toArray(),SPLITTER));
|
137
|
|
138
|
List<Reference> references = importer.getReferenceService().list(Reference.class, 0, 0, null, null);
|
139
|
for(Reference<?> refe:references){
|
140
|
if (refe.getCitation().equalsIgnoreCase(ref.getCitation())) {
|
141
|
ref=refe;
|
142
|
}
|
143
|
}
|
144
|
// System.out.println(modsMap);
|
145
|
//
|
146
|
// System.out.println("REFERENCE "+ref.getCitation());
|
147
|
// System.out.println("REFERENCE "+ref.getTitle());
|
148
|
// System.out.println("REFERENCE "+ref.getTitleCache());
|
149
|
return ref;
|
150
|
}
|
151
|
|
152
|
private void handleModsNames(NodeList children, Reference<?> ref) {
|
153
|
|
154
|
List<String> roleList = new ArrayList<String>();
|
155
|
|
156
|
List<Person> persons = new ArrayList<Person>();
|
157
|
List<String> editors= new ArrayList<String>();
|
158
|
|
159
|
|
160
|
//handle all mods:name
|
161
|
for ( int i = 0; i<children.getLength(); i++){
|
162
|
if (children.item(i).getNodeName().equalsIgnoreCase("mods:name")){
|
163
|
NamedNodeMap attributeMap = children.item(i).getAttributes();
|
164
|
if ((attributeMap.getNamedItem("type") != null) && attributeMap.getNamedItem("type").getNodeValue().equalsIgnoreCase("personal")) {
|
165
|
handleNameTypePersonal(children.item(i), roleList, persons, editors);
|
166
|
} else if (attributeMap.getNamedItem("type") == null){
|
167
|
logger.warn("mods:name attribute 'type' is missing. Name not handled");
|
168
|
}else {
|
169
|
logger.warn("mods:name 'type' " + attributeMap.getNamedItem("type").getNodeValue() + " not yet supported");
|
170
|
}
|
171
|
}
|
172
|
}
|
173
|
//evaluate authors and editors
|
174
|
if (persons.size()>0){
|
175
|
if (ref == null){
|
176
|
logger.warn("mods:name exists but reference is null");
|
177
|
}else if (persons.size()==1){
|
178
|
ref.setAuthorship(persons.get(0));
|
179
|
}
|
180
|
else{
|
181
|
Team authorship = Team.NewInstance();
|
182
|
for (Person pers:persons){
|
183
|
authorship.addTeamMember(pers);
|
184
|
}
|
185
|
|
186
|
if (!personMap.containsKey(authorship.getTitleCache()) && (authorship.getTeamMembers().size()>0)){
|
187
|
UUID uuid = importer.getAgentService().saveOrUpdate(authorship);
|
188
|
personMap.put(authorship.getTitleCache(),uuid);
|
189
|
}else{
|
190
|
if(authorship.getTeamMembers().size()>1) {
|
191
|
UUID uuid = personMap.get(authorship.getTitleCache());
|
192
|
authorship = (Team) importer.getAgentService().find(uuid);
|
193
|
}
|
194
|
}
|
195
|
|
196
|
ref.setAuthorship(authorship);
|
197
|
}
|
198
|
if (editors.size()>0) {
|
199
|
ref.setEditor(StringUtils.join(editors,", "));
|
200
|
}
|
201
|
}
|
202
|
}
|
203
|
|
204
|
/**
|
205
|
* Extracts the reference with correct type from mods:classification.
|
206
|
* Incomplete implementation. Will be filled whenever new cases show up
|
207
|
* @param children list of all children of mods:mods
|
208
|
* @return
|
209
|
*/
|
210
|
//http://www.loc.gov/standards/mods/userguide/classification.html
|
211
|
private Reference<?> tryMakeReferenceByClassification(NodeList children) {
|
212
|
for (int i=0; i<children.getLength();i++){
|
213
|
if (children.item(i).getNodeName().equalsIgnoreCase("mods:classification")){
|
214
|
Node classificationNode = children.item(i);
|
215
|
String text = classificationNode.getTextContent();
|
216
|
if ("journal article".equals(text)){
|
217
|
return ReferenceFactory.newArticle();
|
218
|
}else if ("book".equals(text)){
|
219
|
return ReferenceFactory.newBook();
|
220
|
}else{
|
221
|
if (StringUtils.isNotBlank(text)){
|
222
|
logger.warn("mods:classification could not be recognized: " + text);
|
223
|
}else{
|
224
|
logger.warn("mods:classification has not text. ");
|
225
|
}
|
226
|
}
|
227
|
}
|
228
|
}
|
229
|
return null;
|
230
|
}
|
231
|
|
232
|
|
233
|
private void handleNameTypePersonal(Node node, List<String> roleList, List<Person> persons, List<String> editors) {
|
234
|
boolean newRole=false;
|
235
|
String content="";
|
236
|
String role =null;
|
237
|
|
238
|
List<String> nameParts = new ArrayList<String>();
|
239
|
|
240
|
NodeList tmp = node.getChildNodes();
|
241
|
for (int j=0;j<tmp.getLength();j++){
|
242
|
|
243
|
if (tmp.item(j).getNodeName().equalsIgnoreCase("mods:namePart")) {
|
244
|
content=tmp.item(j).getTextContent().trim();
|
245
|
if (! content.isEmpty()) {
|
246
|
nameParts.add(content);
|
247
|
}
|
248
|
} else if (tmp.item(j).getNodeName().equalsIgnoreCase("mods:role")) {
|
249
|
NodeList roleChildren = tmp.item(j).getChildNodes();
|
250
|
for (int k=0; k< roleChildren.getLength();k++){
|
251
|
if (roleChildren.item(k).getNodeName().equalsIgnoreCase("mods:roleTerm")){
|
252
|
content = roleChildren.item(k).getTextContent().trim();
|
253
|
if (!content.isEmpty()) {
|
254
|
roleList.add(content);
|
255
|
// p.setNomenclaturalTitle(content);
|
256
|
if (content.equalsIgnoreCase(EDITOR)) {
|
257
|
role=EDITOR;
|
258
|
}
|
259
|
else if (content.equalsIgnoreCase(AUTHOR)) {
|
260
|
role=AUTHOR;
|
261
|
}
|
262
|
newRole=true;
|
263
|
}
|
264
|
}
|
265
|
}
|
266
|
}
|
267
|
|
268
|
}
|
269
|
|
270
|
Person p=null;
|
271
|
if (! nameParts.isEmpty()){
|
272
|
p = Person.NewInstance();
|
273
|
p.setTitleCache(StringUtils.join(nameParts.toArray(), " "), true);
|
274
|
}
|
275
|
|
276
|
if (newRole){
|
277
|
if ((p!=null) && role.equals(AUTHOR)) {
|
278
|
UUID uuid = null;
|
279
|
if (!personMap.containsKey(p.getTitleCache())){
|
280
|
uuid = importer.getAgentService().saveOrUpdate(p);
|
281
|
p = (Person) importer.getAgentService().find(uuid);
|
282
|
personMap.put(p.getTitleCache(),uuid);
|
283
|
}else{
|
284
|
uuid = personMap.get(p.getTitleCache());
|
285
|
p = (Person) importer.getAgentService().find(uuid);
|
286
|
}
|
287
|
// logger.info("ADD PERSON "+p);
|
288
|
persons.add(p);
|
289
|
}
|
290
|
else if ((p!=null) && role.equals(EDITOR)) {
|
291
|
editors.add(p.getTitleCache());
|
292
|
}
|
293
|
}
|
294
|
}
|
295
|
|
296
|
/**
|
297
|
* @param item
|
298
|
* @param modsMap
|
299
|
*/
|
300
|
private void addRelatedMods(Node node, Map<String, String> modsMap, Reference<?> ref) {
|
301
|
NodeList tmp =node.getChildNodes();
|
302
|
NodeList partNodes = null;
|
303
|
NodeList children = null;
|
304
|
|
305
|
List<String> originInfo = null;
|
306
|
List<String> partList = null;
|
307
|
|
308
|
TimePeriod date;
|
309
|
|
310
|
String publisher="";
|
311
|
String publishplace="";
|
312
|
String pstart="";
|
313
|
String pend="";
|
314
|
|
315
|
Map<String,String> mapmap=null;
|
316
|
|
317
|
Map<String, String> relatedInfoMap = new HashMap<String, String>();
|
318
|
List<String> roleList = new ArrayList<String>();
|
319
|
String content="";
|
320
|
|
321
|
relatedInfoMap.put("type",node.getAttributes().getNamedItem("type").getNodeValue());
|
322
|
|
323
|
|
324
|
Reference<?> inRef = null;
|
325
|
for (int j=0;j<tmp.getLength();j++){
|
326
|
Node childNode = tmp.item(j);
|
327
|
String childNodeName = childNode.getNodeName();
|
328
|
if (childNodeName.equalsIgnoreCase("#text") && childNode.getTextContent().matches("\\s*")){
|
329
|
//do nothing
|
330
|
} else if (childNodeName.equalsIgnoreCase("mods:titleInfo")) {
|
331
|
content=childNode.getTextContent().trim();
|
332
|
if (!content.isEmpty()) {
|
333
|
relatedInfoMap.put("titleInfo",content);
|
334
|
if (node.getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("host")){
|
335
|
List<Reference> references = importer.getReferenceService().list(Reference.class, 0, 0, null, null);
|
336
|
boolean refFound = false;
|
337
|
for (Reference<?> tmpRef:references){
|
338
|
if(tmpRef.getTitleCache().equalsIgnoreCase(content)){
|
339
|
refFound = true;
|
340
|
inRef= tmpRef;
|
341
|
}
|
342
|
}
|
343
|
if (!refFound){
|
344
|
inRef = getBestInreference(ref);
|
345
|
if (inRef == null){
|
346
|
inRef = ReferenceFactory.newGeneric();
|
347
|
}
|
348
|
|
349
|
//book.setTitleCache(content,true);
|
350
|
inRef.setTitle(content);
|
351
|
}
|
352
|
if ((ref.getInReference() == null) || !ref.getInReference().equals(inRef)) {
|
353
|
ref.setInReference(inRef);
|
354
|
}else{
|
355
|
//TODO
|
356
|
}
|
357
|
}
|
358
|
}
|
359
|
} else if (childNodeName.equalsIgnoreCase("mods:originInfo")) {
|
360
|
children = childNode.getChildNodes();
|
361
|
originInfo = new ArrayList<String>();
|
362
|
for (int i=0;i<children.getLength();i++){
|
363
|
content=children.item(i).getTextContent().trim();
|
364
|
if (!content.isEmpty()) {
|
365
|
originInfo.add(children.item(i).getNodeName()+":"+content);
|
366
|
if (children.item(i).getNodeName().contains("dateIssued")) {
|
367
|
ref.setDatePublished(TimePeriodParser.parseString(content));
|
368
|
}
|
369
|
}
|
370
|
publisher="";
|
371
|
publishplace="";
|
372
|
if (children.item(i).getNodeName().contains("publisher")) {
|
373
|
try{
|
374
|
publisher=children.item(i).getChildNodes().item(0).getTextContent().trim();
|
375
|
// System.out.println("PUBLISHER "+publisher);
|
376
|
}catch(Exception e){System.out.println("oups "+e);}
|
377
|
}
|
378
|
if (children.item(i).getNodeName().contains("place")) {
|
379
|
try{
|
380
|
publishplace=children.item(i).getTextContent().trim();
|
381
|
// System.out.println("PUBLISHED "+publishplace);
|
382
|
}catch(Exception e){System.out.println("oups "+e);}
|
383
|
}
|
384
|
if (publishplace.isEmpty() && !publisher.isEmpty()) {
|
385
|
ref.setPublisher(publisher);
|
386
|
}
|
387
|
if (!publishplace.isEmpty() && !publisher.isEmpty()) {
|
388
|
ref.setPublisher(publisher, publishplace);
|
389
|
}
|
390
|
}
|
391
|
relatedInfoMap.put("originInfo", StringUtils.join(originInfo.toArray(),SPLITTER));
|
392
|
} else if (childNodeName.equalsIgnoreCase("mods:name")){
|
393
|
//handled later
|
394
|
} else if (childNodeName.equalsIgnoreCase("mods:part")){
|
395
|
children = childNode.getChildNodes();
|
396
|
partList = new ArrayList<String>();
|
397
|
for (int i=0;i<children.getLength();i++){
|
398
|
mapmap = new HashMap<String, String>();
|
399
|
// System.out.println(children.item(i).getNodeName());
|
400
|
|
401
|
if (children.item(i).getNodeName().equalsIgnoreCase("#text") && children.item(i).getTextContent().matches("\\s*")){
|
402
|
//do nothing
|
403
|
} else if (children.item(i).getNodeName().equalsIgnoreCase("mods:date")){
|
404
|
content = children.item(i).getTextContent().trim();
|
405
|
if (!content.isEmpty()){
|
406
|
date = TimePeriodParser.parseString(content);
|
407
|
//TODO need to check if date belongs to ref or inref
|
408
|
ref.setDatePublished(date);
|
409
|
}
|
410
|
} else if (children.item(i).getNodeName().equalsIgnoreCase("mods:detail") &&
|
411
|
children.item(i).getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("volume")){
|
412
|
partNodes = children.item(i).getChildNodes();
|
413
|
for (int k=0; k<partNodes.getLength();k++){
|
414
|
if (partNodes.item(k).getNodeName().equalsIgnoreCase("mods:number")) {
|
415
|
content = partNodes.item(k).getTextContent().trim();
|
416
|
if (!content.isEmpty()) {
|
417
|
ref.setVolume(content);
|
418
|
}
|
419
|
}
|
420
|
}
|
421
|
} else if (children.item(i).getNodeName().equalsIgnoreCase("mods:extent")) {
|
422
|
mapmap.put("unit", children.item(i).getAttributes().getNamedItem("unit").getNodeValue());
|
423
|
partNodes = children.item(i).getChildNodes();
|
424
|
pstart="";
|
425
|
pend="";
|
426
|
for (int k=0; k<partNodes.getLength();k++){
|
427
|
if (partNodes.item(k).getNodeName().equalsIgnoreCase("mods:start")) {
|
428
|
content = partNodes.item(k).getTextContent().trim();
|
429
|
if (!content.isEmpty()) {
|
430
|
mapmap.put("start",content);
|
431
|
pstart=content;
|
432
|
}
|
433
|
}
|
434
|
if (partNodes.item(k).getNodeName().equalsIgnoreCase("mods:end")) {
|
435
|
content = partNodes.item(k).getTextContent().trim();
|
436
|
if (!content.isEmpty()) {
|
437
|
mapmap.put("end",content);
|
438
|
pend=content;
|
439
|
}
|
440
|
}
|
441
|
}
|
442
|
// System.out.println("SET PAGES "+pstart+"-"+pend);
|
443
|
ref.setPages(pstart+"-"+pend);
|
444
|
}else{
|
445
|
logger.warn("mods:part not yet supported: " + children.item(i).getNodeName());
|
446
|
}
|
447
|
partList.add(mapmap.toString());
|
448
|
}
|
449
|
modsMap.put("part",StringUtils.join(partList.toArray(),SPLITTER));
|
450
|
}else{
|
451
|
logger.warn("relatedItem child not yet supported: " + childNodeName);
|
452
|
}
|
453
|
}
|
454
|
|
455
|
|
456
|
handleModsNames(children, inRef);
|
457
|
|
458
|
relatedInfoMap.put("relatedRoles", StringUtils.join(roleList.toArray(),SPLITTER));
|
459
|
modsMap.put("relatedInfo",relatedInfoMap.toString());
|
460
|
}
|
461
|
|
462
|
|
463
|
/**
|
464
|
* Returns empty reference which best fits to the given ref as in-reference.
|
465
|
* TODO move to {@link ReferenceType} or {@link ReferenceFactory}
|
466
|
* @param ref
|
467
|
* @return
|
468
|
*/
|
469
|
private Reference<?> getBestInreference(Reference<?> ref) {
|
470
|
if (ref.getType().equals(ReferenceType.Article)){
|
471
|
return ReferenceFactory.newJournal();
|
472
|
}else if (ref.getType().equals(ReferenceType.BookSection)){
|
473
|
return ReferenceFactory.newBook();
|
474
|
}else{
|
475
|
//TODO support more types
|
476
|
logger.warn("In-Reference type not yet supported for :" + ref.getType());
|
477
|
|
478
|
}
|
479
|
return null;
|
480
|
}
|
481
|
|
482
|
}
|