Project

General

Profile

Download (21.5 KB) Statistics
| Branch: | Tag: | Revision:
1
package eu.etaxonomy.cdm.io.taxonx2013;
2

    
3
import java.net.URI;
4
import java.net.URISyntaxException;
5
import java.util.ArrayList;
6
import java.util.HashMap;
7
import java.util.List;
8
import java.util.Map;
9
import java.util.UUID;
10

    
11
import org.apache.commons.lang.StringUtils;
12
import org.apache.log4j.Logger;
13
import org.w3c.dom.NamedNodeMap;
14
import org.w3c.dom.Node;
15
import org.w3c.dom.NodeList;
16

    
17
import eu.etaxonomy.cdm.common.DOI;
18
import eu.etaxonomy.cdm.model.agent.Person;
19
import eu.etaxonomy.cdm.model.agent.Team;
20
import eu.etaxonomy.cdm.model.common.TimePeriod;
21
import eu.etaxonomy.cdm.model.reference.Reference;
22
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
23
import eu.etaxonomy.cdm.model.reference.ReferenceType;
24
import eu.etaxonomy.cdm.strategy.parser.TimePeriodParser;
25

    
26
public class TaxonXModsExtractor extends TaxonXExtractor{
27

    
28
    private final Map<String,UUID> personMap = new HashMap<String, UUID>();
29

    
30
    private final Logger logger = Logger.getLogger(getClass());
31
    
32

    
33
	private final String AUTHOR = "author";
34
    private final String EDITOR = "editor";
35

    
36
    /**
     * Creates a MODS extractor working on behalf of the given importer.
     *
     * @param importer the import whose agent service and reference service
     *                 are used by this extractor
     */
    public TaxonXModsExtractor(TaxonXImport importer) {
        this.importer = importer;
    }
42

    
43
    public Reference<?> extractMods(Node node){
44
    	
45
    	//TODO needed? currently only filled but never read
46
        Map<String, String> modsMap = new HashMap<String, String>();
47
        NodeList children = node.getChildNodes();
48
        List<String> roleList = new ArrayList<String>();
49
        String content="";
50

    
51
        Reference<?> ref = tryMakeReferenceByClassification(children);
52

    
53
        if (ref == null){
54
	        //        int reftype = askQuestion("What kind of reference is it?\n 1: Generic\n 2: Book\n 3: Article\n" +
55
	        //                " 4 : BookSection\n 5 : Journal\n 6 : Printseries\n 7: Thesis ");
56
	        int reftype=1;
57
	        
58
	        ref = getReferenceWithType(reftype);
59
        }
60
        handleModsNames(children, ref);
61
        
62
        for (int i=0; i<children.getLength();i++){
63
        	Node modsChildNode = children.item(i);
64
        	String modsChildNodeName = modsChildNode.getNodeName();
65
            if (modsChildNodeName.equalsIgnoreCase("mods:titleinfo")){
66
                NodeList tmp = modsChildNode.getChildNodes();
67
                for (int j=0;j<tmp.getLength();j++){
68
                    if (tmp.item(j).getNodeName().equalsIgnoreCase("mods:title")) {
69
                        content=tmp.item(j).getTextContent().trim();
70
                        if (!content.isEmpty()) {
71
                            modsMap.put("mainTitle",content);
72
                            //                            ref.setTitleCache(content,true);
73
                            ref.setTitle(content);
74
                            //                            ref.generateTitle();
75
                        }
76
                    }
77
                }
78
            }else if (modsChildNodeName.equalsIgnoreCase("mods:name")){
79
               //handled separately
80
            }else if (modsChildNodeName.equalsIgnoreCase("mods:typeofresource")){
81
                content = modsChildNode.getTextContent().trim();
82
                if (!content.isEmpty()) {
83
                    modsMap.put("typeofresource",content);
84
                }
85
            }else if (modsChildNodeName.equalsIgnoreCase("mods:identifier")){
86
                content = modsChildNode.getTextContent().trim();
87
                if (!content.isEmpty()) {
88
                    modsMap.put(modsChildNode.getAttributes().getNamedItem("type").getNodeValue(),content);
89
                    if (modsChildNode.getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("isbn")) {
90
                        ref.setIsbn(content);
91
                    }else if (modsChildNode.getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("issn")) {
92
                        ref.setIssn(content);
93
                    }else if (modsChildNode.getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("DOI")) {
94
                        try {
95
							ref.setDoi(DOI.fromString(content));
96
						} catch (IllegalArgumentException e) {
97
							logger.warn(content + " is not a vaild DOI");
98
						}
99
                    }else if (modsChildNode.getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("GenericHash")) {
100
                        ref.setIssn("GenericHash: "+content);
101
                        try {
102
                            ref.setUri(new URI("http://plazi.cs.umb.edu/GgServer/search?MODS.ModsDocID="+content));
103
                        } catch (URISyntaxException e) {
104
                            // TODO Auto-generated catch block
105
                            e.printStackTrace();
106
                        }
107
                    }else{
108
                    	logger.info("identifier " + modsChildNode.getAttributes().getNamedItem("type").getNodeValue() + " not yet handled.");
109
                    }
110
                }
111
            }else if (modsChildNodeName.equalsIgnoreCase("mods:location")){
112
                NodeList tmp = modsChildNode.getChildNodes();
113
                for (int j=0;j<tmp.getLength();j++){
114
                    //                    System.out.println("Child of mods:location: "+tmp.item(j).getNodeName());
115
                    if (tmp.item(j).getNodeName().equalsIgnoreCase("mods:url")) {
116
                        content = tmp.item(j).getTextContent().trim();
117
                        if (!content.isEmpty() && (content != "http://un.availab.le")) {
118
                            modsMap.put("url",content);
119
                            ref.setUri(URI.create(content));
120
                        }
121
                    }
122
                }
123
            }
124
            else if (modsChildNodeName.equalsIgnoreCase("mods:relatedItem")){
125
                addRelatedMods(modsChildNode, modsMap, ref);
126
            }else if (modsChildNodeName.equalsIgnoreCase("mods:classification")){
127
                    //already handled before
128
            }else if (modsChildNodeName.equalsIgnoreCase("#text") && modsChildNode.getTextContent().matches("\\s*")){
129
                //already handled before
130
            }else{
131
            	logger.warn("mods item not recognized yet: " + modsChildNodeName);
132
            }
133

    
134

    
135
        }
136
        modsMap.put("people",StringUtils.join(roleList.toArray(),SPLITTER));
137

    
138
        List<Reference> references = importer.getReferenceService().list(Reference.class, 0, 0, null, null);
139
        for(Reference<?> refe:references){
140
            if (refe.getCitation().equalsIgnoreCase(ref.getCitation())) {
141
                ref=refe;
142
            }
143
        }
144
        //        System.out.println(modsMap);
145
        //
146
        //        System.out.println("REFERENCE "+ref.getCitation());
147
        //        System.out.println("REFERENCE "+ref.getTitle());
148
        //        System.out.println("REFERENCE "+ref.getTitleCache());
149
        return ref;
150
    }
151

    
152
    private void handleModsNames(NodeList children, Reference<?> ref) {
153
    	 
154
        List<String> roleList = new ArrayList<String>();
155
        
156
        List<Person> persons = new ArrayList<Person>();
157
        List<String> editors= new ArrayList<String>();
158

    
159
    	
160
    	//handle all mods:name
161
        for ( int i = 0; i<children.getLength(); i++){
162
    		if (children.item(i).getNodeName().equalsIgnoreCase("mods:name")){
163
    			NamedNodeMap attributeMap = children.item(i).getAttributes();
164
    			if ((attributeMap.getNamedItem("type") != null) && attributeMap.getNamedItem("type").getNodeValue().equalsIgnoreCase("personal")) {
165
    				handleNameTypePersonal(children.item(i), roleList, persons, editors);
166
    			} else if (attributeMap.getNamedItem("type") == null){
167
    				logger.warn("mods:name attribute 'type' is missing. Name not handled");
168
    			}else {
169
    				logger.warn("mods:name 'type' " + attributeMap.getNamedItem("type").getNodeValue() + " not yet supported"); 
170
    			}
171
    		}
172
    	}
173
        //evaluate authors and editors
174
       	if (persons.size()>0){
175
       		if (ref == null){
176
       			logger.warn("mods:name exists but reference is null");
177
       		}else if (persons.size()==1){
178
                ref.setAuthorship(persons.get(0));
179
            }
180
            else{
181
                Team authorship = Team.NewInstance();
182
                for (Person pers:persons){
183
                    authorship.addTeamMember(pers);
184
                }
185

    
186
                if (!personMap.containsKey(authorship.getTitleCache()) && (authorship.getTeamMembers().size()>0)){
187
                    UUID uuid = importer.getAgentService().saveOrUpdate(authorship);
188
                    personMap.put(authorship.getTitleCache(),uuid);
189
                }else{
190
                    if(authorship.getTeamMembers().size()>1) {
191
                    	UUID uuid = personMap.get(authorship.getTitleCache());
192
                        authorship =  (Team) importer.getAgentService().find(uuid);
193
                    }
194
                }
195

    
196
                ref.setAuthorship(authorship);
197
            }
198
            if (editors.size()>0) {
199
                ref.setEditor(StringUtils.join(editors,", "));
200
            }
201
        }
202
	}
203

    
204
	/**
205
     * Extracts the reference with correct type from mods:classification.
206
     * Incomplete implementation. Will be filled whenever new cases show up
207
     * @param children list of all children of mods:mods
208
     * @return
209
     */
210
    //http://www.loc.gov/standards/mods/userguide/classification.html
211
    private Reference<?> tryMakeReferenceByClassification(NodeList children) {
212
        for (int i=0; i<children.getLength();i++){
213
            if (children.item(i).getNodeName().equalsIgnoreCase("mods:classification")){
214
            	Node classificationNode = children.item(i);
215
            	String text = classificationNode.getTextContent();
216
            	if ("journal article".equals(text)){
217
            		return ReferenceFactory.newArticle();
218
            	}else if ("book".equals(text)){
219
                	return ReferenceFactory.newBook();
220
            	}else{
221
            		if (StringUtils.isNotBlank(text)){
222
            			logger.warn("mods:classification could not be recognized: " + text);
223
            		}else{
224
            			logger.warn("mods:classification has not text. ");
225
            		}
226
            	}
227
            }
228
        }    
229
		return null;
230
	}
231

    
232

    
233
    private void handleNameTypePersonal(Node node, List<String> roleList, List<Person> persons, List<String> editors) {
234
    	boolean newRole=false;
235
        String content="";
236
        String role =null;
237
        
238
        List<String> nameParts = new ArrayList<String>();
239
        
240
    	NodeList tmp = node.getChildNodes();
241
        for (int j=0;j<tmp.getLength();j++){
242

    
243
            if (tmp.item(j).getNodeName().equalsIgnoreCase("mods:namePart")) {
244
                content=tmp.item(j).getTextContent().trim();
245
                if (! content.isEmpty()) {
246
                	nameParts.add(content);
247
                }
248
            } else if (tmp.item(j).getNodeName().equalsIgnoreCase("mods:role")) {
249
                NodeList roleChildren = tmp.item(j).getChildNodes();
250
                for (int k=0; k< roleChildren.getLength();k++){
251
                    if (roleChildren.item(k).getNodeName().equalsIgnoreCase("mods:roleTerm")){
252
                        content = roleChildren.item(k).getTextContent().trim();
253
                        if (!content.isEmpty()) {
254
                            roleList.add(content);
255
                            //                                p.setNomenclaturalTitle(content);
256
                            if (content.equalsIgnoreCase(EDITOR)) {
257
                                role=EDITOR;
258
                            }
259
                            else if (content.equalsIgnoreCase(AUTHOR)) {
260
                                role=AUTHOR;
261
                            }
262
                            newRole=true;
263
                        }
264
                    }
265
                }
266
            }                    
267

    
268
        }
269
        
270
        Person p=null;
271
        if (! nameParts.isEmpty()){
272
            p = Person.NewInstance();
273
            p.setTitleCache(StringUtils.join(nameParts.toArray(), " "), true);
274
        }
275
        
276
        if (newRole){
277
            if ((p!=null) && role.equals(AUTHOR)) {
278
                UUID uuid = null;
279
                if (!personMap.containsKey(p.getTitleCache())){
280
                    uuid = importer.getAgentService().saveOrUpdate(p);
281
                    p = (Person) importer.getAgentService().find(uuid);
282
                    personMap.put(p.getTitleCache(),uuid);
283
                }else{
284
                    uuid = personMap.get(p.getTitleCache());
285
                    p = (Person) importer.getAgentService().find(uuid);
286
                }
287
                //                        logger.info("ADD PERSON "+p);
288
                persons.add(p);
289
            }
290
            else if ((p!=null) && role.equals(EDITOR)) {
291
                editors.add(p.getTitleCache());
292
            }
293
        }		
294
	}
295

    
296
	/**
297
     * @param item
298
     * @param modsMap
299
     */
300
    private void addRelatedMods(Node node, Map<String, String> modsMap, Reference<?> ref) {
301
        NodeList tmp =node.getChildNodes();
302
        NodeList partNodes = null;
303
        NodeList children = null;
304

    
305
        List<String> originInfo = null;
306
        List<String> partList = null;
307

    
308
        TimePeriod date;
309

    
310
        String publisher="";
311
        String publishplace="";
312
        String pstart="";
313
        String pend="";
314

    
315
        Map<String,String> mapmap=null;
316

    
317
        Map<String, String> relatedInfoMap = new HashMap<String, String>();
318
        List<String> roleList = new ArrayList<String>();
319
        String content="";
320

    
321
        relatedInfoMap.put("type",node.getAttributes().getNamedItem("type").getNodeValue());
322

    
323
        
324
        Reference<?> inRef = null;
325
        for (int j=0;j<tmp.getLength();j++){
326
        	Node childNode = tmp.item(j);
327
        	String childNodeName = childNode.getNodeName();
328
	        if (childNodeName.equalsIgnoreCase("#text")  && childNode.getTextContent().matches("\\s*")){
329
	        	//do nothing
330
	        } else if (childNodeName.equalsIgnoreCase("mods:titleInfo")) {
331
                content=childNode.getTextContent().trim();
332
                if (!content.isEmpty()) {
333
                    relatedInfoMap.put("titleInfo",content);
334
                    if (node.getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("host")){
335
                        List<Reference> references = importer.getReferenceService().list(Reference.class, 0, 0, null, null);
336
                        boolean refFound = false;
337
                        for (Reference<?> tmpRef:references){
338
                            if(tmpRef.getTitleCache().equalsIgnoreCase(content)){
339
                                refFound = true;
340
                                inRef= tmpRef;
341
                            }
342
                        }
343
                        if (!refFound){
344
                            inRef = getBestInreference(ref);
345
                            if (inRef == null){
346
                            	inRef = ReferenceFactory.newGeneric();
347
                            }
348
                            
349
                            //book.setTitleCache(content,true);
350
                            inRef.setTitle(content);
351
                        }
352
                        if ((ref.getInReference() == null) || !ref.getInReference().equals(inRef)) {
353
                            ref.setInReference(inRef);
354
                        }else{
355
                        	//TODO
356
                        }
357
                    }
358
                }
359
            } else if (childNodeName.equalsIgnoreCase("mods:originInfo")) {
360
                children = childNode.getChildNodes();
361
                originInfo = new ArrayList<String>();
362
                for (int i=0;i<children.getLength();i++){
363
                    content=children.item(i).getTextContent().trim();
364
                    if (!content.isEmpty()) {
365
                        originInfo.add(children.item(i).getNodeName()+":"+content);
366
                        if (children.item(i).getNodeName().contains("dateIssued")) {
367
                            ref.setDatePublished(TimePeriodParser.parseString(content));
368
                        }
369
                    }
370
                    publisher="";
371
                    publishplace="";
372
                    if (children.item(i).getNodeName().contains("publisher")) {
373
                        try{
374
                            publisher=children.item(i).getChildNodes().item(0).getTextContent().trim();
375
                            //                            System.out.println("PUBLISHER "+publisher);
376
                        }catch(Exception e){System.out.println("oups "+e);}
377
                    }
378
                    if (children.item(i).getNodeName().contains("place")) {
379
                        try{
380
                            publishplace=children.item(i).getTextContent().trim();
381
                            //                            System.out.println("PUBLISHED "+publishplace);
382
                        }catch(Exception e){System.out.println("oups "+e);}
383
                    }
384
                    if (publishplace.isEmpty() && !publisher.isEmpty()) {
385
                        ref.setPublisher(publisher);
386
                    }
387
                    if (!publishplace.isEmpty() && !publisher.isEmpty()) {
388
                        ref.setPublisher(publisher, publishplace);
389
                    }
390
                }
391
                relatedInfoMap.put("originInfo", StringUtils.join(originInfo.toArray(),SPLITTER));
392
            } else if (childNodeName.equalsIgnoreCase("mods:name")){
393
            	//handled later
394
            } else if (childNodeName.equalsIgnoreCase("mods:part")){
395
                children = childNode.getChildNodes();
396
                partList = new ArrayList<String>();
397
                for (int i=0;i<children.getLength();i++){
398
                    mapmap = new HashMap<String, String>();
399
                    //                    System.out.println(children.item(i).getNodeName());
400

    
401
                    if (children.item(i).getNodeName().equalsIgnoreCase("#text")  && children.item(i).getTextContent().matches("\\s*")){
402
        	        	//do nothing
403
        	        } else if (children.item(i).getNodeName().equalsIgnoreCase("mods:date")){
404
                        content = children.item(i).getTextContent().trim();
405
                        if (!content.isEmpty()){
406
                            date = TimePeriodParser.parseString(content);
407
                            //TODO need to check if date belongs to ref or inref
408
                            ref.setDatePublished(date);
409
                        }
410
                    } else if (children.item(i).getNodeName().equalsIgnoreCase("mods:detail") &&
411
                            children.item(i).getAttributes().getNamedItem("type").getNodeValue().equalsIgnoreCase("volume")){
412
                        partNodes = children.item(i).getChildNodes();
413
                        for (int k=0; k<partNodes.getLength();k++){
414
                            if (partNodes.item(k).getNodeName().equalsIgnoreCase("mods:number")) {
415
                                content = partNodes.item(k).getTextContent().trim();
416
                                if (!content.isEmpty()) {
417
                                    ref.setVolume(content);
418
                                }
419
                            }
420
                        }
421
                    } else if (children.item(i).getNodeName().equalsIgnoreCase("mods:extent")) {
422
                        mapmap.put("unit", children.item(i).getAttributes().getNamedItem("unit").getNodeValue());
423
                        partNodes = children.item(i).getChildNodes();
424
                        pstart="";
425
                        pend="";
426
                        for (int k=0; k<partNodes.getLength();k++){
427
                            if (partNodes.item(k).getNodeName().equalsIgnoreCase("mods:start")) {
428
                                content = partNodes.item(k).getTextContent().trim();
429
                                if (!content.isEmpty()) {
430
                                    mapmap.put("start",content);
431
                                    pstart=content;
432
                                }
433
                            }
434
                            if (partNodes.item(k).getNodeName().equalsIgnoreCase("mods:end")) {
435
                                content = partNodes.item(k).getTextContent().trim();
436
                                if (!content.isEmpty()) {
437
                                    mapmap.put("end",content);
438
                                    pend=content;
439
                                }
440
                            }
441
                        }
442
                        //                        System.out.println("SET PAGES "+pstart+"-"+pend);
443
                        ref.setPages(pstart+"-"+pend);
444
                    }else{
445
                    	logger.warn("mods:part not yet supported: " + children.item(i).getNodeName());
446
                    }
447
                    partList.add(mapmap.toString());
448
                }
449
                modsMap.put("part",StringUtils.join(partList.toArray(),SPLITTER));
450
            }else{
451
            	logger.warn("relatedItem child not yet supported: " + childNodeName);
452
            }
453
        }
454
        
455
        
456
        handleModsNames(children, inRef);
457
        
458
        relatedInfoMap.put("relatedRoles", StringUtils.join(roleList.toArray(),SPLITTER));
459
        modsMap.put("relatedInfo",relatedInfoMap.toString());
460
    }
461

    
462
    
463
	/**
464
	 * Returns empty reference which best fits to the given ref as in-reference.
465
	 * TODO move to {@link ReferenceType} or {@link ReferenceFactory}
466
	 * @param ref
467
	 * @return
468
	 */
469
	private Reference<?> getBestInreference(Reference<?> ref) {
470
		if (ref.getType().equals(ReferenceType.Article)){
471
			return ReferenceFactory.newJournal();
472
		}else if (ref.getType().equals(ReferenceType.BookSection)){
473
			return ReferenceFactory.newBook();
474
		}else{
475
			//TODO support more types
476
			logger.warn("In-Reference type not yet supported for :" + ref.getType());
477
			
478
		}
479
		return null;
480
	}
481

    
482
}
(7-7/9)