Project

General

Profile

Revision d9aa603a

ID d9aa603aade3ded73eedd57d346e478e820073a4
Parent 63367139
Child 8e78f363

Added by Andreas Müller 9 months ago

fix #9551 and #9550 seriesPart parsing for articles (and books?) and special titles like PhytoKeys

View differences:

cdmlib-model/src/main/java/eu/etaxonomy/cdm/model/reference/IArticle.java
23 23
 * <li> "MagazineArticle"
24 24
 * </ul>
25 25
 */
26
public interface IArticle extends ISection, IVolumeReference{
27

  
28
	/**
29
	 * Returns the series information for this article
30
	 */
31
	public String getSeriesPart();
32

  
33
	/**
34
	 * Sets the series information for this article
35
	 * @param series
36
	 */
37
	public void setSeriesPart(String series);
38

  
39

  
26
public interface IArticle extends ISection, IVolumeReference, ISeriesPart{
40 27

  
41 28
	/**
42 29
	 * Returns this article's journal.
cdmlib-model/src/main/java/eu/etaxonomy/cdm/model/reference/IPrintedUnitBase.java
16 16
 * In this case it is generally possible to distinguish authors, editors and
17 17
 * publishers.
18 18
 */
19
public interface IPrintedUnitBase extends IAuthoredPublicationBase, ISection, IVolumeReference {
19
public interface IPrintedUnitBase extends IAuthoredPublicationBase, ISection, IVolumeReference, ISeriesPart {
20 20

  
21 21
	/**
22 22
	 * Returns the print series of this printed unit
......
42 42
	 */
43 43
	public void setEditor(String editor);
44 44

  
45
	/**
46
	 * Returns the series part for this printed unit
47
	 */
48
	public String getSeriesPart();
49

  
50
	/**
51
	 * Sets the series part for this printed unit
52
	 * @param seriesPart
53
	 */
54
	public void setSeriesPart(String seriesPart);
55

  
56 45
}
cdmlib-model/src/main/java/eu/etaxonomy/cdm/model/reference/ISeriesPart.java
1
/**
2
* Copyright (C) 2021 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.model.reference;
10

  
11
/**
12
 * @author a.mueller
13
 * @since 24.03.2021
14
 */
15
public interface ISeriesPart {
16

  
17
    /**
18
     * Returns the series part of this reference (e.g. an article or a printed unit)
19
     */
20
    public String getSeriesPart();
21

  
22
    /**
23
     * Sets the series part of this reference (e.g. an article or a printed unit)
24
     * @param seriesPart
25
     */
26
    public void setSeriesPart(String seriesPart);
27
}
cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/parser/NonViralNameParserImpl.java
42 42
import eu.etaxonomy.cdm.model.reference.IBook;
43 43
import eu.etaxonomy.cdm.model.reference.IBookSection;
44 44
import eu.etaxonomy.cdm.model.reference.INomenclaturalReference;
45
import eu.etaxonomy.cdm.model.reference.ISeriesPart;
45 46
import eu.etaxonomy.cdm.model.reference.IVolumeReference;
46 47
import eu.etaxonomy.cdm.model.reference.Reference;
47 48
import eu.etaxonomy.cdm.model.reference.ReferenceFactory;
......
631 632
		}else if (nomRef instanceof Reference){
632 633
			((Reference)nomRef).setDatePublished(datePublished);
633 634
		}else{
634
			throw new ClassCastException("nom Ref is not of type Reference but " + (nomRef == null? "(null)" : nomRef.getClass()));
635
			throw new ClassCastException("nom Ref is not of type Reference but " + (nomRef.getClass()));
635 636
		}
636 637
		return result;
637 638
	}
638 639

  
639
	private String makeVolume(IVolumeReference nomRef, String strReference){
640
	private String makeVolumeAndSeries(IVolumeReference nomRef, String strReference){
640 641
		//volume
641 642
		String volPart = null;
642 643
		String pVolPhrase = volumeSeparator +  volume + end;
......
647 648
			volPart = volPart.replaceFirst(pStart + volumeSeparator, "").trim();
648 649
			nomRef.setVolume(volPart);
649 650
		}
651
		strReference = parseSeries(strReference, (ISeriesPart)nomRef);
650 652
		return strReference;
651 653
	}
652 654

  
653
	private String makeEdition(IBook book, String strReference){
655
    private String parseSeries(String strReference, ISeriesPart nomRef) {
656
        String seriesPart = null;
657
        String seriesPhrase = pSeriesPart + end;
658
        Matcher seriesPhraseMatcher = getMatcher(seriesPhrase, strReference);
659
        if (seriesPhraseMatcher.find()){
660
            seriesPart = seriesPhraseMatcher.group(0);
661
            strReference = strReference.substring(0, strReference.length() - seriesPart.length());
662
            seriesPart = seriesPart.startsWith(",")? seriesPart.substring(1): seriesPart;
663
            seriesPart = seriesPart.endsWith(",")? seriesPart.substring(0, seriesPart.length()-1): seriesPart;
664
            nomRef.setSeriesPart(seriesPart.trim());
665
        }
666
        return strReference;
667
    }
668

  
669
    private String makeEdition(IBook book, String strReference){
654 670

  
655 671
		String editionPart = null;
656 672
		Matcher editionPhraseMatcher = getMatcher(pEditionPart, strReference);
......
681 697
	private IBook parseBook(String reference){
682 698
		IBook result = ReferenceFactory.newBook();
683 699
		reference = makeEdition(result, reference);
684
		reference = makeVolume(result, reference);
700
		reference = makeVolumeAndSeries(result, reference);
685 701
		result.setAbbrevTitle(reference);
686 702
		return result;
687 703
	}
......
690 706
		//if (articlePattern)
691 707
		//(type, author, title, volume, editor, series;
692 708
		Reference result = ReferenceFactory.newArticle();
693
		reference = makeVolume(result, reference);
709
		reference = makeVolumeAndSeries(result, reference);
694 710
		Reference inJournal = ReferenceFactory.newJournal();
695 711
		inJournal.setAbbrevTitle(reference);
696 712
		result.setInReference(inJournal);
cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/parser/NonViralNameParserImplRegExBase.java
193 193

  
194 194
    protected static int authorSeparatorMaxPosition = 3;  //author may have a maximum of 2 words
195 195
    protected static String pTitleWordSeparator = "(\\."+ fWs+"|" + oWs + "|\\.?[-\u2013]"+oWs+"|\\.?" + oWs + "&(?!\\s*al\\.)" + oWs + ")";
196
    protected static String pSeriesPart = ",?" + fWs + "(([sS][e\u00E9]r|сер)("+oWs+"|\\."+fWs+")(\\d{1,2}|[A-Z](\\s*\\d{1,2})?)|n(ov)?\\.\\s*[sS](er)?\\.|Jerusalem Ser\\.|(Pt|Sect)\\.\\s*\\d{1,2}),?";  //Pt. (Part) and Sect. (Section) currently handled as series part, which is part of title, may be handled different later
196
    protected static String pSeriesPart = fWs + ",?" + fWs + "(([sS][e\u00E9]r|сер)("+oWs+"|\\."+fWs+")(\\d{1,2}|[A-Z](\\s*\\d{1,2})?)|n(ov)?\\.\\s*[sS](er)?\\.|Jerusalem Ser\\.|(Pt|Sect)\\.\\s*\\d{1,2}),?";  //Pt. (Part) and Sect. (Section) currently handled as series part, which is part of title, may be handled different later
197 197

  
198 198
    protected static String authorPrefix = "(Da(lla)?|Van|La|De)" + oWs; //should not include words allowed in first part of reference title
199 199
    protected static String firstTitleWord = "(?!"+authorPrefix+")" + word + "('\\p{javaLowerCase}*|[-\u2013]"+word+")?"; //word with optional apostrophe in between
200 200

  
201
    protected static String singleJournalTitles = "PhytoKeys"; //for further titles use "|"
201 202
    protected static String referenceTitleFirstPart = "(" + firstTitleWord + pTitleWordSeparator + "|" + twoCapitalDotWord + fWs + ")";
202
    protected static String referenceTitleBase = referenceTitleFirstPart + "*" + "("+ dotWord + "|" + uppercaseWord + "|" + quotations + "|" + pSeriesPart + ")";  //reference title may have words separated by whitespace or dot. The last word may not have a whitespace at the end. There must be at least one word
203
    protected static String referenceTitle = "("+referenceTitleBase + "|PhytoKeys)";
203
    protected static String referenceTitleBase = "("+ referenceTitleFirstPart + "*" + "("+ dotWord + "|" + uppercaseWord + "|" + quotations + ")"
204
                    + "|" +singleJournalTitles + ")";  //reference title may have words separated by whitespace or dot. The last word may not have a whitespace at the end. There must be at least one word
205
    protected static String referenceTitleBaseWithSeries = referenceTitleBase + "("+ pSeriesPart + ")?";
206
    protected static String referenceTitle = "("+referenceTitleBaseWithSeries +")";
204 207
    protected static String referenceTitleWithSepCharacters = "(((" + referenceTitle +"|\\(.+\\))"  + anySepChar + ")*" + referenceTitle + ")"; //,?
205 208
    //TODO test performance ??
206 209
    protected static String referenceTitleWithSepCharactersAndBrackets = referenceTitleWithSepCharacters + fWs + "(\\(" + referenceTitleWithSepCharacters + "\\)"+fWs+ ")?(" + referenceTitleWithSepCharacters +")?"  ;
......
212 215
    protected static String editionSeparator = "(" + oWs + "|," + fWs + ")ed\\.?" + oWs;  //
213 216
    public static String pEdition = nr2;
214 217

  
215
    protected static String pVolPart = volumeSeparator +  volume;
218
    protected static String pVolPart = volumeSeparator + volume;
216 219
    protected static String pEditionPart = "(" + editionSeparator +  pEdition +"([A-Z]|\\s*bis)?|,\\s*(jubilee|nouv\\.) ed\\.)";
217 220
    protected static String pEditionVolPart = pEditionPart + fWs + "," + volumeSeparator + volume;
218 221
    protected static String pEditionVolAlternative = "(" + pEditionPart + "|" + pVolPart + "|" + pEditionVolPart + ")?";
cdmlib-model/src/test/java/eu/etaxonomy/cdm/strategy/parser/NonViralNameParserImplTest.java
1725 1725

  
1726 1726
    @Test
1727 1727
    public final void testSeries(){
1728
        //TODO should work also for the original string:  #9014
1729
//        String parseStr = "Mazus pumilus (Burm.f.) Steenis in Nova Guinea, n.s., 9: 31. 1958";
1730
        String parseStr = "Mazus pumilus (Burm.f.) Steenis in Nova Guinea Bla, n.s., 9: 31. 1958";
1728
        String parseStr = "Mazus pumilus (Burm.f.) Steenis in Nova Guinea, n.s., 9: 31. 1958";
1731 1729
        INonViralName name = parser.parseReferencedName(parseStr);
1732 1730
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1733 1731
        Reference nomRef = name.getNomenclaturalReference();
......
1735 1733

  
1736 1734
        assertEquals(ReferenceType.Article, nomRef.getType());
1737 1735
        assertEquals(name.getNomenclaturalMicroReference(), "31");
1738
        //TODO series should be parsed and handled better
1739
        assertEquals("Nova Guinea Bla, n.s.,", nomRef.getInJournal().getAbbrevTitle());
1740
//        assertEquals("n.s.", nomRef.getSeriesPart());
1736
        assertEquals("Nova Guinea", nomRef.getInJournal().getAbbrevTitle());
1737
        assertEquals("n.s.", nomRef.getSeriesPart());
1741 1738
    }
1742 1739

  
1743 1740
    @Test
......
1751 1748

  
1752 1749
        assertEquals(ReferenceType.Article, nomRef.getType());
1753 1750
        assertEquals(name.getNomenclaturalMicroReference(), "239");
1754
        //TODO series should be parsed and handled better
1755
        assertEquals("Тр. Бот. инст. Aкад. наук СССР, сер. 1,", nomRef.getInJournal().getAbbrevTitle());
1756
//        assertEquals("сер. 1", nomRef.getSeriesPart());
1751
        assertEquals("Тр. Бот. инст. Aкад. наук СССР", nomRef.getInJournal().getAbbrevTitle());
1752
        assertEquals("сер. 1", nomRef.getSeriesPart());
1757 1753
    }
1758 1754

  
1759 1755
    @Test
......
2037 2033
        Assert.assertFalse("nom.ref. should be parsable", nomRef.isProtectedTitleCache());
2038 2034
        assertEquals(ReferenceType.Article, nomRef.getType());
2039 2035
        //, n.s., is parsed as the series part and is no longer part of the title
2040
        assertEquals("Verh. Vereins Natur- Heilk. Presburg, n.s.,", nomRef.getInReference().getAbbrevTitle());
2036
        assertEquals("Verh. Vereins Natur- Heilk. Presburg", nomRef.getInReference().getAbbrevTitle());
2041 2037
        assertNull(nomRef.getEdition());
2038
        assertEquals("n.s.", nomRef.getSeriesPart());
2042 2039
        assertEquals("2", nomRef.getVolume());
2043 2040

  
2044 2041
          //Note: space in E+M, no space in IPNI; is it really a book?
......
2115 2112

  
2116 2113
    }
2117 2114

  
2115
    @Test
2116
    public final void testArticlePattern(){
2117
        Pattern articlePattern = Pattern.compile(NonViralNameParserImplRegExBase.pArticleReference);
2118
        Matcher matcher = articlePattern.matcher("Acta Bot. Hung. 46 (1-2)");
2119
        Assert.assertTrue("", matcher.matches());
2120
        matcher = articlePattern.matcher("Nova Guinea Bla 9");
2121
        Assert.assertTrue("", matcher.matches());
2122
        matcher = articlePattern.matcher("Nova Guinea Bla , n.s., 9");
2123
        Assert.assertTrue("", matcher.matches());
2124
    }
2125

  
2118 2126

  
2119 2127
    @Test
2120 2128
    public final void testSeriesPart(){
2121 2129
        Pattern seriesPattern = Pattern.compile(NonViralNameParserImplRegExBase.pSeriesPart);
2122
        Matcher matcher = seriesPattern.matcher("ser. 2");
2130
        Matcher matcher = seriesPattern.matcher(", ser. 2,");
2123 2131
        Assert.assertTrue("", matcher.matches());
2124 2132

  
2125 2133
        matcher = seriesPattern.matcher("n.s.");
......
2143 2151

  
2144 2152
        matcher = seriesPattern.matcher("nov. Ser.");
2145 2153
        Assert.assertTrue("", matcher.matches());
2146

  
2147

  
2148

  
2149 2154
    }
2150 2155

  
2151
    /**
2152
     * Test method for {@link eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl#fullTeams(java.lang.String)}.
2153
     */
2154 2156
    @Test
2155 2157
    public final void testFullTeams() {
2156 2158
        logger.warn("Not yet implemented"); // TODO
2157 2159
    }
2158 2160

  
2159
    /**
2160
     * Test method for {@link eu.etaxonomy.cdm.strategy.parser.NonViralNameParserImpl#AuthorshipAndEx(java.lang.String)}.
2161
     * @throws StringNotParsableException
2162
     */
2163 2161
    @Test
2164 2162
    public final void testParseAuthorsTaxonNameString() throws StringNotParsableException {
2165 2163
        INonViralName nvn = TaxonNameFactory.NewZoologicalInstance(null);
......
2844 2842
        Assert.assertNotNull("Nomenclatural reference should be an article and therefore have an in reference", ref.getInReference());
2845 2843
        Assert.assertEquals(ReferenceType.Journal, ref.getInReference().getType());
2846 2844

  
2847
        //PhytoKeys
2845
        //PhytoKeys #9550
2848 2846
        nameStr = "Pseudopodospermum baeticum (DC.) Zaika & al. in PhytoKeys 137: 68. 2020";
2849 2847
        name = parser.parseReferencedName(nameStr);
2850 2848
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
......
2853 2851
        Assert.assertEquals(ReferenceType.Journal, ref.getInReference().getType());
2854 2852
        Assert.assertEquals("PhytoKeys", ref.getInReference().getAbbrevTitle());
2855 2853

  
2856
//
2854
        //Adansonia #9014, #9551
2855
        nameStr = "Casearia annamensis (Gagnep.) Lescot & Sleumer in Adansonia, n.s., 10: 290. 1970";
2856
        name = parser.parseReferencedName(nameStr);
2857
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
2858
        ref = name.getNomenclaturalReference();
2859
        Assert.assertEquals(ReferenceType.Article, ref.getType());
2860
        Assert.assertNotNull("Nomenclatural reference should be an article and therefore have an in reference", ref.getInReference());
2861
        Assert.assertEquals(ReferenceType.Journal, ref.getInReference().getType());
2862
        Assert.assertEquals("Adansonia", ref.getInReference().getAbbrevTitle());
2863

  
2864
        //, Bot., sér. 4 #9014, #9551
2865
        nameStr = "Asteropeia amblyocarpa Tul. in Ann. Sci. Nat., Bot., sér. 4, 8: 81. 1857";
2866
        name = parser.parseReferencedName(nameStr);
2867
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
2868
        ref = name.getNomenclaturalReference();
2869
        Assert.assertEquals(ReferenceType.Article, ref.getType());
2870
        Assert.assertNotNull("Nomenclatural reference should be an article and therefore have an in reference", ref.getInReference());
2871
        Assert.assertEquals(ReferenceType.Journal, ref.getInReference().getType());
2872
        Assert.assertEquals("Ann. Sci. Nat., Bot.", ref.getInReference().getAbbrevTitle());
2873
        Assert.assertEquals("sér. 4", ref.getSeriesPart());
2874

  
2857 2875
    }
2858 2876

  
2859 2877
    @Test

Also available in: Unified diff

Add picture from clipboard (Maximum size: 40 MB)