Project

General

Profile

« Previous | Next » 

Revision dfabd5b6

Added by Andreas Müller almost 8 years ago

#5909 further improve referenced name parser

View differences:

cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/parser/NonViralNameParserImplRegExBase.java
63 63
   //years
64 64
    protected static String month = "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)";
65 65
    protected static String singleYear = "\\b" + "(?:17|18|19|20)" + "\\d{2}" + "\\b";                      // word boundary followed by either 17,18,19, or 20 (not captured) followed by 2 digits
66
    protected static String yearPhrase = singleYear + "("+ fWs + "-" + fWs + singleYear + ")?" ;
66
    protected static String correctYearPhrase = singleYear + "("+ fWs + "-" + fWs + singleYear + ")?" ;
67 67
    								//+ "(" + month + ")?)" ;                 // optional month
68
    //!! also used by TimePeriodParser
69
    public static String incorrectYearPhrase = "(\"" + correctYearPhrase + "\"|" + correctYearPhrase + "|"
70
            + UTF8.ENGLISH_QUOT_START + correctYearPhrase + UTF8.ENGLISH_QUOT_END + ")"
71
			+ fWs + "\\[" + singleYear + "\\]"  ;
72
    protected static String yearPhrase = "(" + correctYearPhrase + "|" + incorrectYearPhrase + ")";
68 73

  
69 74
    protected static String yearSeperator = "\\." + oWs;
70 75
    protected static String detailSeparator = ":" + oWs;
......
116 121
    protected static String fullAuthorString = "(" + fullBotanicAuthorString + "|" + fullZooAuthorString+ ")";
117 122

  
118 123
    //details
119
    //TODO still very simple
120

  
124
    //TODO still not all parsed
121 125

  
122 126
    protected static String nr2 = "\\d{1,2}";
123 127
    protected static String nr4 = "\\d{1,4}";
124 128
    protected static String nr5 = "\\d{1,5}";
125 129

  
126 130

  
127
    protected static String pPage = nr5 + "[a-z]?";
131
    protected static String pPage = nr5 + "[a-zA-Z]?";
128 132
    protected static String pStrNo = "n\u00B0" + fWs + "(" + nr4 + ")";
129 133

  
130 134
    protected static String pBracketNr = "\\[" + nr4 + "\\]";
131
    protected static String pFolBracket = "\\[fol\\." + fWs + "\\d{1,2}(-\\d{1,2})?\\]";
135
    protected static String pFolBracket = "\\[fol\\." + fWs + "\\d{1,2}(-\\d{1,2})?\\]";  //maybe merge with pTabFigPlate (see below)
136

  
132 137

  
133
    protected static String pStrTab = "[tT]((ab)?\\.|ab\\s)" + fWs + nr4 + "(" + fWs + "(B|\u00DF|\\(\\d{1,3}\\)))?";
134
    protected static String pFig = "[fF]((ig)?\\.|ig\\s)" + fWs + nr4 + "([a-zA-Z]([-\u2013,]\\s*[a-zA-Z])?)?";
135
    protected static String pFigs = pFig + "([-\u2013]" + nr4 + ")?";
136
    protected static String pPlate = "[pP]((l)?\\.|l\\s)" + fWs + nr4 + "([a-zA-Z]([-\u2013,]\\s*[a-zA-Z])?)?";
138
    protected static String pRangeSep = "[-\u2013]";
139
    protected static String pRangeSepCo = "[-\u2013,]";
137 140

  
141
    protected static String pTabFigPlateStart = "([tT](abs?)?|[fF](igs?)?|[pP]l?s?)(\\.|\\s|$)";   //$ for only 'f'
142
    protected static String pAbcNr = "[a-zA-Z\u00DF]";
143
    protected static String pTabFigPlateNumber = "(" + nr4 + "|" + pAbcNr + "|" + nr4 + fWs + pAbcNr + ")" + "("+ pRangeSepCo + fWs + pAbcNr + ")?";
144
    protected static String pTabFigPlateNumbers = "(" + pTabFigPlateNumber + "(" + pRangeSepCo + fWs + pTabFigPlateNumber + ")?)";
138 145

  
139
    //static String pTabFig = pStrTab + "(," + fWs + pFigs + ")?";
140
    protected static String pTabFigPl = "(" + pStrTab + "|" + pFigs + "|" +  pPlate + ")";
146
    protected static String pTabFigPlate = pTabFigPlateStart + fWs + pTabFigPlateNumbers + "?";
147
    protected static String pTabFigPl = pTabFigPlate;
141 148

  
142 149
    //e.g.: p455; p.455; pp455-456; pp.455-456; pp.455,456; 455, 456; pages 456-457; pages 456,567
143
    protected static String pSinglePages = "(p\\.?)?" + fWs + pPage + "(," + pTabFigPl +")?";
144
    protected static String pMultiPages = "(pp\\.?|pages)?" + fWs + pPage + fWs + "[-\u2013,]" +fWs + pPage ;
150
    protected static String pSinglePages = "(p\\.?)?" + fWs + pPage + "(," + pTabFigPl +"){0,2}";
151
    protected static String pMultiPages = "(pp\\.?|pages)?" + fWs + pPage + fWs + pRangeSepCo +fWs + pPage ;
145 152
    //static String pPages = pPage + "(," + fWs + "(" + pPage + "|" + pTabFig + ")" + ")?";
146 153
    protected static String pPages = "(" + pSinglePages +"|" + pMultiPages +")";
147
    protected static String pPagesTabFig = pPages +"[,\\.]" + fWs + pTabFigPl;
154
    protected static String pPagesTabFig = pPages +"([,\\.]" + fWs + pTabFigPl + "){1,2}";
148 155

  
149 156

  
150 157

  
......
171 178
//    romNr = "(?=[MDCLXVImdclxvi])(((" & romM & ")?" & romHun & ")?" & romTen & ")?" & romOne
172 179
    protected static String pRomNr = "ljfweffaflas"; //TODO rom number have to be tested first
173 180

  
181
//    "(,\\s*" + pTabFigPl + ")?" +
174 182
    protected static String pDetailAlternatives = "(" + pPages + "|" + pPageSpecial + "|" + pStrNo + "|" + pBracketNr +
175
    			"|" + pTabFigPl + "|" + pTabSpecial + "|" + pFolBracket + "|" + pCouv + "|" + pRomNr + "|" +
183
    			"|" + pTabFigPl + "(,\\s*" + pTabFigPl + ")?" + "|" + pTabSpecial + "|" + pFolBracket + "|" + pCouv + "|" + pRomNr + "|" +
176 184
    			pSpecialGardDict + "|" + pSpecialDetail + "|" + pPagesTabFig + ")";
177 185

  
178 186
    protected static String detail = pDetailAlternatives;
179 187

  
180 188
    //reference
181
    protected static String volume = nr4 + "[a-z]?" + fWs + "(\\("+ nr4  + "([-\u2013]" + nr4 + ")?\\))?" + "(\\((Suppl|Beibl)\\.\\))?";
189
    protected static String volume = nr4 + "[a-z]?" + fWs + "(\\("+ nr4  + "([-\u2013]" + nr4 + ")?\\))?" + "(\\((Suppl|Beibl|App|Beil|Misc)\\.\\))?";
182 190
    //this line caused problem https://dev.e-taxonomy.eu/trac/ticket/1556 in its original form: "([\u005E:\\.]" + fWs + ")";
183 191
    protected static String anySepChar = "([\u005E:a-zA-Z]" + fWs + ")"; //all characters except for the detail separator, a stricter version would be [,\\-\\&] and some other characters
184 192
//  protected static String anySepChar = "([,\\-\\&\\.\\+\\']" + fWs + ")";
185 193

  
186 194
    protected static int authorSeparatorMaxPosition = 4;  //Author may have a maximum of 4 words
187
    protected static String pTitleWordSeparator = "(\\."+ fWs+"|" + oWs + ")";
195
    protected static String pTitleWordSeparator = "(\\."+ fWs+"|" + oWs + "|\\.?[-\u2013])";
188 196
    protected static String pSeriesPart = ",?" + fWs + "[sS]er(\\.)?" + oWs + "\\d{1,2},?";
189 197

  
190 198
    protected static String referenceTitleFirstPart = "(" + apostrophWord + pTitleWordSeparator + "|" + twoCapitalDotWord + fWs + ")";
cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/parser/TimePeriodParser.java
22 22
import org.joda.time.Partial;
23 23

  
24 24
import eu.etaxonomy.cdm.common.CdmUtils;
25
import eu.etaxonomy.cdm.common.UTF8;
26 25
import eu.etaxonomy.cdm.model.common.TimePeriod;
27 26

  
28 27
/**
......
36 35
	//pattern for the first year in the string
37 36
	private static final Pattern firstYearPattern =  Pattern.compile("\\d{4}");
38 37
	//case "1806"[1807];
39
	private static final Pattern uncorrectYearPatter =  Pattern.compile("[\""+UTF8.ENGLISH_QUOT_START+"]\\d{4}[\""+UTF8.ENGLISH_QUOT_END+"]\\s*\\[\\d{4}\\]");
38
	private static final Pattern uncorrectYearPatter = Pattern.compile(NonViralNameParserImplRegExBase.incorrectYearPhrase);
39
//OLD	        Pattern.compile("[\""+UTF8.ENGLISH_QUOT_START+"]\\d{4}[\""+UTF8.ENGLISH_QUOT_END+"]\\s*\\[\\d{4}\\]");
40

  
40 41
	//case fl. 1806 or c. 1806 or fl. 1806?
41 42
	private static final Pattern prefixedYearPattern =  Pattern.compile("(fl|c)\\.\\s*\\d{4}(\\s*-\\s*\\d{4})?\\??");
42 43
	//standard
......
62 63
		periodString = periodString.trim();
63 64

  
64 65
		result.setFreeText(null);
65
		Date date;
66 66

  
67 67
		//case "1806"[1807];
68 68
		if (uncorrectYearPatter.matcher(periodString).matches()){
cdmlib-model/src/test/java/eu/etaxonomy/cdm/strategy/parser/NonViralNameParserImplTest.java
1750 1750
        assertEquals("19(4)", nomRef.getVolume());
1751 1751
        assertEquals("901" + UTF8.EN_DASH + "911, f. 1" + UTF8.EN_DASH + "2", name.getNomenclaturalMicroReference());
1752 1752

  
1753
        //detail with figs
1754
        name = parser.parseReferencedName("Randia sonorensis Wiggins"
1755
                + " in Contr. Dudley Herb. 3: 75, figs 4-6. 1940");
1756
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1757
        combinationAuthor = name.getCombinationAuthorship();
1758
        assertEquals( "Wiggins", combinationAuthor.getNomenclaturalTitle());
1759
        nomRef = (Reference)name.getNomenclaturalReference();
1760
        assertEquals(ReferenceType.Article, nomRef.getType());
1761
        assertEquals("3", nomRef.getVolume());
1762
        assertEquals("75, figs 4-6", name.getNomenclaturalMicroReference());
1763

  
1764
        //detail with pl. and figs
1765
        name = parser.parseReferencedName("Randia sonorensis Wiggins"
1766
                + " in Contr. Dudley Herb. 3: 75, pl. 19, figs 4-6. 1940");
1767
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1768
        combinationAuthor = name.getCombinationAuthorship();
1769
        assertEquals( "Wiggins", combinationAuthor.getNomenclaturalTitle());
1770
        nomRef = (Reference)name.getNomenclaturalReference();
1771
        assertEquals(ReferenceType.Article, nomRef.getType());
1772
        assertEquals("3", nomRef.getVolume());
1773
        assertEquals("75, pl. 19, figs 4-6", name.getNomenclaturalMicroReference());
1774

  
1775

  
1753 1776
        //pl
1754 1777
        name = parser.parseReferencedName("Carapichea  Aubl."
1755 1778
                + " in Hist. Pl. Guiane 1: 167, pl. 64. 1775");
......
1772 1795
        assertEquals("4", nomRef.getVolume());
1773 1796
        assertEquals("121. fig. 2a, b", name.getNomenclaturalMicroReference());
1774 1797

  
1798
        //detail with , to number
1799
        name = parser.parseReferencedName("Deppea martinez-calderonii Lorence"
1800
                + " in Allertonia 4: 399. figs 1e, 2. 1988");
1801
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1802
        combinationAuthor = name.getCombinationAuthorship();
1803
        assertEquals( "Lorence", combinationAuthor.getNomenclaturalTitle());
1804
        nomRef = (Reference)name.getNomenclaturalReference();
1805
        assertEquals(ReferenceType.Article, nomRef.getType());
1806
        assertEquals("4", nomRef.getVolume());
1807
        assertEquals("399. figs 1e, 2", name.getNomenclaturalMicroReference());
1808

  
1775 1809
        //(Suppl.)
1776 1810
        name = parser.parseReferencedName("Manettia costaricensis  Wernham"
1777 1811
                + " in J. Bot. 57(Suppl.): 38. 1919");
......
1805 1839
        assertEquals("3", nomRef.getVolume());
1806 1840
        assertEquals("219", name.getNomenclaturalMicroReference());
1807 1841

  
1808
        //
1809
        //(Hannover) place published
1842
        // place published e.g. (Hannover)
1810 1843
        name = parser.parseReferencedName("Pittoniotis trichantha Griseb."
1811 1844
                  + " in Bonplandia (Hannover) 6 (1): 8. 1858");
1812 1845
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
......
1816 1849
        assertEquals(ReferenceType.Article, nomRef.getType());
1817 1850
        assertEquals("6 (1)", nomRef.getVolume());
1818 1851
        assertEquals("8", name.getNomenclaturalMicroReference());
1852

  
1853
        //complex / incorrect year without quotation marks
1854
        name = parser.parseReferencedName("Javorkaea Borhidi & Jarai-Koml."
1855
                + " in Acta Bot. Hung. 29(1\u20134): 16, f. 1\u20132, t. 1-8. 1983 [1984]");
1856
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1857
        combinationAuthor = name.getCombinationAuthorship();
1858
        assertEquals( "Borhidi & Jarai-Koml.", combinationAuthor.getNomenclaturalTitle());
1859
        nomRef = (Reference)name.getNomenclaturalReference();
1860
        assertEquals(ReferenceType.Article, nomRef.getType());
1861
        assertEquals("29(1\u20134)", nomRef.getVolume());
1862
        assertEquals("16, f. 1\u20132, t. 1-8", name.getNomenclaturalMicroReference());
1863
        assertEquals("1983 [1984]", nomRef.getDatePublishedString());
1864
        assertEquals("1984", nomRef.getYear());
1865

  
1866
        //incorrect year with \u201e \u201f  (see eu.etaxonomy.cdm.common.UTF8.ENGLISH_QUOT_START)
1867
        name = parser.parseReferencedName("Javorkaea Borhidi & Jarai-Koml."
1868
                + " in Acta Bot. Hung. 29(1-4): 16, f. 1-2. \u201e1983\u201f [1984]");
1869
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1870
        combinationAuthor = name.getCombinationAuthorship();
1871
        assertEquals( "Borhidi & Jarai-Koml.", combinationAuthor.getNomenclaturalTitle());
1872
        nomRef = (Reference)name.getNomenclaturalReference();
1873
        assertEquals(ReferenceType.Article, nomRef.getType());
1874
        assertEquals("29(1-4)", nomRef.getVolume());
1875
        assertEquals("16, f. 1-2", name.getNomenclaturalMicroReference());
1876
        assertEquals("\u201e1983\u201f [1984]", nomRef.getDatePublishedString());
1877
        assertEquals("1984", nomRef.getYear());
1878

  
1879
        //incorrect year with "
1880
        name = parser.parseReferencedName("Javorkaea Borhidi & Jarai-Koml."
1881
                + " in Acta Bot. Hung. 29(1-4): 16, f. 1-2. \"1983\" [1984]");
1882
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1883
        combinationAuthor = name.getCombinationAuthorship();
1884
        assertEquals( "Borhidi & Jarai-Koml.", combinationAuthor.getNomenclaturalTitle());
1885
        nomRef = (Reference)name.getNomenclaturalReference();
1886
        assertEquals(ReferenceType.Article, nomRef.getType());
1887
        assertEquals("29(1-4)", nomRef.getVolume());
1888
        assertEquals("16, f. 1-2", name.getNomenclaturalMicroReference());
1889
        assertEquals("\"1983\" [1984]", nomRef.getDatePublishedString());
1890
        assertEquals("1984", nomRef.getYear());
1891

  
1892
        //fig. a
1893
        name = parser.parseReferencedName("Psychotria capitata  Ruiz & Pav."
1894
                + " in Fl. Peruv. 2: 59, pl. 206, fig. a. 1799");
1895
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1896
        combinationAuthor = name.getCombinationAuthorship();
1897
        assertEquals( "Ruiz & Pav.", combinationAuthor.getNomenclaturalTitle());
1898
        nomRef = (Reference)name.getNomenclaturalReference();
1899
        assertEquals(ReferenceType.Article, nomRef.getType());
1900
        assertEquals("2", nomRef.getVolume());
1901
        assertEquals("59, pl. 206, fig. a", name.getNomenclaturalMicroReference());
1902

  
1903
        //442A.
1904
        name = parser.parseReferencedName("Rogiera elegans Planch."
1905
                + " in Fl. Serres Jard. Eur. 5: 442A. 1849");
1906
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1907
        combinationAuthor = name.getCombinationAuthorship();
1908
        assertEquals( "Planch.", combinationAuthor.getNomenclaturalTitle());
1909
        nomRef = (Reference)name.getNomenclaturalReference();
1910
        assertEquals(ReferenceType.Article, nomRef.getType());
1911
        assertEquals("5", nomRef.getVolume());
1912
        assertEquals("442A", name.getNomenclaturalMicroReference());
1913

  
1914
        //f
1915
        name = parser.parseReferencedName("Coussarea imitans L.O. Williams"
1916
                + " in Phytologia 26 (6): 488–489, f. 1973");
1917
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1918
        combinationAuthor = name.getCombinationAuthorship();
1919
        assertEquals( "L.O. Williams", combinationAuthor.getNomenclaturalTitle());
1920
        nomRef = (Reference)name.getNomenclaturalReference();
1921
        assertEquals(ReferenceType.Article, nomRef.getType());
1922
        assertEquals("26 (6)", nomRef.getVolume());
1923
        assertEquals("488–489, f", name.getNomenclaturalMicroReference());
1924

  
1925
        //Phys.-Med.
1926
        name = parser.parseReferencedName("Coccocypselum cordifolium Nees & Mart."
1927
                + " in Nova Acta Phys.-Med. Acad. Caes.\u2013Leop. Nat. Cur. 12: 14. 1824");
1928
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1929
        combinationAuthor = name.getCombinationAuthorship();
1930
        assertEquals( "Nees & Mart.", combinationAuthor.getNomenclaturalTitle());
1931
        nomRef = (Reference)name.getNomenclaturalReference();
1932
        assertEquals(ReferenceType.Article, nomRef.getType());
1933
        assertEquals("Nova Acta Phys.-Med. Acad. Caes.\u2013Leop. Nat. Cur.", nomRef.getInReference().getTitle());
1934
        assertEquals("12", nomRef.getVolume());
1935
        assertEquals("14", name.getNomenclaturalMicroReference());
1936
        assertEquals("1824", nomRef.getYear());
1937

  
1938
        //(ed. 10)  wanted?
1939
//        Syst. Nat. (ed. 10) 2: 930. 1759
1940
//        name = parser.parseReferencedName("Erithalis fruticosa L."
1941
//                + ", Syst. Nat. ed. 10, 2: 930. 1759");
1942
//        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1943
//        combinationAuthor = name.getCombinationAuthorship();
1944
//        assertEquals( "L.", combinationAuthor.getNomenclaturalTitle());
1945
//        nomRef = (Reference)name.getNomenclaturalReference();
1946
//        assertEquals(ReferenceType.Book, nomRef.getType());
1947
//        assertEquals("2", nomRef.getVolume());
1948
//        assertEquals("10", nomRef.getEdition());
1949
//        assertEquals("930", name.getNomenclaturalMicroReference());
1950
//        assertEquals("1759", nomRef.getYear());
1951

  
1819 1952
 }
1820 1953

  
1821 1954
}

Also available in: Unified diff