Revision dfabd5b6
Added by Andreas Müller almost 8 years ago
cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/parser/NonViralNameParserImplRegExBase.java | ||
---|---|---|
63 | 63 |
//years |
64 | 64 |
protected static String month = "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"; |
65 | 65 |
protected static String singleYear = "\\b" + "(?:17|18|19|20)" + "\\d{2}" + "\\b"; // word boundary followed by either 17,18,19, or 20 (not captured) followed by 2 digits |
66 |
protected static String yearPhrase = singleYear + "("+ fWs + "-" + fWs + singleYear + ")?" ;
|
|
66 |
protected static String correctYearPhrase = singleYear + "("+ fWs + "-" + fWs + singleYear + ")?" ;
|
|
67 | 67 |
//+ "(" + month + ")?)" ; // optional month |
68 |
//!! also used by TimePeriodParser |
|
69 |
public static String incorrectYearPhrase = "(\"" + correctYearPhrase + "\"|" + correctYearPhrase + "|" |
|
70 |
+ UTF8.ENGLISH_QUOT_START + correctYearPhrase + UTF8.ENGLISH_QUOT_END + ")" |
|
71 |
+ fWs + "\\[" + singleYear + "\\]" ; |
|
72 |
protected static String yearPhrase = "(" + correctYearPhrase + "|" + incorrectYearPhrase + ")"; |
|
68 | 73 |
|
69 | 74 |
protected static String yearSeperator = "\\." + oWs; |
70 | 75 |
protected static String detailSeparator = ":" + oWs; |
... | ... | |
116 | 121 |
protected static String fullAuthorString = "(" + fullBotanicAuthorString + "|" + fullZooAuthorString+ ")"; |
117 | 122 |
|
118 | 123 |
//details |
119 |
//TODO still very simple |
|
120 |
|
|
124 |
//TODO still not all parsed |
|
121 | 125 |
|
122 | 126 |
protected static String nr2 = "\\d{1,2}"; |
123 | 127 |
protected static String nr4 = "\\d{1,4}"; |
124 | 128 |
protected static String nr5 = "\\d{1,5}"; |
125 | 129 |
|
126 | 130 |
|
127 |
protected static String pPage = nr5 + "[a-z]?"; |
|
131 |
protected static String pPage = nr5 + "[a-zA-Z]?";
|
|
128 | 132 |
protected static String pStrNo = "n\u00B0" + fWs + "(" + nr4 + ")"; |
129 | 133 |
|
130 | 134 |
protected static String pBracketNr = "\\[" + nr4 + "\\]"; |
131 |
protected static String pFolBracket = "\\[fol\\." + fWs + "\\d{1,2}(-\\d{1,2})?\\]"; |
|
135 |
protected static String pFolBracket = "\\[fol\\." + fWs + "\\d{1,2}(-\\d{1,2})?\\]"; //maybe merge with pTabFigPlate (see below) |
|
136 |
|
|
132 | 137 |
|
133 |
protected static String pStrTab = "[tT]((ab)?\\.|ab\\s)" + fWs + nr4 + "(" + fWs + "(B|\u00DF|\\(\\d{1,3}\\)))?"; |
|
134 |
protected static String pFig = "[fF]((ig)?\\.|ig\\s)" + fWs + nr4 + "([a-zA-Z]([-\u2013,]\\s*[a-zA-Z])?)?"; |
|
135 |
protected static String pFigs = pFig + "([-\u2013]" + nr4 + ")?"; |
|
136 |
protected static String pPlate = "[pP]((l)?\\.|l\\s)" + fWs + nr4 + "([a-zA-Z]([-\u2013,]\\s*[a-zA-Z])?)?"; |
|
138 |
protected static String pRangeSep = "[-\u2013]"; |
|
139 |
protected static String pRangeSepCo = "[-\u2013,]"; |
|
137 | 140 |
|
141 |
protected static String pTabFigPlateStart = "([tT](abs?)?|[fF](igs?)?|[pP]l?s?)(\\.|\\s|$)"; //$ for only 'f' |
|
142 |
protected static String pAbcNr = "[a-zA-Z\u00DF]"; |
|
143 |
protected static String pTabFigPlateNumber = "(" + nr4 + "|" + pAbcNr + "|" + nr4 + fWs + pAbcNr + ")" + "("+ pRangeSepCo + fWs + pAbcNr + ")?"; |
|
144 |
protected static String pTabFigPlateNumbers = "(" + pTabFigPlateNumber + "(" + pRangeSepCo + fWs + pTabFigPlateNumber + ")?)"; |
|
138 | 145 |
|
139 |
//static String pTabFig = pStrTab + "(," + fWs + pFigs + ")?";
|
|
140 |
protected static String pTabFigPl = "(" + pStrTab + "|" + pFigs + "|" + pPlate + ")";
|
|
146 |
protected static String pTabFigPlate = pTabFigPlateStart + fWs + pTabFigPlateNumbers + "?";
|
|
147 |
protected static String pTabFigPl = pTabFigPlate;
|
|
141 | 148 |
|
142 | 149 |
//e.g.: p455; p.455; pp455-456; pp.455-456; pp.455,456; 455, 456; pages 456-457; pages 456,567 |
143 |
protected static String pSinglePages = "(p\\.?)?" + fWs + pPage + "(," + pTabFigPl +")?";
|
|
144 |
protected static String pMultiPages = "(pp\\.?|pages)?" + fWs + pPage + fWs + "[-\u2013,]" +fWs + pPage ;
|
|
150 |
protected static String pSinglePages = "(p\\.?)?" + fWs + pPage + "(," + pTabFigPl +"){0,2}";
|
|
151 |
protected static String pMultiPages = "(pp\\.?|pages)?" + fWs + pPage + fWs + pRangeSepCo +fWs + pPage ;
|
|
145 | 152 |
//static String pPages = pPage + "(," + fWs + "(" + pPage + "|" + pTabFig + ")" + ")?"; |
146 | 153 |
protected static String pPages = "(" + pSinglePages +"|" + pMultiPages +")"; |
147 |
protected static String pPagesTabFig = pPages +"[,\\.]" + fWs + pTabFigPl;
|
|
154 |
protected static String pPagesTabFig = pPages +"([,\\.]" + fWs + pTabFigPl + "){1,2}";
|
|
148 | 155 |
|
149 | 156 |
|
150 | 157 |
|
... | ... | |
171 | 178 |
// romNr = "(?=[MDCLXVImdclxvi])(((" & romM & ")?" & romHun & ")?" & romTen & ")?" & romOne |
172 | 179 |
protected static String pRomNr = "ljfweffaflas"; //TODO Roman numerals have to be tested first |
173 | 180 |
|
181 |
// "(,\\s*" + pTabFigPl + ")?" + |
|
174 | 182 |
protected static String pDetailAlternatives = "(" + pPages + "|" + pPageSpecial + "|" + pStrNo + "|" + pBracketNr + |
175 |
"|" + pTabFigPl + "|" + pTabSpecial + "|" + pFolBracket + "|" + pCouv + "|" + pRomNr + "|" + |
|
183 |
"|" + pTabFigPl + "(,\\s*" + pTabFigPl + ")?" + "|" + pTabSpecial + "|" + pFolBracket + "|" + pCouv + "|" + pRomNr + "|" +
|
|
176 | 184 |
pSpecialGardDict + "|" + pSpecialDetail + "|" + pPagesTabFig + ")"; |
177 | 185 |
|
178 | 186 |
protected static String detail = pDetailAlternatives; |
179 | 187 |
|
180 | 188 |
//reference |
181 |
protected static String volume = nr4 + "[a-z]?" + fWs + "(\\("+ nr4 + "([-\u2013]" + nr4 + ")?\\))?" + "(\\((Suppl|Beibl)\\.\\))?"; |
|
189 |
protected static String volume = nr4 + "[a-z]?" + fWs + "(\\("+ nr4 + "([-\u2013]" + nr4 + ")?\\))?" + "(\\((Suppl|Beibl|App|Beil|Misc)\\.\\))?";
|
|
182 | 190 |
//this line caused problem https://dev.e-taxonomy.eu/trac/ticket/1556 in its original form: "([\u005E:\\.]" + fWs + ")"; |
183 | 191 |
protected static String anySepChar = "([\u005E:a-zA-Z]" + fWs + ")"; //all characters except for the detail separator, a stricter version would be [,\\-\\&] and some other characters |
184 | 192 |
// protected static String anySepChar = "([,\\-\\&\\.\\+\\']" + fWs + ")"; |
185 | 193 |
|
186 | 194 |
protected static int authorSeparatorMaxPosition = 4; //Author may have a maximum of 4 words |
187 |
protected static String pTitleWordSeparator = "(\\."+ fWs+"|" + oWs + ")"; |
|
195 |
protected static String pTitleWordSeparator = "(\\."+ fWs+"|" + oWs + "|\\.?[-\u2013])";
|
|
188 | 196 |
protected static String pSeriesPart = ",?" + fWs + "[sS]er(\\.)?" + oWs + "\\d{1,2},?"; |
189 | 197 |
|
190 | 198 |
protected static String referenceTitleFirstPart = "(" + apostrophWord + pTitleWordSeparator + "|" + twoCapitalDotWord + fWs + ")"; |
cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/parser/TimePeriodParser.java | ||
---|---|---|
22 | 22 |
import org.joda.time.Partial; |
23 | 23 |
|
24 | 24 |
import eu.etaxonomy.cdm.common.CdmUtils; |
25 |
import eu.etaxonomy.cdm.common.UTF8; |
|
26 | 25 |
import eu.etaxonomy.cdm.model.common.TimePeriod; |
27 | 26 |
|
28 | 27 |
/** |
... | ... | |
36 | 35 |
//pattern for first year in string; |
37 | 36 |
private static final Pattern firstYearPattern = Pattern.compile("\\d{4}"); |
38 | 37 |
//case "1806"[1807]; |
39 |
private static final Pattern uncorrectYearPatter = Pattern.compile("[\""+UTF8.ENGLISH_QUOT_START+"]\\d{4}[\""+UTF8.ENGLISH_QUOT_END+"]\\s*\\[\\d{4}\\]"); |
|
38 |
private static final Pattern uncorrectYearPatter = Pattern.compile(NonViralNameParserImplRegExBase.incorrectYearPhrase); |
|
39 |
//OLD Pattern.compile("[\""+UTF8.ENGLISH_QUOT_START+"]\\d{4}[\""+UTF8.ENGLISH_QUOT_END+"]\\s*\\[\\d{4}\\]"); |
|
40 |
|
|
40 | 41 |
//case fl. 1806 or c. 1806 or fl. 1806? |
41 | 42 |
private static final Pattern prefixedYearPattern = Pattern.compile("(fl|c)\\.\\s*\\d{4}(\\s*-\\s*\\d{4})?\\??"); |
42 | 43 |
//standard |
... | ... | |
62 | 63 |
periodString = periodString.trim(); |
63 | 64 |
|
64 | 65 |
result.setFreeText(null); |
65 |
Date date; |
|
66 | 66 |
|
67 | 67 |
//case "1806"[1807]; |
68 | 68 |
if (uncorrectYearPatter.matcher(periodString).matches()){ |
cdmlib-model/src/test/java/eu/etaxonomy/cdm/strategy/parser/NonViralNameParserImplTest.java | ||
---|---|---|
1750 | 1750 |
assertEquals("19(4)", nomRef.getVolume()); |
1751 | 1751 |
assertEquals("901" + UTF8.EN_DASH + "911, f. 1" + UTF8.EN_DASH + "2", name.getNomenclaturalMicroReference()); |
1752 | 1752 |
|
1753 |
//detail with figs |
|
1754 |
name = parser.parseReferencedName("Randia sonorensis Wiggins" |
|
1755 |
+ " in Contr. Dudley Herb. 3: 75, figs 4-6. 1940"); |
|
1756 |
Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
|
1757 |
combinationAuthor = name.getCombinationAuthorship(); |
|
1758 |
assertEquals( "Wiggins", combinationAuthor.getNomenclaturalTitle()); |
|
1759 |
nomRef = (Reference)name.getNomenclaturalReference(); |
|
1760 |
assertEquals(ReferenceType.Article, nomRef.getType()); |
|
1761 |
assertEquals("3", nomRef.getVolume()); |
|
1762 |
assertEquals("75, figs 4-6", name.getNomenclaturalMicroReference()); |
|
1763 |
|
|
1764 |
//detail with pl. and figs |
|
1765 |
name = parser.parseReferencedName("Randia sonorensis Wiggins" |
|
1766 |
+ " in Contr. Dudley Herb. 3: 75, pl. 19, figs 4-6. 1940"); |
|
1767 |
Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
|
1768 |
combinationAuthor = name.getCombinationAuthorship(); |
|
1769 |
assertEquals( "Wiggins", combinationAuthor.getNomenclaturalTitle()); |
|
1770 |
nomRef = (Reference)name.getNomenclaturalReference(); |
|
1771 |
assertEquals(ReferenceType.Article, nomRef.getType()); |
|
1772 |
assertEquals("3", nomRef.getVolume()); |
|
1773 |
assertEquals("75, pl. 19, figs 4-6", name.getNomenclaturalMicroReference()); |
|
1774 |
|
|
1775 |
|
|
1753 | 1776 |
//pl |
1754 | 1777 |
name = parser.parseReferencedName("Carapichea Aubl." |
1755 | 1778 |
+ " in Hist. Pl. Guiane 1: 167, pl. 64. 1775"); |
... | ... | |
1772 | 1795 |
assertEquals("4", nomRef.getVolume()); |
1773 | 1796 |
assertEquals("121. fig. 2a, b", name.getNomenclaturalMicroReference()); |
1774 | 1797 |
|
1798 |
//detail with comma-separated figure numbers (e.g. "figs 1e, 2") |
|
1799 |
name = parser.parseReferencedName("Deppea martinez-calderonii Lorence" |
|
1800 |
+ " in Allertonia 4: 399. figs 1e, 2. 1988"); |
|
1801 |
Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
|
1802 |
combinationAuthor = name.getCombinationAuthorship(); |
|
1803 |
assertEquals( "Lorence", combinationAuthor.getNomenclaturalTitle()); |
|
1804 |
nomRef = (Reference)name.getNomenclaturalReference(); |
|
1805 |
assertEquals(ReferenceType.Article, nomRef.getType()); |
|
1806 |
assertEquals("4", nomRef.getVolume()); |
|
1807 |
assertEquals("399. figs 1e, 2", name.getNomenclaturalMicroReference()); |
|
1808 |
|
|
1775 | 1809 |
//(Suppl.) |
1776 | 1810 |
name = parser.parseReferencedName("Manettia costaricensis Wernham" |
1777 | 1811 |
+ " in J. Bot. 57(Suppl.): 38. 1919"); |
... | ... | |
1805 | 1839 |
assertEquals("3", nomRef.getVolume()); |
1806 | 1840 |
assertEquals("219", name.getNomenclaturalMicroReference()); |
1807 | 1841 |
|
1808 |
// |
|
1809 |
//(Hannover) place published |
|
1842 |
// place published e.g. (Hannover) |
|
1810 | 1843 |
name = parser.parseReferencedName("Pittoniotis trichantha Griseb." |
1811 | 1844 |
+ " in Bonplandia (Hannover) 6 (1): 8. 1858"); |
1812 | 1845 |
Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
... | ... | |
1816 | 1849 |
assertEquals(ReferenceType.Article, nomRef.getType()); |
1817 | 1850 |
assertEquals("6 (1)", nomRef.getVolume()); |
1818 | 1851 |
assertEquals("8", name.getNomenclaturalMicroReference()); |
1852 |
|
|
1853 |
//complex / incorrect year without quotation marks |
|
1854 |
name = parser.parseReferencedName("Javorkaea Borhidi & Jarai-Koml." |
|
1855 |
+ " in Acta Bot. Hung. 29(1\u20134): 16, f. 1\u20132, t. 1-8. 1983 [1984]"); |
|
1856 |
Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
|
1857 |
combinationAuthor = name.getCombinationAuthorship(); |
|
1858 |
assertEquals( "Borhidi & Jarai-Koml.", combinationAuthor.getNomenclaturalTitle()); |
|
1859 |
nomRef = (Reference)name.getNomenclaturalReference(); |
|
1860 |
assertEquals(ReferenceType.Article, nomRef.getType()); |
|
1861 |
assertEquals("29(1\u20134)", nomRef.getVolume()); |
|
1862 |
assertEquals("16, f. 1\u20132, t. 1-8", name.getNomenclaturalMicroReference()); |
|
1863 |
assertEquals("1983 [1984]", nomRef.getDatePublishedString()); |
|
1864 |
assertEquals("1984", nomRef.getYear()); |
|
1865 |
|
|
1866 |
//incorrect year with \u201e \u201f (see eu.etaxonomy.cdm.common.UTF8.ENGLISH_QUOT_START) |
|
1867 |
name = parser.parseReferencedName("Javorkaea Borhidi & Jarai-Koml." |
|
1868 |
+ " in Acta Bot. Hung. 29(1-4): 16, f. 1-2. \u201e1983\u201f [1984]"); |
|
1869 |
Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
|
1870 |
combinationAuthor = name.getCombinationAuthorship(); |
|
1871 |
assertEquals( "Borhidi & Jarai-Koml.", combinationAuthor.getNomenclaturalTitle()); |
|
1872 |
nomRef = (Reference)name.getNomenclaturalReference(); |
|
1873 |
assertEquals(ReferenceType.Article, nomRef.getType()); |
|
1874 |
assertEquals("29(1-4)", nomRef.getVolume()); |
|
1875 |
assertEquals("16, f. 1-2", name.getNomenclaturalMicroReference()); |
|
1876 |
assertEquals("\u201e1983\u201f [1984]", nomRef.getDatePublishedString()); |
|
1877 |
assertEquals("1984", nomRef.getYear()); |
|
1878 |
|
|
1879 |
//incorrect year with " |
|
1880 |
name = parser.parseReferencedName("Javorkaea Borhidi & Jarai-Koml." |
|
1881 |
+ " in Acta Bot. Hung. 29(1-4): 16, f. 1-2. \"1983\" [1984]"); |
|
1882 |
Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
|
1883 |
combinationAuthor = name.getCombinationAuthorship(); |
|
1884 |
assertEquals( "Borhidi & Jarai-Koml.", combinationAuthor.getNomenclaturalTitle()); |
|
1885 |
nomRef = (Reference)name.getNomenclaturalReference(); |
|
1886 |
assertEquals(ReferenceType.Article, nomRef.getType()); |
|
1887 |
assertEquals("29(1-4)", nomRef.getVolume()); |
|
1888 |
assertEquals("16, f. 1-2", name.getNomenclaturalMicroReference()); |
|
1889 |
assertEquals("\"1983\" [1984]", nomRef.getDatePublishedString()); |
|
1890 |
assertEquals("1984", nomRef.getYear()); |
|
1891 |
|
|
1892 |
//fig. a |
|
1893 |
name = parser.parseReferencedName("Psychotria capitata Ruiz & Pav." |
|
1894 |
+ " in Fl. Peruv. 2: 59, pl. 206, fig. a. 1799"); |
|
1895 |
Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
|
1896 |
combinationAuthor = name.getCombinationAuthorship(); |
|
1897 |
assertEquals( "Ruiz & Pav.", combinationAuthor.getNomenclaturalTitle()); |
|
1898 |
nomRef = (Reference)name.getNomenclaturalReference(); |
|
1899 |
assertEquals(ReferenceType.Article, nomRef.getType()); |
|
1900 |
assertEquals("2", nomRef.getVolume()); |
|
1901 |
assertEquals("59, pl. 206, fig. a", name.getNomenclaturalMicroReference()); |
|
1902 |
|
|
1903 |
//442A. |
|
1904 |
name = parser.parseReferencedName("Rogiera elegans Planch." |
|
1905 |
+ " in Fl. Serres Jard. Eur. 5: 442A. 1849"); |
|
1906 |
Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
|
1907 |
combinationAuthor = name.getCombinationAuthorship(); |
|
1908 |
assertEquals( "Planch.", combinationAuthor.getNomenclaturalTitle()); |
|
1909 |
nomRef = (Reference)name.getNomenclaturalReference(); |
|
1910 |
assertEquals(ReferenceType.Article, nomRef.getType()); |
|
1911 |
assertEquals("5", nomRef.getVolume()); |
|
1912 |
assertEquals("442A", name.getNomenclaturalMicroReference()); |
|
1913 |
|
|
1914 |
//f |
|
1915 |
name = parser.parseReferencedName("Coussarea imitans L.O. Williams" |
|
1916 |
+ " in Phytologia 26 (6): 488–489, f. 1973"); |
|
1917 |
Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
|
1918 |
combinationAuthor = name.getCombinationAuthorship(); |
|
1919 |
assertEquals( "L.O. Williams", combinationAuthor.getNomenclaturalTitle()); |
|
1920 |
nomRef = (Reference)name.getNomenclaturalReference(); |
|
1921 |
assertEquals(ReferenceType.Article, nomRef.getType()); |
|
1922 |
assertEquals("26 (6)", nomRef.getVolume()); |
|
1923 |
assertEquals("488–489, f", name.getNomenclaturalMicroReference()); |
|
1924 |
|
|
1925 |
//Phys.-Med. |
|
1926 |
name = parser.parseReferencedName("Coccocypselum cordifolium Nees & Mart." |
|
1927 |
+ " in Nova Acta Phys.-Med. Acad. Caes.\u2013Leop. Nat. Cur. 12: 14. 1824"); |
|
1928 |
Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
|
1929 |
combinationAuthor = name.getCombinationAuthorship(); |
|
1930 |
assertEquals( "Nees & Mart.", combinationAuthor.getNomenclaturalTitle()); |
|
1931 |
nomRef = (Reference)name.getNomenclaturalReference(); |
|
1932 |
assertEquals(ReferenceType.Article, nomRef.getType()); |
|
1933 |
assertEquals("Nova Acta Phys.-Med. Acad. Caes.\u2013Leop. Nat. Cur.", nomRef.getInReference().getTitle()); |
|
1934 |
assertEquals("12", nomRef.getVolume()); |
|
1935 |
assertEquals("14", name.getNomenclaturalMicroReference()); |
|
1936 |
assertEquals("1824", nomRef.getYear()); |
|
1937 |
|
|
1938 |
//(ed. 10) wanted? |
|
1939 |
// Syst. Nat. (ed. 10) 2: 930. 1759 |
|
1940 |
// name = parser.parseReferencedName("Erithalis fruticosa L." |
|
1941 |
// + ", Syst. Nat. ed. 10, 2: 930. 1759"); |
|
1942 |
// Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache()); |
|
1943 |
// combinationAuthor = name.getCombinationAuthorship(); |
|
1944 |
// assertEquals( "L.", combinationAuthor.getNomenclaturalTitle()); |
|
1945 |
// nomRef = (Reference)name.getNomenclaturalReference(); |
|
1946 |
// assertEquals(ReferenceType.Book, nomRef.getType()); |
|
1947 |
// assertEquals("2", nomRef.getVolume()); |
|
1948 |
// assertEquals("10", nomRef.getEdition()); |
|
1949 |
// assertEquals("930", name.getNomenclaturalMicroReference()); |
|
1950 |
// assertEquals("1759", nomRef.getYear()); |
|
1951 |
|
|
1819 | 1952 |
} |
1820 | 1953 |
|
1821 | 1954 |
} |
Also available in: Unified diff
#5909 further improve referenced name parser