Project

General

Profile

Revision 8643e56e

ID8643e56e5f89fae97302fc5e95d178ffec239e01
Parent cfec8840
Child f98c2d3d

Added by Andreas Müller about 2 years ago

ref #7429 fix order in "publ." parsing and implement for nom. ref. parser

View differences:

cdmlib-commons/src/main/java/eu/etaxonomy/cdm/common/UTF8.java
23 23
	NO_BREAK_SPACE("\u00A0"),
24 24
	POLISH_L("\u0142"),
25 25
	SMALL_A_ACUTE("\u00E1"),
26
	QUOT_SINGLE_RIGHT("\u2019"), // Right single quotation mark
26
    SMALL_O_ACUTE("\u00F3"),
27
    QUOT_SINGLE_RIGHT("\u2019"), // Right single quotation mark
27 28
	QUOT_SINGLE_HIGH_REV9("\u201b"), // Left high single quotation mark
28 29
    QUOT_DBL_LEFT("\u201c"),  //LEFT DOUBLE QUOTATION MARK Left English quotation mark
29 30
	QUOT_DBL_RIGHT("\u201d"),  //RIGHT DOUBLE QUOTATION MARK Right English quotation mark
cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/parser/NonViralNameParserImplRegExBase.java
66 66
    protected static String singleYear = "\\b" + "(?:17|18|19|20)" + "\\d{2}" + "\\b";                      // word boundary followed by either 17,18,19, or 20 (not captured) followed by 2 digits
67 67
    protected static String correctYearPhrase = singleYear + "("+ fWs + "-" + fWs + singleYear + ")?" ;
68 68
    								//+ "(" + month + ")?)" ;                 // optional month
69
    //!! also used by TimePeriodParser
70
    public static String incorrectYearPhrase = "(\"" + correctYearPhrase + "\"|" + correctYearPhrase + "|"
71
            + UTF8.QUOT_DBL_LOW9 + correctYearPhrase + UTF8.QUOT_DBL_HIGH_REV9 + ")"
72
			+ fWs + "\\[" + singleYear + "\\]"  ;
73
    protected static String yearPhrase = "(" + correctYearPhrase + "|" + incorrectYearPhrase + ")";
69

  
70

  
71
    public static String verbStart = TimePeriodParser.verbatimStart;
72
    public static String verbEnd = TimePeriodParser.verbatimEnd;
73

  
74
    public static String verbatimYearPhrase = "(" + verbStart + correctYearPhrase + verbEnd + fWs + "\\[" + singleYear + "\\]" +"|"
75
            + correctYearPhrase + oWs+  "publ\\.?" + fWs + correctYearPhrase + ")" ;
76
    public static String undefinedYearPhrase = correctYearPhrase + fWs + "\\[" + correctYearPhrase + "\\]";
77
    protected static String yearPhrase = "(" + correctYearPhrase + "|" + verbatimYearPhrase + "|" + undefinedYearPhrase + ")";
74 78

  
75 79
    protected static String yearSeperator = "\\." + oWs;
76 80
    protected static String detailSeparator = ":" + oWs;
cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/parser/TimePeriodParser.java
39 39
	//pattern for first year in string;
40 40
	private static final Pattern firstYearPattern =  Pattern.compile("\\d{4}");
41 41
	//case "1806"[1807];
42
	private static final Pattern uncorrectYearPatter = Pattern.compile(NonViralNameParserImplRegExBase.incorrectYearPhrase);
43
//OLD	        Pattern.compile("[\""+UTF8.ENGLISH_QUOT_START+"]\\d{4}[\""+UTF8.ENGLISH_QUOT_END+"]\\s*\\[\\d{4}\\]");
42
//	private static final Pattern uncorrectYearPatter = Pattern.compile(NonViralNameParserImplRegExBase.incorrectYearPhrase);
44 43

  
45 44
	//case fl. 1806 or c. 1806 or fl. 1806?
46 45
	private static final Pattern prefixedYearPattern =  Pattern.compile("(fl|c)\\.\\s*\\d{4}(\\s*-\\s*\\d{4})?\\??");
......
73 72
		result.setFreeText(null);
74 73

  
75 74
		//case "1806"[1807];  => TODO this should be (and is?) handled in parse verbatim, should be removed here
76
		if (uncorrectYearPatter.matcher(periodString).matches()){
77
			result.setFreeText(periodString);
78
			String realYear = periodString.split("\\[")[1];
79
			realYear = realYear.replace("]", "");
80
			result.setStartYear(Integer.valueOf(realYear));
81
			result.setFreeText(periodString);
75
//		if (uncorrectYearPatter.matcher(periodString).matches()){
76
//			result.setFreeText(periodString);
77
//			String realYear = periodString.split("\\[")[1];
78
//			realYear = realYear.replace("]", "");
79
//			result.setStartYear(Integer.valueOf(realYear));
80
//			result.setFreeText(periodString);
81
//	    }else
82 82
		//case fl. 1806 or c. 1806 or fl. 1806?  => TODO questionable if this should really be handled here, fl. probably stands for flowering and is not part of the date but of the date context. What does "c." stand for? Used by Markup import?
83
		}else if(prefixedYearPattern.matcher(periodString).matches()){
83
		if(prefixedYearPattern.matcher(periodString).matches()){
84 84
			result.setFreeText(periodString);
85 85
			Matcher yearMatcher = firstYearPattern.matcher(periodString);
86 86
			yearMatcher.find();
......
444 444
    static Pattern patVerbatim2;
445 445
    static Pattern patVerbatim3;
446 446

  
447
    public static String verbatimStart;
448
    public static String verbatimEnd;
449

  
447 450
    static {
448
        String verbatimStart = "[\"'" + UTF8.QUOT_DBL_LEFT + UTF8.QUOT_SINGLE_HIGH_REV9 + UTF8.QUOT_DBL_LOW9 + "]";
449
        String verbatimEnd = "[\"'" + UTF8.QUOT_DBL_RIGHT + UTF8.QUOT_SINGLE_RIGHT + UTF8.QUOT_DBL_HIGH_REV9 + "]";
451
        verbatimStart = "[\"'" + UTF8.QUOT_DBL_LEFT + UTF8.QUOT_SINGLE_HIGH_REV9 + UTF8.QUOT_DBL_LOW9 + "]";
452
        verbatimEnd = "[\"'" + UTF8.QUOT_DBL_RIGHT + UTF8.QUOT_SINGLE_RIGHT + UTF8.QUOT_DBL_HIGH_REV9 + "]";
450 453
        String fWs = "\\s*"; //facultative whitespace
451 454
        String oWs = "\\s+"; //obligate whitespace
452 455
        String anyDate = "([^\"]+)";
......
494 497

  
495 498
        matcher = patVerbatim3.matcher(strPeriod);
496 499
        if (matcher.matches()){
497
            String verbatimDate = matcher.group(3);
500
            String verbatimDate = matcher.group(1).trim();
498 501
            timePeriod.setVerbatimDate(verbatimDate);
499
            strPeriod = matcher.group(1).trim();
502
            strPeriod = matcher.group(3).trim();
500 503
        }
501 504
        return strPeriod;
502 505
    }
cdmlib-model/src/test/java/eu/etaxonomy/cdm/strategy/parser/NonViralNameParserImplTest.java
1924 1924

  
1925 1925
    }
1926 1926

  
1927

  
1928
    @Test
1929
    public final void testDatePublished(){
1930

  
1931
        INonViralName name = parser.parseReferencedName("Calamintha transsilvanica (J\u00e1v.) So\u00f3 in Acta Bot. Acad. Sci. Hung. 23: 382. 1977 publ. 1978");
1932
        Assert.assertFalse("Name should be parsable", name.isProtectedTitleCache());
1933
        Reference nomRef = name.getNomenclaturalReference();
1934
        assertEquals(ReferenceType.Article, nomRef.getType());
1935
        assertEquals("1978 [\"1977\"]", nomRef.getDatePublished().toString());
1936
    }
1937

  
1938

  
1927 1939
    @Test
1928 1940
    public final void testExistingProblems(){
1929 1941
        //Canabio, issue with space
......
2091 2103
        assertEquals("29(1\u20134)", nomRef.getVolume());
2092 2104
        assertEquals("16, f. 1\u20132, t. 1-8", name.getNomenclaturalMicroReference());
2093 2105
        assertEquals("1983 [1984]", nomRef.getDatePublishedString());
2094
        assertEquals("1984", nomRef.getYear());
2106
//        assertEquals("1984", nomRef.getYear()); //was like this, but is not necessarily correct, see #7429
2095 2107

  
2096 2108
        //incorrect year with \u201e \u201f  (see eu.etaxonomy.cdm.common.UTF8.ENGLISH_QUOT_START)
2097 2109
        name = parser.parseReferencedName("Javorkaea Borhidi & Jarai-Koml."
cdmlib-model/src/test/java/eu/etaxonomy/cdm/strategy/parser/TimePeriodParserTest.java
106 106

  
107 107
		//"1806"[1807]
108 108
		String strCorrectedPeriod = "\"1806\"[1807]";
109
		TimePeriod tpcorrected = TimePeriodParser.parseString(strCorrectedPeriod);
109
		VerbatimTimePeriod tpcorrected = TimePeriodParser.parseStringVerbatim(strCorrectedPeriod);
110 110
		assertNotNull(tpcorrected);
111
		Assert.assertEquals(strCorrectedPeriod, tpcorrected.getFreeText());
111
		Assert.assertEquals(null, tpcorrected.getFreeText());
112
		Assert.assertEquals("1806", tpcorrected.getVerbatimDate());
112 113
		Assert.assertEquals("1807", tpcorrected.getYear());
113 114

  
114 115
	      //„1806‟[1807]
115 116
        String strCorrectedEnPeriod = UTF8.QUOT_DBL_LOW9 + "1806"+UTF8.QUOT_DBL_HIGH_REV9+"[1807]";
116
        TimePeriod tpcorrectedEn = TimePeriodParser.parseString(strCorrectedEnPeriod);
117
        VerbatimTimePeriod tpcorrectedEn = TimePeriodParser.parseStringVerbatim(strCorrectedEnPeriod);
117 118
        assertNotNull(tpcorrectedEn);
118
        Assert.assertEquals(strCorrectedEnPeriod, tpcorrectedEn.getFreeText());
119
        Assert.assertEquals(null, tpcorrectedEn.getFreeText());
120
        Assert.assertEquals("1806", tpcorrectedEn.getVerbatimDate());  // verify the low-9/high-reversed-9 quoted input, not the ASCII-quoted result from the previous case
119 121
        Assert.assertEquals("1807", tpcorrectedEn.getYear());
120 122

  
121 123

  
......
265 267
        strDate = "1947 publ. 1948";
266 268
        tp = TimePeriodParser.parseStringVerbatim(strDate);
267 269
        assertNotNull(tp);
268
        Assert.assertEquals("1947 [\"1948\"]", tp.toString());
269
        Assert.assertEquals("1947", tp.getYear());
270
        Assert.assertEquals(Integer.valueOf(1947), tp.getStartYear());
271
        Assert.assertEquals("1948", tp.getVerbatimDate());
270
        Assert.assertEquals("1948 [\"1947\"]", tp.toString());
271
        Assert.assertEquals("1948", tp.getYear());
272
        Assert.assertEquals(Integer.valueOf(1948), tp.getStartYear());
273
        Assert.assertEquals("1947", tp.getVerbatimDate());
272 274

  
273 275
        strDate = "\"1884-1885\" [1886]";
274 276
        tp = TimePeriodParser.parseStringVerbatim(strDate);

Also available in: Unified diff

Add picture from clipboard (Maximum size: 40 MB)