Project

General

Profile

« Previous | Next » 

Revision 4f5219a9

Added by Andreas Müller almost 8 years ago

#5909 Improve referenced name parser

View differences:

cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/parser/TimePeriodParser.java
1
/**
2
* Copyright (C) 2007 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.strategy.parser;
10

  
11
import java.text.DateFormat;
12
import java.text.ParsePosition;
13
import java.util.Calendar;
14
import java.util.Date;
15
import java.util.regex.Matcher;
16
import java.util.regex.Pattern;
17

  
18
import org.apache.commons.lang.StringUtils;
19
import org.apache.log4j.Logger;
20
import org.joda.time.DateTime;
21
import org.joda.time.DateTimeFieldType;
22
import org.joda.time.Partial;
23

  
24
import eu.etaxonomy.cdm.common.CdmUtils;
25
import eu.etaxonomy.cdm.model.common.TimePeriod;
26

  
27
/**
28
 * Class for parsing date strings of various formats into a {@link TimePeriod}.
29
 * @author a.mueller
30
 * @created 14-Jul-2013
31
 */
32
public class TimePeriodParser {
33
	private static final Logger logger = Logger.getLogger(TimePeriodParser.class);
34
	
35
	//patter for first year in string;
36
	private static final Pattern firstYearPattern =  Pattern.compile("\\d{4}");
37
	//case "1806"[1807];
38
	private static final Pattern uncorrectYearPatter =  Pattern.compile("\"\\d{4}\"\\s*\\[\\d{4}\\]");
39
	//case fl. 1806 or c. 1806 or fl. 1806?
40
	private static final Pattern prefixedYearPattern =  Pattern.compile("(fl|c)\\.\\s*\\d{4}(\\s*-\\s*\\d{4})?\\??");
41
	//standard
42
	private static final Pattern standardPattern =  Pattern.compile("\\s*\\d{2,4}(\\s*-(\\s*\\d{2,4})?)?");
43
	private static final String strDotDate = "[0-3]?\\d\\.[01]?\\d\\.\\d{4,4}";
44
	private static final String strDotDatePeriodPattern = String.format("%s(\\s*-\\s*%s?)?", strDotDate, strDotDate);
45
	private static final Pattern dotDatePattern =  Pattern.compile(strDotDatePeriodPattern);
46
	private static final Pattern lifeSpanPattern =  Pattern.compile(String.format("%s--%s", firstYearPattern, firstYearPattern));
47

  
48

  
49
	public static TimePeriod parseString(TimePeriod timePeriod, String periodString){
50
		//TODO move to parser class
51
		//TODO until now only quick and dirty (and partly wrong)
52
		TimePeriod result = timePeriod;
53

  
54
		if(timePeriod == null){
55
			return timePeriod;
56
		}
57

  
58
		if (periodString == null){
59
			return result;
60
		}
61
		periodString = periodString.trim();
62

  
63
		result.setFreeText(null);
64
		Date date;
65

  
66
		//case "1806"[1807];
67
		if (uncorrectYearPatter.matcher(periodString).matches()){
68
			result.setFreeText(periodString);
69
			String realYear = periodString.split("\\[")[1];
70
			realYear = realYear.replace("]", "");
71
			result.setStartYear(Integer.valueOf(realYear));
72
			result.setFreeText(periodString);
73
		//case fl. 1806 or c. 1806 or fl. 1806?
74
		}else if(prefixedYearPattern.matcher(periodString).matches()){
75
			result.setFreeText(periodString);
76
			Matcher yearMatcher = firstYearPattern.matcher(periodString);
77
			yearMatcher.find();
78
			String startYear = yearMatcher.group();
79
			result.setStartYear(Integer.valueOf(startYear));
80
			if (yearMatcher.find()){
81
				String endYear = yearMatcher.group();
82
				result.setEndYear(Integer.valueOf(endYear));
83
			}
84
		}else if (dotDatePattern.matcher(periodString).matches()){
85
			parseDotDatePattern(periodString, result);
86
		}else if (lifeSpanPattern.matcher(periodString).matches()){
87
			parseLifeSpanPattern(periodString, result);
88
		}else if (standardPattern.matcher(periodString).matches()){
89
			parseStandardPattern(periodString, result);
90
//TODO first check ambiguity of parser results e.g. for 7/12/11
91
//			}else if (isDateString(periodString)){
92
//				String[] startEnd = makeStartEnd(periodString);
93
//				String start = startEnd[0];
94
//				DateTime startDateTime = dateStringParse(start, true);
95
//				result.setStart(startDateTime);
96
//				if (startEnd.length > 1){
97
//					DateTime endDateTime = dateStringParse(startEnd[1], true);
98
//					;
99
//					result.setEnd(endDateTime.toLocalDate());
100
//				}
101

  
102
		}else{
103
			result.setFreeText(periodString);
104
		}
105
		return result;
106
	}
107

  
108
	private static boolean isDateString(String periodString) {
109
		String[] startEnd = makeStartEnd(periodString);
110
		String start = startEnd[0];
111
		DateTime startDateTime = dateStringParse(start, true);
112
		if (startDateTime == null){
113
			return false;
114
		}
115
		if (startEnd.length > 1){
116
			DateTime endDateTime = dateStringParse(startEnd[1], true);
117
			if (endDateTime != null){
118
				return true;
119
			}
120
		}
121
		return false;
122
	}
123

  
124

  
125
	/**
126
	 * @param periodString
127
	 * @return
128
	 */
129
	private static String[] makeStartEnd(String periodString) {
130
		String[] startEnd = new String[]{periodString};
131
		if (periodString.contains("-") && periodString.matches("^-{2,}-^-{2,}")){
132
			startEnd = periodString.split("-");
133
		}
134
		return startEnd;
135
	}
136

  
137

  
138
	private static DateTime dateStringParse(String string, boolean strict) {
139
		DateFormat dateFormat = DateFormat.getDateInstance();
140
		ParsePosition pos = new ParsePosition(0);
141
		Date a = dateFormat.parse(string, pos);
142
		if (a == null || pos.getIndex() != string.length()){
143
			return null;
144
		}
145
		Calendar cal = Calendar.getInstance();
146
		cal.setTime(a);
147
		DateTime result = new DateTime(cal);
148
		return result;
149
	}
150

  
151

  
152
	/**
153
	 * @param periodString
154
	 * @param result
155
	 */
156
	private static void parseDotDatePattern(String periodString,TimePeriod result) {
157
		String[] dates = periodString.split("-");
158
		Partial dtStart = null;
159
		Partial dtEnd = null;
160

  
161
		if (dates.length > 2 || dates.length <= 0){
162
			logger.warn("More than 1 '-' in period String: " + periodString);
163
			result.setFreeText(periodString);
164
		}else {
165
			try {
166
				//start
167
				if (! StringUtils.isBlank(dates[0])){
168
					dtStart = parseSingleDotDate(dates[0].trim());
169
				}
170

  
171
				//end
172
				if (dates.length >= 2 && ! StringUtils.isBlank(dates[1])){
173
					dtEnd = parseSingleDotDate(dates[1].trim());
174
				}
175

  
176
				result.setStart(dtStart);
177
				result.setEnd(dtEnd);
178
			} catch (IllegalArgumentException e) {
179
				//logger.warn(e.getMessage());
180
				result.setFreeText(periodString);
181
			}
182
		}
183
	}
184
	
185
	private static void parseLifeSpanPattern(String periodString, TimePeriod result) {
186
		
187
		try{
188
			String[] years = periodString.split("--");
189
			String start = years[0];
190
			String end = years[1];
191
			
192
			result.setStartYear(Integer.valueOf(start));
193
			result.setEndYear(Integer.valueOf(end));
194
		} catch (Exception e) {
195
			//logger.warn(e.getMessage());
196
			result.setFreeText(periodString);
197
		}
198
	}
199

  
200

  
201
	/**
202
	 * @param periodString
203
	 * @param result
204
	 */
205
	private static void parseStandardPattern(String periodString,
206
			TimePeriod result) {
207
		String[] years = periodString.split("-");
208
		Partial dtStart = null;
209
		Partial dtEnd = null;
210

  
211
		if (years.length > 2 || years.length <= 0){
212
			logger.warn("More than 1 '-' in period String: " + periodString);
213
		}else {
214
			try {
215
				//start
216
				if (! CdmUtils.isEmpty(years[0])){
217
					dtStart = parseSingleDate(years[0].trim());
218
				}
219

  
220
				//end
221
				if (years.length >= 2 && ! CdmUtils.isEmpty(years[1])){
222
					years[1] = years[1].trim();
223
					if (years[1].length()==2 && dtStart != null && dtStart.isSupported(DateTimeFieldType.year())){
224
						years[1] = String.valueOf(dtStart.get(DateTimeFieldType.year())/100) + years[1];
225
					}
226
					dtEnd = parseSingleDate(years[1]);
227
				}
228

  
229
				result.setStart(dtStart);
230
				result.setEnd(dtEnd);
231
			} catch (IllegalArgumentException e) {
232
				//logger.warn(e.getMessage());
233
				result.setFreeText(periodString);
234
			}
235
		}
236
	}
237

  
238
	public static TimePeriod parseString(String strPeriod) {
239
		TimePeriod timePeriod = TimePeriod.NewInstance();
240
		return parseString(timePeriod, strPeriod);
241
	}
242

  
243

  
244
	protected static Partial parseSingleDate(String singleDateString) throws IllegalArgumentException{
245
		//FIXME until now only quick and dirty and incomplete
246
		Partial partial =  new Partial();
247
		singleDateString = singleDateString.trim();
248
		if (CdmUtils.isNumeric(singleDateString)){
249
			try {
250
				Integer year = Integer.valueOf(singleDateString.trim());
251
				if (year < 1000 && year > 2100){
252
					logger.warn("Not a valid year: " + year + ". Year must be between 1000 and 2100");
253
				}else if (year < 1700 && year > 2100){
254
					logger.warn("Not a valid taxonomic year: " + year + ". Year must be between 1750 and 2100");
255
					partial = partial.with(TimePeriod.YEAR_TYPE, year);
256
				}else{
257
					partial = partial.with(TimePeriod.YEAR_TYPE, year);
258
				}
259
			} catch (NumberFormatException e) {
260
				logger.debug("Not a Integer format in getCalendar()");
261
				throw new IllegalArgumentException(e);
262
			}
263
		}else{
264
			throw new IllegalArgumentException("Until now only years can be parsed as single dates. But date is: " + singleDateString);
265
		}
266
		return partial;
267

  
268
	}
269

  
270
	protected static Partial parseSingleDotDate(String singleDateString) throws IllegalArgumentException{
271
		Partial partial =  new Partial();
272
		singleDateString = singleDateString.trim();
273
		String[] split = singleDateString.split("\\.");
274
		int length = split.length;
275
		if (length > 3){
276
			throw new IllegalArgumentException(String.format("More than 2 dots in date '%s'", singleDateString));
277
		}
278
		String strYear = split[split.length-1];
279
		String strMonth = length >= 2? split[split.length-2]: null;
280
		String strDay = length >= 3? split[split.length-3]: null;
281

  
282

  
283
		try {
284
			Integer year = Integer.valueOf(strYear.trim());
285
			Integer month = Integer.valueOf(strMonth.trim());
286
			Integer day = Integer.valueOf(strDay.trim());
287
			if (year < 1000 && year > 2100){
288
				logger.warn("Not a valid year: " + year + ". Year must be between 1000 and 2100");
289
			}else if (year < 1700 && year > 2100){
290
				logger.warn("Not a valid taxonomic year: " + year + ". Year must be between 1750 and 2100");
291
				partial = partial.with(TimePeriod.YEAR_TYPE, year);
292
			}else{
293
				partial = partial.with(TimePeriod.YEAR_TYPE, year);
294
			}
295
			if (month != null && month != 0){
296
				partial = partial.with(TimePeriod.MONTH_TYPE, month);
297
			}
298
			if (day != null && day != 0){
299
				partial = partial.with(TimePeriod.DAY_TYPE, day);
300
			}
301
		} catch (NumberFormatException e) {
302
			logger.debug("Not a Integer format somewhere in " + singleDateString);
303
			throw new IllegalArgumentException(e);
304
		}
305
		return partial;
306

  
307
	}
308

  
309
}
1
/**
2
* Copyright (C) 2007 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9
package eu.etaxonomy.cdm.strategy.parser;
10

  
11
import java.text.DateFormat;
12
import java.text.ParsePosition;
13
import java.util.Calendar;
14
import java.util.Date;
15
import java.util.regex.Matcher;
16
import java.util.regex.Pattern;
17

  
18
import org.apache.commons.lang.StringUtils;
19
import org.apache.log4j.Logger;
20
import org.joda.time.DateTime;
21
import org.joda.time.DateTimeFieldType;
22
import org.joda.time.Partial;
23

  
24
import eu.etaxonomy.cdm.common.CdmUtils;
25
import eu.etaxonomy.cdm.common.UTF8;
26
import eu.etaxonomy.cdm.model.common.TimePeriod;
27

  
28
/**
29
 * Class for parsing date strings of various formats into a {@link TimePeriod}.
30
 * @author a.mueller
31
 * @created 14-Jul-2013
32
 */
33
public class TimePeriodParser {
34
	private static final Logger logger = Logger.getLogger(TimePeriodParser.class);
35

  
36
	//patter for first year in string;
37
	private static final Pattern firstYearPattern =  Pattern.compile("\\d{4}");
38
	//case "1806"[1807];
39
	private static final Pattern uncorrectYearPatter =  Pattern.compile("[\""+UTF8.ENGLISH_QUOT_START+"]\\d{4}[\""+UTF8.ENGLISH_QUOT_END+"]\\s*\\[\\d{4}\\]");
40
	//case fl. 1806 or c. 1806 or fl. 1806?
41
	private static final Pattern prefixedYearPattern =  Pattern.compile("(fl|c)\\.\\s*\\d{4}(\\s*-\\s*\\d{4})?\\??");
42
	//standard
43
	private static final Pattern standardPattern =  Pattern.compile("\\s*\\d{2,4}(\\s*-(\\s*\\d{2,4})?)?");
44
	private static final String strDotDate = "[0-3]?\\d\\.[01]?\\d\\.\\d{4,4}";
45
	private static final String strDotDatePeriodPattern = String.format("%s(\\s*-\\s*%s?)?", strDotDate, strDotDate);
46
	private static final Pattern dotDatePattern =  Pattern.compile(strDotDatePeriodPattern);
47
	private static final Pattern lifeSpanPattern =  Pattern.compile(String.format("%s--%s", firstYearPattern, firstYearPattern));
48

  
49

  
50
	public static TimePeriod parseString(TimePeriod timePeriod, String periodString){
51
		//TODO move to parser class
52
		//TODO until now only quick and dirty (and partly wrong)
53
		TimePeriod result = timePeriod;
54

  
55
		if(timePeriod == null){
56
			return timePeriod;
57
		}
58

  
59
		if (periodString == null){
60
			return result;
61
		}
62
		periodString = periodString.trim();
63

  
64
		result.setFreeText(null);
65
		Date date;
66

  
67
		//case "1806"[1807];
68
		if (uncorrectYearPatter.matcher(periodString).matches()){
69
			result.setFreeText(periodString);
70
			String realYear = periodString.split("\\[")[1];
71
			realYear = realYear.replace("]", "");
72
			result.setStartYear(Integer.valueOf(realYear));
73
			result.setFreeText(periodString);
74
		//case fl. 1806 or c. 1806 or fl. 1806?
75
		}else if(prefixedYearPattern.matcher(periodString).matches()){
76
			result.setFreeText(periodString);
77
			Matcher yearMatcher = firstYearPattern.matcher(periodString);
78
			yearMatcher.find();
79
			String startYear = yearMatcher.group();
80
			result.setStartYear(Integer.valueOf(startYear));
81
			if (yearMatcher.find()){
82
				String endYear = yearMatcher.group();
83
				result.setEndYear(Integer.valueOf(endYear));
84
			}
85
		}else if (dotDatePattern.matcher(periodString).matches()){
86
			parseDotDatePattern(periodString, result);
87
		}else if (lifeSpanPattern.matcher(periodString).matches()){
88
			parseLifeSpanPattern(periodString, result);
89
		}else if (standardPattern.matcher(periodString).matches()){
90
			parseStandardPattern(periodString, result);
91
//TODO first check ambiguity of parser results e.g. for 7/12/11
92
//			}else if (isDateString(periodString)){
93
//				String[] startEnd = makeStartEnd(periodString);
94
//				String start = startEnd[0];
95
//				DateTime startDateTime = dateStringParse(start, true);
96
//				result.setStart(startDateTime);
97
//				if (startEnd.length > 1){
98
//					DateTime endDateTime = dateStringParse(startEnd[1], true);
99
//					;
100
//					result.setEnd(endDateTime.toLocalDate());
101
//				}
102

  
103
		}else{
104
			result.setFreeText(periodString);
105
		}
106
		return result;
107
	}
108

  
109
	private static boolean isDateString(String periodString) {
110
		String[] startEnd = makeStartEnd(periodString);
111
		String start = startEnd[0];
112
		DateTime startDateTime = dateStringParse(start, true);
113
		if (startDateTime == null){
114
			return false;
115
		}
116
		if (startEnd.length > 1){
117
			DateTime endDateTime = dateStringParse(startEnd[1], true);
118
			if (endDateTime != null){
119
				return true;
120
			}
121
		}
122
		return false;
123
	}
124

  
125

  
126
	/**
127
	 * @param periodString
128
	 * @return
129
	 */
130
	private static String[] makeStartEnd(String periodString) {
131
		String[] startEnd = new String[]{periodString};
132
		if (periodString.contains("-") && periodString.matches("^-{2,}-^-{2,}")){
133
			startEnd = periodString.split("-");
134
		}
135
		return startEnd;
136
	}
137

  
138

  
139
	private static DateTime dateStringParse(String string, boolean strict) {
140
		DateFormat dateFormat = DateFormat.getDateInstance();
141
		ParsePosition pos = new ParsePosition(0);
142
		Date a = dateFormat.parse(string, pos);
143
		if (a == null || pos.getIndex() != string.length()){
144
			return null;
145
		}
146
		Calendar cal = Calendar.getInstance();
147
		cal.setTime(a);
148
		DateTime result = new DateTime(cal);
149
		return result;
150
	}
151

  
152

  
153
	/**
154
	 * @param periodString
155
	 * @param result
156
	 */
157
	private static void parseDotDatePattern(String periodString,TimePeriod result) {
158
		String[] dates = periodString.split("-");
159
		Partial dtStart = null;
160
		Partial dtEnd = null;
161

  
162
		if (dates.length > 2 || dates.length <= 0){
163
			logger.warn("More than 1 '-' in period String: " + periodString);
164
			result.setFreeText(periodString);
165
		}else {
166
			try {
167
				//start
168
				if (! StringUtils.isBlank(dates[0])){
169
					dtStart = parseSingleDotDate(dates[0].trim());
170
				}
171

  
172
				//end
173
				if (dates.length >= 2 && ! StringUtils.isBlank(dates[1])){
174
					dtEnd = parseSingleDotDate(dates[1].trim());
175
				}
176

  
177
				result.setStart(dtStart);
178
				result.setEnd(dtEnd);
179
			} catch (IllegalArgumentException e) {
180
				//logger.warn(e.getMessage());
181
				result.setFreeText(periodString);
182
			}
183
		}
184
	}
185

  
186
	private static void parseLifeSpanPattern(String periodString, TimePeriod result) {
187

  
188
		try{
189
			String[] years = periodString.split("--");
190
			String start = years[0];
191
			String end = years[1];
192

  
193
			result.setStartYear(Integer.valueOf(start));
194
			result.setEndYear(Integer.valueOf(end));
195
		} catch (Exception e) {
196
			//logger.warn(e.getMessage());
197
			result.setFreeText(periodString);
198
		}
199
	}
200

  
201

  
202
	/**
203
	 * @param periodString
204
	 * @param result
205
	 */
206
	private static void parseStandardPattern(String periodString,
207
			TimePeriod result) {
208
		String[] years = periodString.split("-");
209
		Partial dtStart = null;
210
		Partial dtEnd = null;
211

  
212
		if (years.length > 2 || years.length <= 0){
213
			logger.warn("More than 1 '-' in period String: " + periodString);
214
		}else {
215
			try {
216
				//start
217
				if (! CdmUtils.isEmpty(years[0])){
218
					dtStart = parseSingleDate(years[0].trim());
219
				}
220

  
221
				//end
222
				if (years.length >= 2 && ! CdmUtils.isEmpty(years[1])){
223
					years[1] = years[1].trim();
224
					if (years[1].length()==2 && dtStart != null && dtStart.isSupported(DateTimeFieldType.year())){
225
						years[1] = String.valueOf(dtStart.get(DateTimeFieldType.year())/100) + years[1];
226
					}
227
					dtEnd = parseSingleDate(years[1]);
228
				}
229

  
230
				result.setStart(dtStart);
231
				result.setEnd(dtEnd);
232
			} catch (IllegalArgumentException e) {
233
				//logger.warn(e.getMessage());
234
				result.setFreeText(periodString);
235
			}
236
		}
237
	}
238

  
239
	public static TimePeriod parseString(String strPeriod) {
240
		TimePeriod timePeriod = TimePeriod.NewInstance();
241
		return parseString(timePeriod, strPeriod);
242
	}
243

  
244

  
245
	protected static Partial parseSingleDate(String singleDateString) throws IllegalArgumentException{
246
		//FIXME until now only quick and dirty and incomplete
247
		Partial partial =  new Partial();
248
		singleDateString = singleDateString.trim();
249
		if (CdmUtils.isNumeric(singleDateString)){
250
			try {
251
				Integer year = Integer.valueOf(singleDateString.trim());
252
				if (year < 1000 && year > 2100){
253
					logger.warn("Not a valid year: " + year + ". Year must be between 1000 and 2100");
254
				}else if (year < 1700 && year > 2100){
255
					logger.warn("Not a valid taxonomic year: " + year + ". Year must be between 1750 and 2100");
256
					partial = partial.with(TimePeriod.YEAR_TYPE, year);
257
				}else{
258
					partial = partial.with(TimePeriod.YEAR_TYPE, year);
259
				}
260
			} catch (NumberFormatException e) {
261
				logger.debug("Not a Integer format in getCalendar()");
262
				throw new IllegalArgumentException(e);
263
			}
264
		}else{
265
			throw new IllegalArgumentException("Until now only years can be parsed as single dates. But date is: " + singleDateString);
266
		}
267
		return partial;
268

  
269
	}
270

  
271
	protected static Partial parseSingleDotDate(String singleDateString) throws IllegalArgumentException{
272
		Partial partial =  new Partial();
273
		singleDateString = singleDateString.trim();
274
		String[] split = singleDateString.split("\\.");
275
		int length = split.length;
276
		if (length > 3){
277
			throw new IllegalArgumentException(String.format("More than 2 dots in date '%s'", singleDateString));
278
		}
279
		String strYear = split[split.length-1];
280
		String strMonth = length >= 2? split[split.length-2]: null;
281
		String strDay = length >= 3? split[split.length-3]: null;
282

  
283

  
284
		try {
285
			Integer year = Integer.valueOf(strYear.trim());
286
			Integer month = Integer.valueOf(strMonth.trim());
287
			Integer day = Integer.valueOf(strDay.trim());
288
			if (year < 1000 && year > 2100){
289
				logger.warn("Not a valid year: " + year + ". Year must be between 1000 and 2100");
290
			}else if (year < 1700 && year > 2100){
291
				logger.warn("Not a valid taxonomic year: " + year + ". Year must be between 1750 and 2100");
292
				partial = partial.with(TimePeriod.YEAR_TYPE, year);
293
			}else{
294
				partial = partial.with(TimePeriod.YEAR_TYPE, year);
295
			}
296
			if (month != null && month != 0){
297
				partial = partial.with(TimePeriod.MONTH_TYPE, month);
298
			}
299
			if (day != null && day != 0){
300
				partial = partial.with(TimePeriod.DAY_TYPE, day);
301
			}
302
		} catch (NumberFormatException e) {
303
			logger.debug("Not a Integer format somewhere in " + singleDateString);
304
			throw new IllegalArgumentException(e);
305
		}
306
		return partial;
307

  
308
	}
309

  
310
}

Also available in: Unified diff