Project

General

Profile

Download (19.2 KB) Statistics
| Branch: | Tag: | Revision:
1
/**
2
* Copyright (C) 2009 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9

    
10
package eu.etaxonomy.cdm.strategy.parser;
11

    
12
import java.util.regex.Pattern;
13

    
14
import org.apache.log4j.Logger;
15

    
16
import eu.etaxonomy.cdm.common.UTF8;
17

    
18

    
19
/**
20
 * This class is a base class that separates regex parts of the parser from methods
21
 * @author a.mueller
22
 *
23
 */
24
public abstract class NonViralNameParserImplRegExBase  {
25
	@SuppressWarnings("unused")
26
	private static final Logger logger = Logger.getLogger(NonViralNameParserImplRegExBase.class);
27

    
28
	// good intro: http://java.sun.com/docs/books/tutorial/essential/regex/index.html
29

    
30
    //splitter
31
    protected static String epiSplitter = "(\\s+|\\(|\\))"; //( ' '+| '(' | ')' )
32
    protected static Pattern pattern = Pattern.compile(epiSplitter);
33

    
34
	public static final String hybridSign = UTF8.HYBRID.toString();  //  "\u00D7";
35

    
36
    //some useful non-terminals
37
    protected static String pStart = "^";
38
    protected static String end = "$";
39
    protected static String anyEnd = ".*" + end;
40
    protected static String oWs = "\\s+"; //obligatory whitespaces
41
    protected static String fWs = "\\s*"; //facultative whitespcace
42

    
43
    public static String capitalWord = "\\p{javaUpperCase}\\p{javaLowerCase}*";
44
    protected static String capital2LetterWord = "\\p{javaUpperCase}\\p{javaLowerCase}+";
45
    protected static String nonCapitalWord = "\\p{javaLowerCase}+";
46
    protected static String word = "(" + capitalWord + "|" + nonCapitalWord + ")"; //word (capital or non-capital) with no '.' at the end
47
    protected static String uppercaseWord = "\\p{javaUpperCase}{2,}";
48

    
49
    protected static String capitalDotWord = capitalWord + "\\.?"; //capitalWord with facultativ '.' at the end
50
    protected static String capital2charDotWord = "(" + capital2LetterWord + "\\.?|\\p{javaUpperCase}\\.)"; //capitalWord with facultativ '.' but minimum 2 characters (single capital word like 'L' is not allowed
51
    protected static String nonCapitalDotWord = nonCapitalWord + "\\.?"; //nonCapitalWord with facultativ '.' at the end
52
    protected static String dotWord = "(" + capitalWord + "|" + nonCapitalWord + ")\\.?"; //word (capital or non-capital) with facultativ '.' at the end
53
    protected static String obligateDotWord = "(" + capitalWord + "|" + nonCapitalWord + ")\\.+"; //word (capital or non-capital) with obligate '.' at the end
54

    
55
    //Words used in an epethiton for a TaxonName
56
    protected static String nonCapitalEpiWord = "[a-z\u00EF\u00EB\u00F6\\-]+";   //a-z + diaeresis for ieo
57
    protected static String capitalEpiWord = "[A-Z]"+ nonCapitalEpiWord;
58

    
59

    
60
   //years
61
    protected static String month = "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)";
62
    protected static String singleYear = "\\b" + "(?:17|18|19|20)" + "\\d{2}" + "\\b";                      // word boundary followed by either 17,18,19, or 20 (not captured) followed by 2 digits
63
    protected static String yearPhrase = singleYear + "("+ fWs + "-" + fWs + singleYear + ")?" ;
64
    								//+ "(" + month + ")?)" ;                 // optional month
65

    
66
    protected static String yearSeperator = "\\." + oWs;
67
    protected static String detailSeparator = ":" + oWs;
68
    protected static String referenceSeparator1 = "," + oWs ;
69
    protected static String inReferenceSeparator = oWs + "in" + oWs;
70
    protected static String referenceSeperator = "(" + referenceSeparator1 +"|" + inReferenceSeparator + ")" ;
71
    protected static String referenceAuthorSeparator = ","+ oWs;
72
    protected static String volumeSeparator = oWs ;
73
    protected static String referenceEnd = "\\.";
74

    
75

    
76
    //status
77
    protected static String status = "";
78

    
79
    //marker
80
    protected static String InfraGenusMarker = "(n|notho)?(subgen\\.|subg\\.|sect\\.|subsect\\.|ser\\.|subser\\.|t\\.infgen\\.|\\[unranked\\])";
81
    protected static String aggrOrGroupMarker = "(aggr\\.|agg\\.|group)";
82
    protected static String infraSpeciesMarker = "(n|notho)?(subsp\\.|convar\\.|var\\.|subvar\\.|f\\.|subf\\.|f\\.\\ssp\\.|f\\.spec\\.|f\\.sp\\.|\\[unranked\\]|tax\\." + fWs + "infrasp\\.)";
83
    protected static String oldInfraSpeciesMarker = "(prol\\.|proles|race|taxon|sublusus)";
84

    
85

    
86
    //AuthorString
87
    protected static String qm = "[" + UTF8.RIGHT_SINGLE_QUOT + "']";
88
    protected static String authorPart = "(" + "([OdDL]"+qm+"|[’']t|ten\\s||le\\s|zur\\s)?" + "(" + capital2charDotWord + "|DC.)" + "('" + nonCapitalDotWord + ")?" + "|[vV][ao]n(\\sder)?|da|du|de(n|l|\\sla)?)" ;
89
    protected static String author = "(" + authorPart + "(" + fWs + "|-)" + ")+" + "(f\\.|fil\\.|secundus)?" ;
90
    protected static String finalTeamSplitter = "(" + fWs + "(&)" + fWs + "|" + oWs + "et" + oWs + ")";
91
    protected static String notFinalTeamSplitter = "(?:" + fWs + "," + fWs + "|" + finalTeamSplitter + ")";
92
    protected static String authorTeam = fWs + "((?>" + author + notFinalTeamSplitter + ")*" + author + finalTeamSplitter + ")?(?:"  + author + "|al\\.)" +  fWs;
93
    protected static String exString = "(ex\\.?)";
94
    protected static String authorAndExTeam = "(" + authorTeam + oWs + exString + oWs + ")?" + authorTeam;
95
    protected static String basStart = "\\(";
96
    protected static String basEnd = "\\)";
97
    protected static String botanicBasionymAuthor = basStart + "(" + authorAndExTeam + ")" + basEnd;  // '(' and ')' is for evaluation with RE.paren(x)
98
    protected static String fullBotanicAuthorString = fWs + "((" + botanicBasionymAuthor +")?" + fWs + authorAndExTeam + "|" + botanicBasionymAuthor +")"+ fWs;
99
    protected static String facultFullBotanicAuthorString = "(" +  fullBotanicAuthorString + ")?" ;
100

    
101
    //Zoo. Author
102
    //TODO does zoo author have ex-Author?
103
    protected static String zooAuthorYearSeperator = "(,|\\s)";
104
    protected static String zooAuthorAddidtion = fWs + zooAuthorYearSeperator + fWs + singleYear;
105
    protected static String zooAuthorTeam = authorTeam + zooAuthorAddidtion;
106
    protected static String zooBasionymAuthor = basStart + "(" + zooAuthorTeam + ")" + basEnd;
107
    protected static String fullZooAuthorString = fWs + "((" + zooBasionymAuthor +")?" + fWs + zooAuthorTeam + "|" + zooBasionymAuthor +")"+ fWs;
108
    protected static String facultFullZooAuthorString = "(" +  fullZooAuthorString + ")?" ;
109

    
110
    protected static String facultFullAuthorString2 = "(" + facultFullBotanicAuthorString + "|" + facultFullZooAuthorString + ")";
111

    
112
    protected static String basionymAuthor = "(" + botanicBasionymAuthor + "|" + zooBasionymAuthor+ ")";
113
    protected static String fullAuthorString = "(" + fullBotanicAuthorString + "|" + fullZooAuthorString+ ")";
114

    
115
    //details
116
    //TODO still very simple
117

    
118

    
119
    protected static String nr2 = "\\d{1,2}";
120
    protected static String nr4 = "\\d{1,4}";
121
    protected static String nr5 = "\\d{1,5}";
122

    
123

    
124
    protected static String pPage = nr5 + "[a-z]?";
125
    protected static String pStrNo = "n\u00B0" + fWs + "(" + nr4 + ")";
126

    
127
    protected static String pBracketNr = "\\[" + nr4 + "\\]";
128
    protected static String pFolBracket = "\\[fol\\." + fWs + "\\d{1,2}(-\\d{1,2})?\\]";
129

    
130
    protected static String pStrTab = "tab\\." + fWs + nr4 + "(" + fWs + "(B|\u00DF|\\(\\d{1,3}\\)))?";
131
    protected static String pFig = "fig\\." + fWs + nr4 + "[a-z]?";
132
    protected static String pFigs = pFig + "(-" + nr4 + ")?";
133
    //static String pTabFig = pStrTab + "(," + fWs + pFigs + ")?";
134
    protected static String pTabFig = "(" + pStrTab + "|" + pFigs + ")";
135

    
136
    //e.g.: p455; p.455; pp455-456; pp.455-456; pp.455,456; 455, 456; pages 456-457; pages 456,567
137
    protected static String pSinglePages = "(p\\.?)?" + fWs + pPage + "(," + pTabFig +")?";
138
    protected static String pMultiPages = "(pp\\.?|pages)?" + fWs + pPage + fWs + "(-|,)" +fWs + pPage ;
139
    //static String pPages = pPage + "(," + fWs + "(" + pPage + "|" + pTabFig + ")" + ")?";
140
    protected static String pPages = "(" + pSinglePages +"|" + pMultiPages +")";
141

    
142

    
143
    protected static String pCouv = "couv\\." + fWs + "\\d{1,3}";
144

    
145
    protected static String pTabSpecial = "tab\\." + fWs + "(ad" + fWs + "\\d{1,3}|alphab)";
146
    protected static String pPageSpecial = nr4 + fWs + "(in obs|, Expl\\. Tab)";
147
    protected static String pSpecialGardDict = capitalWord + oWs + "n\u00B0" + oWs + "\\d{1,2}";
148
    //TODO
149
    // protected static String pSpecialDetail = "(in err|in tab|sine pag|add\\. & emend|Emend|""\\d{3}"" \\[\\d{3}\\])";
150
 // protected static String pSpecialDetail = "(in err|in tab|sine pag|add\\. & emend|Emend|""\\d{3}"" \\[\\d{3}\\])";
151
    protected static String pSpecialDetail = "(in err|in tab|sine pag|add\\.)";
152

    
153

    
154
//    Const romI = "[Ii]{0,3}"
155
//    	Const romX = "[Xx]{0,3}"
156
//    	Const romC = "[Cc]{0,3}"
157
//    	Const romM = "[Mm]{0,3}"
158
//    ' roman numbers
159
//    ' !! includes empty string: ""
160
//    romOne = "([Vv]?" & romI & or_ & "(IV|iv)" & or_ & "(IX|ix)" & ")"
161
//    romTen = "([Ll]?" & romX & or_ & "(XL|xl)" & or_ & "(XC|xc)" & ")"
162
//    romHun = "([Dd]?" & romC & or_ & "(CD|cd)" & or_ & "(CM|cm)" & ")"
163
//    romNr = "(?=[MDCLXVImdclxvi])(((" & romM & ")?" & romHun & ")?" & romTen & ")?" & romOne
164
    protected static String pRomNr = "ljfweffaflas"; //TODO rom number have to be tested first
165

    
166
    protected static String pDetailAlternatives = "(" + pPages + "|" + pPageSpecial + "|" + pStrNo + "|" + pBracketNr +
167
    			"|" + pTabFig + "|" + pTabSpecial + "|" + pFolBracket + "|" + pCouv + "|" + pRomNr + "|" +
168
    			pSpecialGardDict + "|" + pSpecialDetail + ")";
169

    
170
    protected static String detail = pDetailAlternatives;
171

    
172
    //reference
173
    protected static String volume = nr4 + "[a-z]?" + "(\\("+ nr4  + "(-"+nr4+")?\\))?";
174
    //this line caused problem https://dev.e-taxonomy.eu/trac/ticket/1556 in its original form: "([\u005E:\\.]" + fWs + ")";
175
    protected static String anySepChar = "([\u005E:a-zA-Z]" + fWs + ")"; //all characters except for the detail separator, a stricter version would be [,\\-\\&] and some other characters
176
//  protected static String anySepChar = "([,\\-\\&\\.\\+\\']" + fWs + ")";
177

    
178
    protected static int authorSeparatorMaxPosition = 4;  //Author may have a maximum of 4 words
179
    protected static String pTitleWordSeparator = "(\\."+ fWs+"|" + oWs + ")";
180
    protected static String pSeriesPart = ",?" + fWs + "[sS]er(\\.)?" + oWs + "\\d{1,2},?";
181
    protected static String referenceTitleFirstPart = "(" + word + pTitleWordSeparator + ")";
182
    protected static String referenceTitle = referenceTitleFirstPart + "*" + "("+ dotWord + "|" + uppercaseWord + "|" + pSeriesPart + ")";  //reference title may have words seperated by whitespace or dot. The last word may not have a whitespace at the end. There must be at least one word
183
    protected static String referenceTitleWithSepCharacters = "(((" + referenceTitle +"|\\(.+\\))"  + anySepChar + ")*" + referenceTitle + ")"; //,?
184
    //TODO test performance ??
185
    protected static String referenceTitleWithSepCharactersAndBrackets = referenceTitleWithSepCharacters + fWs + "(\\(" + referenceTitleWithSepCharacters + "\\)"+fWs+ ")?(" + referenceTitleWithSepCharacters +")?"  ;
186

    
187
    protected static String referenceTitleWithoutAuthor = "(" + referenceTitleFirstPart + ")" + "{"+ (authorSeparatorMaxPosition -1) +",}" + dotWord +
188
    			anySepChar + referenceTitleWithSepCharactersAndBrackets ;   //separators exist and first separator appears at position authorSeparatorMaxPosition or later
189

    
190
    protected static String editionSeparator = "(" + oWs + "|," + fWs + ")ed\\.?" + oWs;  //
191
    protected static String pEdition = nr2;
192

    
193
    protected static String pVolPart = volumeSeparator +  volume;
194
    protected static String pEditionPart = editionSeparator +  pEdition;
195
    protected static String pEditionVolPart = editionSeparator +  pEdition + fWs + "," + volumeSeparator +  volume;
196
    protected static String pEditionVolAlternative = "(" + pEditionPart + "|" + pVolPart + "|" + pEditionVolPart + ")?";
197

    
198
    protected static String pVolRefTitle = referenceTitle + "(" + pVolPart + ")?";
199
    protected static String softEditionVolRefTitle = referenceTitleWithSepCharactersAndBrackets + pEditionVolAlternative;
200
    protected static String softVolNoAuthorRefTitle = referenceTitleWithoutAuthor + "(" + volumeSeparator +  volume + ")?";
201

    
202
    protected static String pBookReference = softEditionVolRefTitle;
203
    protected static String pBookSectionReference = authorTeam + referenceAuthorSeparator + softEditionVolRefTitle;
204
    protected static String pArticleReference = pVolRefTitle;
205
    protected static String pSoftArticleReference = softVolNoAuthorRefTitle;
206

    
207
    protected static String pReferenceSineDetail = "(" + pArticleReference + "|" + pBookSectionReference + "|" + pBookReference + ")";
208

    
209
    protected static String pReference = pReferenceSineDetail + detailSeparator + detail +
210
					yearSeperator + yearPhrase + "(" + referenceEnd + ")?";
211

    
212
    //static String strictBook = referenc
213

    
214
    protected static Pattern referencePattern = Pattern.compile(pReference);
215
    protected static Pattern referenceSineDetailPattern = Pattern.compile(pReferenceSineDetail);
216

    
217
    protected static String pNomStatusNom =
218
            "nom\\." + fWs + "(ambig\\.|dub\\.|confus\\.|superfl\\.|nud\\.|illeg\\.|inval\\.|cons\\.(\\s*(prop|des)\\.)?|altern(ativ)?\\.|subnud\\.|nov\\.|legit\\.|sanct\\.|valid|"+
219
    			"rej\\.("+ fWs + "prop\\.)?|provis\\.|utique"+fWs+"rej\\.("+fWs+"prop\\.)?|orth\\."+fWs+"cons\\.("+fWs+"prop\\.)?)";
220
    protected static String pNomStatusOrthVar = "orth\\." + fWs + "(var\\.|rej\\.)";
221
    protected static String pNomStatusComb = "comb\\." + fWs + "(inval\\.|illeg\\.|nov\\.)";
222
    protected static String pNomStatusOpus = "opus\\." + fWs + "utique" + fWs + "oppr\\.";
223

    
224
    protected static String pNomStatus = "(" + pNomStatusNom + "|" + pNomStatusOrthVar + "|" +pNomStatusComb + "|" + pNomStatusOpus + ")";
225
    protected static String pNomStatusPhrase1 = "," + fWs + pNomStatus;
226
    protected static String pNomStatusPhrase2 = "\\[" + fWs + pNomStatus + "\\]";
227

    
228
    protected static String pNomStatusPhrase = "(?:" + pNomStatusPhrase1 + "|" + pNomStatusPhrase2 + ")";
229

    
230
// Soraya
231
//opus utique oppr.
232
//pro syn.
233
//provisional synonym
234
//fossil name
235

    
236

    
237
    //cultivars and hybrids
238
    protected static String cultivar = oWs + "'..+'"; //Achtung mit Hochkomma in AuthorNamen
239
    protected static String cultivarMarker = oWs + "(cv\\.|')";
240
    protected static String notho = "notho";
241
    protected static String hybridPart = "([xX]" + oWs + "|"+hybridSign+"|"+notho+")";
242
    protected static String noNothoHybridPart = "([xX]" + oWs + "|"+hybridSign+")";
243
    protected static String hybridFull = "(" +oWs +"|"+ pStart +")" + noNothoHybridPart;  //for some reason infraspecific notho ranks do not parse if notho is allowed as uninomial prefix.
244
    protected static String hybridFormularSeparator = oWs + "[" + hybridSign + "xX]" + oWs;
245

    
246

    
247
    //  Name String
248
    protected static String genusOrSupraGenus = "("+hybridFull+")?" + capitalEpiWord;
249
    protected static String infraGenus = capitalEpiWord + oWs + InfraGenusMarker + oWs + capitalEpiWord;
250
    protected static String aggrOrGroup = capitalEpiWord + oWs + nonCapitalEpiWord + oWs + aggrOrGroupMarker;
251
    protected static String species = genusOrSupraGenus + oWs + "("+hybridPart+")?" + nonCapitalEpiWord;
252
    protected static String infraSpecies = species + oWs + infraSpeciesMarker + oWs + "("+hybridPart+")?" + nonCapitalEpiWord;
253
    protected static String zooInfraSpecies = species + oWs + "(" + infraSpeciesMarker + oWs +")?" + "("+hybridPart+")?" + nonCapitalEpiWord;
254
    protected static String oldInfraSpecies = capitalEpiWord + oWs +  nonCapitalEpiWord + oWs + oldInfraSpeciesMarker + oWs + nonCapitalEpiWord;
255
    protected static String autonym = capitalEpiWord + oWs + "(" + nonCapitalEpiWord +")" + oWs + fullBotanicAuthorString +  oWs + infraSpeciesMarker + oWs + "\\1";  //2-nd word and last word are the same
256
    //autonym pattern used within anyBotanicalFullName pattern
257
    protected static String autonym2 = capitalEpiWord + oWs + "(" + nonCapitalEpiWord +")" + oWs + fullBotanicAuthorString +  oWs + infraSpeciesMarker + oWs + "\\2";  //2-nd word and last word are the same
258

    
259

    
260
    protected static String anyBotanicName = "(" + genusOrSupraGenus + "|" + infraGenus + "|" + aggrOrGroup + "|" + species + "|" +
261
					infraSpecies + "|" + oldInfraSpecies + "|" + autonym   + ")+";
262
    protected static String anyZooName = "(" + genusOrSupraGenus + "|" + infraGenus + "|" + aggrOrGroup + "|" + species + "|" +
263
    				zooInfraSpecies + "|" +  oldInfraSpecies + ")+";
264
    protected static String anyBotanicFullName = "(" + autonym2 + "|" + anyBotanicName + oWs + fullBotanicAuthorString + ")"  ;
265
    protected static String anyZooFullName = anyZooName + oWs + fullZooAuthorString ;
266
    protected static String anyFullName = "(" + anyBotanicFullName + "|" + anyZooFullName + ")";
267
    protected static String hybridFullName = "(" + anyFullName  + "|" +  anyBotanicName + "|" + anyZooName + ")" + hybridFormularSeparator + "(" + anyFullName  + "|" +  anyBotanicName + "|" + anyZooName + ")";
268

    
269
    //Pattern
270
    protected static Pattern oWsPattern = Pattern.compile(oWs);
271
    protected static Pattern finalTeamSplitterPattern = Pattern.compile(finalTeamSplitter);
272
    protected static Pattern cultivarPattern = Pattern.compile(cultivar);
273
    protected static Pattern cultivarMarkerPattern = Pattern.compile(cultivarMarker);
274

    
275
    protected static Pattern genusOrSupraGenusPattern = Pattern.compile(pStart + genusOrSupraGenus + facultFullAuthorString2 + end);
276
    protected static Pattern infraGenusPattern = Pattern.compile(pStart + infraGenus + facultFullAuthorString2 + end);
277
    protected static Pattern aggrOrGroupPattern = Pattern.compile(pStart + aggrOrGroup + fWs + end); //aggr. or group has no author string
278
    protected static Pattern speciesPattern = Pattern.compile(pStart + species + facultFullAuthorString2 + end);
279
    protected static Pattern infraSpeciesPattern = Pattern.compile(pStart + infraSpecies + facultFullAuthorString2 + end);
280
    protected static Pattern zooInfraSpeciesPattern = Pattern.compile(pStart + zooInfraSpecies + facultFullAuthorString2 + end);
281
    protected static Pattern oldInfraSpeciesPattern = Pattern.compile(pStart + oldInfraSpecies + facultFullAuthorString2 + end);
282
    protected static Pattern autonymPattern = Pattern.compile(pStart + autonym + fWs + end);
283
    protected static Pattern hybridFormulaPattern = Pattern.compile(pStart + hybridFullName + fWs + end);
284

    
285

    
286
    protected static Pattern botanicBasionymPattern = Pattern.compile(botanicBasionymAuthor);
287
    protected static Pattern zooBasionymPattern = Pattern.compile(zooBasionymAuthor);
288
    protected static Pattern basionymPattern = Pattern.compile(basionymAuthor);
289

    
290
    protected static Pattern zooAuthorPattern = Pattern.compile(zooAuthorTeam);
291
    protected static Pattern zooAuthorAddidtionPattern = Pattern.compile(zooAuthorAddidtion);
292

    
293
    protected static Pattern exAuthorPattern = Pattern.compile(oWs + exString);
294

    
295
    protected static Pattern fullBotanicAuthorStringPattern = Pattern.compile(fullBotanicAuthorString);
296
    protected static Pattern fullZooAuthorStringPattern = Pattern.compile(fullZooAuthorString);
297
    protected static Pattern fullAuthorStringPattern = Pattern.compile(fullAuthorString);
298

    
299
    protected static Pattern anyBotanicFullNamePattern = Pattern.compile(anyBotanicFullName);
300
    protected static Pattern anyZooFullNamePattern = Pattern.compile(anyZooFullName);
301

    
302

    
303
}
(4-4/8)