Project

General

Profile

Download (27.1 KB) Statistics
| Branch: | Tag: | Revision:
1
/**
2
* Copyright (C) 2009 EDIT
3
* European Distributed Institute of Taxonomy
4
* http://www.e-taxonomy.eu
5
*
6
* The contents of this file are subject to the Mozilla Public License Version 1.1
7
* See LICENSE.TXT at the top of this package for the full license terms.
8
*/
9

    
10
package eu.etaxonomy.cdm.strategy.parser;
11

    
12
import java.util.regex.Pattern;
13

    
14
import org.apache.logging.log4j.LogManager;import org.apache.logging.log4j.Logger;
15

    
16
import eu.etaxonomy.cdm.common.UTF8;
17

    
18

    
19
/**
20
 * This class is a base class that separates regex parts of the parser from methods
21
 * @author a.mueller
22
 */
23
public abstract class NonViralNameParserImplRegExBase  {
24
	@SuppressWarnings("unused")
25
	private static final Logger logger = LogManager.getLogger(NonViralNameParserImplRegExBase.class);
26

    
27
	// good regex intro: http://java.sun.com/docs/books/tutorial/essential/regex/index.html
28

    
29
    //splitter
30
    protected static String epiSplitter = "(\\s+|\\(|\\))"; //( ' '+| '(' | ')' )
31
    protected static Pattern pattern = Pattern.compile(epiSplitter);
32

    
33
	public static final String hybridSign = UTF8.HYBRID.toString();  //  "\u00D7";
34

    
35
    //some useful non-terminals
36
	protected static String or = "|";
37
    protected static String pStart = "^";
38
    protected static String end = "$";
39
    protected static String anyEnd = ".*" + end;
40
    protected static String oWs = "\\s+"; //obligatory whitespaces
41
    protected static String fWs = "\\s*"; //facultative (optional) whitespace: zero or more whitespace characters
42
    protected static String dotSpaceOrBoth = "((?<=\\.)|\\s+)+";
43

    
44
    public static String capitalWord = "\\p{javaUpperCase}\\p{javaLowerCase}*";
45
    protected static String capital2LetterWord = "\\p{javaUpperCase}\\p{javaLowerCase}+";
46
    protected static String nonCapitalWord = "\\p{javaLowerCase}+";
47
    protected static String word = "(" + capitalWord + "|" + nonCapitalWord + ")"; //word (capital or non-capital) with no '.' at the end
48
    protected static String uppercaseWord = "\\p{javaUpperCase}{2,}";
49
    protected static String apostropheWord = word + "('\\p{javaLowerCase}*)?"; //word with optional apostrophe in between
50

    
51
    protected static String capitalDotWord = capitalWord + "\\.?"; //capitalWord with facultative '.' at the end
52
    protected static String capital2charDotWord = "(" + capital2LetterWord + "\\.?|\\p{javaUpperCase}\\.)"; //capitalWord with facultative '.' but minimum 2 characters (single capital word like 'L' is not allowed
53
    protected static String twoCapitalDotWord = "\\p{javaUpperCase}{2}\\.";   //e.g. NY.
54

    
55
    protected static String nonCapitalDotWord = nonCapitalWord + "\\.?"; //nonCapitalWord with facultative '.' at the end
56
    protected static String dotWord = "(" + capitalWord + "|" + nonCapitalWord + ")\\.?"; //word (capital or non-capital) with facultative '.' at the end
57
    protected static String obligateDotWord = "(" + capitalWord + "|" + nonCapitalWord + ")\\.+"; //word (capital or non-capital) with obligate '.' at the end
58
    protected static String dashDotWord = dotWord +"([-\\u2013]" + dotWord +")?"; //dotWord with optional separation by "-"
59

    
60
    //Words used in an epithet/name part for a TaxonName
61
    protected static String nonCapitalEpiWord = "[a-z\u00EF\u00EB\u00F6\u00FC\\-]+";   //a-z + diaeresis for ieou
62
    protected static String capitalEpiWord = "[A-Z]"+ nonCapitalEpiWord;
63

    
64
   //years
65
    protected static String month = "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)";
66
    protected static String singleYear = "\\b" + "(?:17|18|19|20)" + "\\d{2}" + "\\b";       // word boundary followed by either 17,18,19, or 20 (not captured) followed by 2 digits
67
    protected static String datePeriod = TimePeriodParser.strDateWithMonthesPeriod;
68
    protected static String SEP = TimePeriodParser.SEP;
69
    protected static String yearPeriod = singleYear + "("+ fWs + SEP + fWs + singleYear + ")?";
70
    protected static String correctYearPhrase = "(" + yearPeriod + "|" + datePeriod + ")" ;
71

    
72
    public static String verbStart = TimePeriodParser.verbatimStart;
73
    public static String verbEnd = TimePeriodParser.verbatimEnd;
74

    
75
    public static String verbatimYearPhrase = "(" + verbStart + correctYearPhrase + verbEnd + fWs + "\\[" + singleYear + "\\]" +"|"
76
            + correctYearPhrase + oWs+  "publ\\.?" + fWs + correctYearPhrase + ")" ;
77
    public static String undefinedYearPhrase = correctYearPhrase + fWs + "\\[" + correctYearPhrase + "\\]";
78
    protected static String yearPhrase = "(" + correctYearPhrase + "|" + verbatimYearPhrase + "|" + undefinedYearPhrase + ")";
79

    
80
    protected static String yearSeperator = "\\." + oWs;
81
    protected static String detailSeparator = ":" + oWs;
82
    protected static String referenceSeparator1 = "," + oWs ;
83
    protected static String inReferenceSeparator = oWs + "in" + oWs;
84
    protected static String referenceSeperator = "(" + referenceSeparator1 +"|" + inReferenceSeparator + ")" ;
85
    protected static String referenceAuthorSeparator = ","+ oWs;
86
    protected static String volumeSeparator = oWs ;
87
    protected static String referenceEnd = "\\.";
88

    
89

    
90
    //status
91
    protected static String status = "";
92

    
93
    //marker
94
    protected static String InfraGenusMarker = "(n|notho)?(subg(en)?\\.|sect\\.|subsect\\.|ser\\.|subser\\.|t\\.infgen\\.|\\[unranked\\]|\\[ranglos\\])";
95
    protected static String aggrOrGroupMarker = "(aggr\\.|agg\\.|group)";
96
    protected static String infraSpeciesMarkerNoNotho = "(subsp\\.|convar\\.|var\\.|subvar\\.|f\\.|forma|subf\\.|f\\.\\ssp\\.|f\\.spec\\.|f\\.sp\\.|\\[unranked\\]|\\[ranglos\\]|tax\\." + fWs + "infrasp\\.)";
97
    public static String infraSpeciesMarker = "(n|notho)?" + infraSpeciesMarkerNoNotho;
98
    public static String oldInfraSpeciesMarker = "((sub)?prol\\.|(sub)?proles|race|taxon|(sub)?lusus|(sub)?grex)";
99

    
100
    //AuthorString
101
    protected static String qm = "[" + UTF8.QUOT_SINGLE_RIGHT + UTF8.ACUTE_ACCENT + UTF8.QUOT_SINGLE_LEFT_HIGH + "'`]";
102
    //one part of an author name: an optional prefix ([OdDL] + quote mark, quote mark + "t", "ten ", "la "/"le ", "zur ")
    //followed by a capitalized (optionally dotted) word or "DC.", an optional trailing "I" and an optional
    //quote mark + lower-case (dotted) word; alternatively one of the particles van/von/Van/Von (+ " der"), da, du, -e, de, den, del, "de la"
    //NOTE(review): "ten\\s||l[ae]\\s" contains an empty alternative ("||") which looks like a typo; the enclosing
    //group is optional anyway, so the empty branch should be redundant - confirm before removing it
    protected static String authorPart = "(" + "([OdDL]"+qm+"|"+ qm + "t\\s?|ten\\s||l[ae]\\s|zur\\s)?" + "(" + capital2charDotWord + "|DC\\.)I?" + "(" + qm + nonCapitalDotWord + ")?" + "|[vV][ao]n(\\sder)?|da|du|-e|de(n|l|\\sla)?)" ;
103
    protected static String author = "((" + authorPart + "(" + fWs + "|-)" + ")+" + "(f(il)?\\.|secundus|jun\\.|ter|bis)?|Man in "+qm+"t Veld|Sant"+qm+"Anna)" ;
104
    protected static String finalTeamSplitter = "(" + fWs + "(&)" + fWs + "|" + oWs + "et" + oWs + ")";
105
    protected static String notFinalTeamSplitter = "(?:" + fWs + "," + fWs + "|" + finalTeamSplitter + ")";
106
    protected static String authorTeam = fWs + "(((?>" + author + notFinalTeamSplitter + ")*" + author + finalTeamSplitter + ")?(?:"  + author + "|al\\.)|hort\\.)" +  fWs;
107
    protected static String exString = "(ex\\.?)";
108
    protected static String authorAndExTeam = "(" + authorTeam + oWs + exString + oWs + ")?" + authorTeam;
109
    protected static String basStart = "\\(";
110
    protected static String basEnd = "\\)";
111
    protected static String botanicBasionymAuthor = basStart + "(" + authorAndExTeam + ")" + basEnd;  // '(' and ')' is for evaluation with RE.paren(x)
112
    protected static String fullBotanicAuthorString = fWs + "((" + botanicBasionymAuthor +")?" + fWs + authorAndExTeam + "|" + botanicBasionymAuthor +")"+ fWs;
113
    protected static String facultFullBotanicAuthorString = "(" +  fullBotanicAuthorString + ")?" ;
114

    
115
    //Zoo. Author
116
    //TODO does zoo author have ex-Author?
117
    protected static String zooAuthorYearSeperator = "(,|\\s)";
118
    protected static String zooAuthorAddidtion = fWs + zooAuthorYearSeperator + fWs + singleYear;
119
    protected static String zooAuthorTeam = authorTeam + zooAuthorAddidtion;
120
    protected static String zooBasionymAuthor = basStart + "(" + zooAuthorTeam + ")" + basEnd;
121
    protected static String fullZooAuthorString = fWs + "((" + zooBasionymAuthor +")?" + fWs + zooAuthorTeam + "|" + zooBasionymAuthor +")"+ fWs;
122
    protected static String facultFullZooAuthorString = "(" +  fullZooAuthorString + ")?" ;
123

    
124
    protected static String facultFullAuthorString2 = "(" + facultFullBotanicAuthorString + "|" + facultFullZooAuthorString + ")";
125

    
126
    protected static String basionymAuthor = "(" + botanicBasionymAuthor + "|" + zooBasionymAuthor+ ")";
127
    protected static String fullAuthorString = "(" + fullBotanicAuthorString + "|" + fullZooAuthorString+ ")";
128

    
129
    //details
130
    //TODO still not all parsed
131

    
132

    
133
    protected static String simpleRoman = "([IVXLC]+|[ivxlc]+)";
134

    
135
    protected static String nr2 = "\\d{1,2}";
136
    protected static String nr4 = "\\d{1,4}";
137
    protected static String nr5 = "\\d{1,5}";
138

    
139

    
140
    protected static String pPage = nr5 + "[a-zA-Z]?";
141
    protected static String pStrNo = "("+capitalWord + oWs + ")?n[o\u00B0]?\\.?" + fWs + "(" + nr4 + ")";
142

    
143
    protected static String pBracketNr = "\\[" + nr4 + "\\]";
144
    protected static String pFolBracket = "\\[fol\\." + fWs + "\\d{1,2}(-\\d{1,2})?\\]";  //maybe merge with pTabFigPlate (see below)
145

    
146

    
147
    protected static String pRangeSep = "[-\u2013]";
148
    protected static String pRangeSepCo = "[-\u2013,/]";
149

    
150
    protected static String pTabFigPlateStart = "([tT](abs?)?|[fF](igs?)?|[pP]l?s?)(\\.|\\s|$)";   //$ for only 'f'
151
    protected static String pAbcNr = "([a-zA-Z\u00DF]|bis)";
152
    protected static String pTabFigPlateNumber = "(" + nr4 + "|" + pAbcNr + "|" + nr4 + fWs + pAbcNr + "|" + simpleRoman + ")("+ pRangeSepCo + fWs + pAbcNr + ")?";
153
    //one plate/figure number, optionally followed by a separator and a second number, or the literal "s.n."
    //the dots of "s.n." are escaped so that they match a real '.' only and not any arbitrary character
    protected static String pTabFigPlateNumbers = "(" + pTabFigPlateNumber + "(" + pRangeSepCo + fWs + pTabFigPlateNumber + ")?|s\\.n\\.)";
154

    
155
    protected static String pTabFigPlate = pTabFigPlateStart + fWs + pTabFigPlateNumbers + "?";
156
    protected static String pTabFigPl = pTabFigPlate;
157

    
158
    //e.g.: p455; p.455; pp455-456; pp.455-456; pp.455,456; 455, 456; pages 456-457; pages 456,567
159
    protected static String pSinglePages = "(p\\.?)?" + fWs + pPage + "(," + pTabFigPl +"){0,2}";
160
    protected static String pMultiPages = "(pp\\.?|pages)?" + fWs + pPage + fWs + pRangeSepCo +fWs + pPage ;
161
    //static String pPages = pPage + "(," + fWs + "(" + pPage + "|" + pTabFig + ")" + ")?";
162
    protected static String pPages = "(" + pSinglePages +"|" + pMultiPages +")";
163
    protected static String pPagesTabFig = pPages +"([,\\.]" + fWs + pTabFigPl + "){1,2}";
164

    
165
    protected static String pAnyWordWithNumber = word + "(\\.|\\s)" + fWs + "\\d{1,3}";
166

    
167
    protected static String pTabSpecial = "tab\\." + fWs + "(ad" + fWs + "\\d{1,3}|alphab)";
168
    protected static String pPageSpecial = nr4 + fWs + "(in obs|, Expl\\. Tab)";
169
    protected static String pSpecialGardDict = capitalWord + oWs + "n\u00B0" + oWs + "\\d{1,2}";
170
    //TODO
171
    // protected static String pSpecialDetail = "(in err|in tab|sine pag|add\\. & emend|Emend|""\\d{3}"" \\[\\d{3}\\])";
172
 // protected static String pSpecialDetail = "(in err|in tab|sine pag|add\\. & emend|Emend|""\\d{3}"" \\[\\d{3}\\])";
173
    //fixed phrases accepted as detail: "in err", "in tab", "sine pag", "add.", "s.p" / "s.p." and "errata"
    //the dots of "s.p." are escaped so that they match a real '.' only (unescaped they matched any character)
    protected static String pSpecialDetail = "(in err|in tab|sine pag|add\\.|s\\.p\\.?|errata)";
174

    
175

    
176
//    "(,\\s*" + pTabFigPl + ")?" +
177
    protected static String pDetailAlternatives = "(" + pPages + "|" + pPageSpecial + "|" + pStrNo + "|" + pBracketNr +
178
    			"|" + pTabFigPl + "(,\\s*" + pTabFigPl + ")?" + "|" + pTabSpecial + "|" + pFolBracket + "|" + pAnyWordWithNumber + "|" +
179
    			pSpecialGardDict + "|" + pSpecialDetail + "|" + pPagesTabFig + "|" + simpleRoman +  ")";
180

    
181
    protected static String detail = pDetailAlternatives;
182

    
183
    //reference
184
    protected static String bracketVolume = "(" + nr4 + "[A-Za-z]?" + "([-\u2013,]\\s*" + nr4 + ")?|" + "((\\d{1,2},\\s*)?(Suppl|Beibl|App|Beil|Misc|Vorabdr|Erg|Bih|(Sess\\.\\s*)?Extr|Reimpr|Bibl|Polypet|Litt|Phys|Orchid)\\.(\\s*\\d{1,4})?|Heft\\s*\\d{1,4}|Extra)){1,2}";
185
    protected static String volume =  nr4 + "([-\u2013]"+nr4+")?" + "[A-Za-z]?" + fWs + "(\\("+ bracketVolume + "\\))?";
186
//    protected static String volume_old = nr4 + "[A-Za-z]?" + fWs + "(\\("+ nr4 + "[A-Za-z]?" + "([-\u2013]" + nr4 + ")?\\)|[-\u2013]"+nr4+")?" + "(\\(((\\d{1,2},\\s*)?(Suppl|Beibl|App|Beil|Misc|Vorabdr|Erg)\\.(\\s*\\d{1,4})?|Heft\\s*\\d{1,4})\\))?";
187
    //this line caused problem https://dev.e-taxonomy.eu/redmine/issues/1556 in its original form: "([\u005E:\\.]" + fWs + ")";
188
    protected static String anySepChar = "([\u005E:a-zA-Z]" + fWs + "|" +oWs + "&" + oWs + ")"; //all characters except for the detail separator, a stricter version would be [,\\-\\&] and some other characters
189
//  protected static String anySepChar = "([,\\-\\&\\.\\+\\']" + fWs + ")";
190
    protected static String quotations = "\""+capitalDotWord + "(" + oWs + capitalDotWord +")*\"";
191

    
192
    protected static int authorSeparatorMaxPosition = 3;  //author may have a maximum of 2 words
193
    protected static String pTitleWordSeparator = "(\\."+ fWs+"|" + oWs + "|\\.?[-\u2013]"+oWs+"|\\.?" + oWs + "&(?!\\s*al\\.)" + oWs + ")";
194
    protected static String pSeriesPart = fWs + ",?" + fWs + "(([sS][e\u00E9]r|сер)("+oWs+"|\\."+fWs+")(\\d{1,2}|[A-Z](\\s*\\d{1,2})?)|n(ov)?\\.\\s*[sS](er)?\\.|Jerusalem Ser\\.|(Pt|Sect)\\.\\s*\\d{1,2}),?";  //Pt. (Part) and Sect. (Section) currently handled as series part, which is part of title, may be handled different later
195

    
196
    protected static String authorPrefix = "(Da(lla)?|Van|La|De)" + oWs; //should not include words allowed in first part of reference title
197
    protected static String firstTitleWord = "(?!"+authorPrefix+")" + word + "('\\p{javaLowerCase}*|\\.?[-\u2013]"+dotWord+")?"; //word with optional apostrophe in between
198

    
199
    protected static String singleJournalTitles = "PhytoKeys|PLoS ONE";
200
    protected static String referenceTitleFirstPart = "(" + firstTitleWord + pTitleWordSeparator + "|" + twoCapitalDotWord + fWs + ")";
201
    protected static String referenceTitleBase = "("+ referenceTitleFirstPart + "*" + "("+ dashDotWord + "|" + uppercaseWord + "|" + quotations + ")"
202
                    + "|" +singleJournalTitles + ")";  //reference title may have words separated by whitespace or dot. The last word may not have a whitespace at the end. There must be at least one word
203
    protected static String referenceTitleBaseWithSeries = referenceTitleBase + "("+ pSeriesPart + ")?";
204
    protected static String referenceTitle = "("+referenceTitleBaseWithSeries +")";
205
    protected static String referenceTitleWithSepCharacters = "(((" + referenceTitle +"|\\(.+\\))"  + anySepChar + ")*" + referenceTitle + ")"; //,?
206
    //TODO test performance ??
207
    protected static String referenceTitleWithSepCharactersAndBrackets = referenceTitleWithSepCharacters + fWs + "(\\(" + referenceTitleWithSepCharacters + "\\)"+fWs+ ")?(" + referenceTitleWithSepCharacters +")?"  ;
208

    
209
    protected static String referenceTitleWithoutAuthor = "(" + referenceTitleFirstPart + "){"+ (authorSeparatorMaxPosition -1) +",}" + dotWord +
210
    			anySepChar + referenceTitleWithSepCharactersAndBrackets ;   //separators exist and first separator appears at position authorSeparatorMaxPosition or later
211
    protected static String referenceTitleWithPlaceBracket = referenceTitle + "(" + oWs + "\\(" + capitalWord + "(" + oWs + "(&\\s+)?" +  capitalWord + ")?" + "\\))?" ;
212

    
213
    protected static String editionSeparator = "(" + oWs + "|," + fWs + ")ed\\.?" + oWs;  //
214
    public static String pEdition = nr2;
215

    
216
    protected static String pVolPart = volumeSeparator + volume;
217
    protected static String pEditionPart = "(" + editionSeparator +  pEdition +"([A-Z]|\\s*bis)?|,\\s*(jubilee|nouv\\.) ed\\.)";
218
    protected static String pEditionVolPart = pEditionPart + fWs + "," + volumeSeparator + volume;
219
    protected static String pEditionVolAlternative = "(" + pEditionPart + "|" + pVolPart + "|" + pEditionVolPart + ")?";
220

    
221
//    protected static String pVolRefTitle = referenceTitle + "(" + pVolPart + ")?";
222
    protected static String pVolRefTitle = referenceTitleWithPlaceBracket + "(" + pVolPart + ")?";
223
    protected static String softEditionVolRefTitle = referenceTitleWithSepCharactersAndBrackets + pEditionVolAlternative;
224
    protected static String softVolNoAuthorRefTitle = referenceTitleWithoutAuthor + "(" + volumeSeparator +  volume + ")?";
225

    
226
    protected static String pBookReference = softEditionVolRefTitle;
227
    protected static String pBookSectionReference = authorTeam + referenceAuthorSeparator + softEditionVolRefTitle;
228
    protected static String pArticleReference = pVolRefTitle;
229
    protected static String pSoftArticleReference = softVolNoAuthorRefTitle;
230

    
231
    protected static String pReferenceSineDetail = "(" + pArticleReference + "|" + pBookSectionReference + "|" + pBookReference + ")";
232

    
233
    protected static String pReference = pReferenceSineDetail + detailSeparator + detail +
234
					yearSeperator + yearPhrase + "(" + referenceEnd + ")?";
235

    
236
    //static String strictBook = referenc
237

    
238
    protected static Pattern referencePattern = Pattern.compile(pReference);
239
    protected static Pattern referenceSineDetailPattern = Pattern.compile(pReferenceSineDetail);
240

    
241
    protected static String originalSpellingName = "("+capitalEpiWord +")?(" + fWs + "\\(" + capitalEpiWord + "\\))?(" +
242
                        fWs + nonCapitalEpiWord+")?(" + oWs + infraSpeciesMarker + oWs + nonCapitalEpiWord + ")?";
243
    protected static String originalSpellingStart = "\\[as \"";
244
    protected static String originalSpellingEnd = "\"\\]";
245

    
246
    protected static String pOriginalSpelling = oWs + originalSpellingStart + originalSpellingName + originalSpellingEnd;
247
    protected static Pattern originalSpellingPattern = Pattern.compile(pOriginalSpelling);
248

    
249
    protected static String pNomStatusNom =
250
            "nom\\." + fWs + "(ambig\\.|dub\\.|confus\\.|superfl\\.|nud\\.|illeg\\.|inval\\.|cons\\.(\\s*(prop|des)\\.)?|altern(ativ)?\\.|subnud\\.|nov\\.|legit\\.|sanct\\.|val\\.|"+
251
    			"rej\\.("+ fWs + "prop\\.)?|provis\\.|utique"+fWs+"rej\\.("+fWs+"prop\\.)?)";
252
    protected static String pNomStatusOrth = "orth\\." + fWs + "(var\\.|rej\\.|cons\\.("+fWs+"prop\\.)?)";
253
    protected static String pNomStatusComb = "comb\\." + fWs + "(inval\\.|illeg\\.|nov\\.)";
254
    protected static String pNomStatusOpus = "op\\." + fWs + "utique" + fWs + "oppr\\.";
255
    protected static String pNomStatusIned = "ined\\.";
256

    
257

    
258
    protected static String pNomStatus = "(" + pNomStatusNom + "|" + pNomStatusOrth + "|" +pNomStatusComb + "|" + pNomStatusOpus + "|" + pNomStatusIned + ")";
259
    protected static String pNomStatusPhrase1 = "," + fWs + pNomStatus;
260
    protected static String pNomStatusPhrase2 = "\\[" + fWs + pNomStatus + "\\]";
261

    
262
    protected static String pNomStatusPhrase = "(?:" + pNomStatusPhrase1 + "|" + pNomStatusPhrase2 + ")";
263

    
264
// Soraya
265
//pro syn.
266
//provisional synonym
267
//fossil name
268

    
269
    //cultivars and hybrids
270
    protected static String cultivarWord = "[a-zA-Z0-9-,\u2019!/\\.\\\\]+";
271
    protected static String cultivarPhrase = cultivarWord + "("+ oWs + cultivarWord + ")*";   //TODO still unclear if groups and grex really may also contain special characters
272
    protected static String cultivarStatus = qm + cultivarPhrase + qm; //for stricter rules see Art. 21.xxx (but most of them are time dependend)
273
    public static String group = "(Gp|Groupe?|Grupp(en?|o))";
274
    public static String grex = "g(re)?x";
275
    //either "<cultivar phrase> <group word>" or "<group word> <cultivar phrase>"
    //the alternation is wrapped in a non-capturing group so that it stays local when this fragment is concatenated
    //with further regex parts (e.g. the surrounding brackets in anyCultivarNameUnordered); being non-capturing
    //it does not shift the group numbers of any pattern that embeds this fragment
    protected static String cultivarGroupStatus = "(?:" + cultivarPhrase + oWs + group + or + group + oWs + cultivarPhrase + ")";
276
    protected static String cultivarGrexStatus = (cultivarPhrase + oWs + grex );
277

    
278
    protected static String cultivarOld =  oWs + "'..+'"; //Careful with apostroph in author names
279
    protected static String cultivarOldMarker = oWs + "(cv\\.|')";
280
    protected static String notho = "notho";
281
    protected static String hybridPart = "([xX]" + oWs + "|"+hybridSign+"|"+notho+")";
282
    protected static String noNothoHybridPart = "([xX]" + oWs + "|"+hybridSign+")";
283
    protected static String hybridFull = "(" +oWs +"|"+ pStart +")" + noNothoHybridPart;  //for some reason infraspecific notho ranks do not parse if notho is allowed as uninomial prefix.
284
    protected static String hybridFormularSeparator = oWs + "[" + hybridSign + "xX]" + oWs;
285

    
286

    
287
    //  Name String
288
    protected static String genusOrSupraGenus = "("+hybridFull+")?" + capitalEpiWord;
289
    protected static String infraGenus = capitalEpiWord + oWs + InfraGenusMarker + oWs + capitalEpiWord;
290
    protected static String aggrOrGroup = capitalEpiWord + oWs + nonCapitalEpiWord + oWs + aggrOrGroupMarker;
291
    protected static String spNov = "sp\\.(\\s*nov\\.)?(\\s*\\d{1,2})?";
292
    protected static String specificEpi = "(" + nonCapitalEpiWord + "|" + spNov + ")";
293
    protected static String species = genusOrSupraGenus + oWs + "("+hybridPart+")?" + specificEpi;
294
    protected static String speciesWithInfraGen = genusOrSupraGenus + oWs + "\\(" + capitalEpiWord + "\\)" + oWs + specificEpi;
295

    
296
    protected static String infraSpecies = species + oWs + infraSpeciesMarker + oWs + "("+hybridPart+")?" + nonCapitalEpiWord;
297
    protected static String zooInfraSpecies = species + oWs + "(" + infraSpeciesMarker + oWs +")?" + "("+hybridPart+")?" + nonCapitalEpiWord;
298
    protected static String oldInfraSpecies = capitalEpiWord + oWs +  nonCapitalEpiWord + oWs + oldInfraSpeciesMarker + oWs + nonCapitalEpiWord;
299
    //capitalized word + lower-case epithet + author string + infraspecific marker + repetition of the epithet via backreference \1
    //NOTE(review): \1 is an absolute group number; it points at the epithet group only if no capturing group precedes
    //this fragment in the final pattern (the case in autonymPattern, but not when embedded in anyPureBotanicName) - confirm
    protected static String autonym = capitalEpiWord + oWs + "(" + nonCapitalEpiWord +")" + oWs + fullBotanicAuthorString +  oWs + infraSpeciesMarker + oWs + "\\1";  //2-nd word and last word are the same
300
    //capitalized word + author string + infrageneric marker + repetition of the first word via backreference \1
    //NOTE(review): \1 is an absolute group number, so this fragment only works as intended if its leading group
    //is the first capturing group of the final pattern (the case in genusAutonymPattern) - confirm for other usages
    protected static String genusAutonym = "("+capitalEpiWord+")" + oWs + fullBotanicAuthorString + oWs + InfraGenusMarker + oWs + "\\1";  //1st word and last word are the same
301
    //autonym patterns used within anyBotanicalFullName pattern as we need another group number there
302
    protected static String autonym2 =     "("+capitalEpiWord+")" + oWs
303
            + "(" + hybridSign + "?(" + nonCapitalEpiWord +")" + oWs + fullBotanicAuthorString + oWs + infraSpeciesMarker + oWs + "\\4|"  //infraspecific autonym
304
            +       fullBotanicAuthorString + oWs + InfraGenusMarker + oWs + "\\2"  //infrageneric autonym
305
            + ")";  //2-nd word and last word are the same
306

    
307
    protected static String anyPureBotanicName = "(" + genusOrSupraGenus + "|" + infraGenus + "|" + aggrOrGroup + "|" + species + "|" +
308
                    speciesWithInfraGen + "|" + infraSpecies + "|" + oldInfraSpecies + "|" + autonym + "|" + genusAutonym + ")+";
309
    protected static String anyBotanicName = anyPureBotanicName; //no difference yet, cultivars are currently implemented differently. was: (anyPureBotanicName + or + cultivar + or + cultivarGroup);
310
    protected static String cultivarGr = "(\\s+(?<cultivar>" + cultivarStatus + "))?";
311
    protected static String cultivarGroupGr = "(\\s+((?<cultivarGroup>" + cultivarGroupStatus + ")|\\((?<cultivarBrGroup>" + cultivarGroupStatus + ")\\)))?";
312
//    protected static String cultivarGroupGrx = "(\\s+\\((?<cultivarBrGroup>" + cultivarGroupStatus + ")\\))?";
313
    protected static String cultivarGrexGr =  "(\\s+(?<cultivarGrex>"       + cultivarGrexStatus  +    "))?";
314

    
315
    //to be used in combination if cultivar name is not a pure name but contains additional information (e.g. author)
316
    protected static String anyCultivarNameUnordered = anyPureBotanicName + oWs + "(" + cultivarStatus + or + cultivarGroupStatus + or + "\\(" + cultivarGroupStatus + "\\)" + or + cultivarGrexStatus + ")";
317
    protected static String anyCultivarName = "("+ anyPureBotanicName + ")(?!\\s*$)(" + cultivarGrexGr + cultivarGroupGr + cultivarGr + ")";
318

    
319
    protected static String anyZooName = "(" + genusOrSupraGenus + "|" + infraGenus + "|" + aggrOrGroup + "|" + species + "|" +
320
                    speciesWithInfraGen + "|" +zooInfraSpecies + "|" +  oldInfraSpecies + ")+";
321
    protected static String anyBotanicFullName = "(" + autonym2 + "|" + anyBotanicName + oWs + fullBotanicAuthorString + ")"  ;
322
    protected static String anyCultivarFullName = anyCultivarName + oWs + "(?<cultivarAuthor>" + authorTeam +")";
323
    protected static String anyZooFullName = anyZooName + oWs + fullZooAuthorString ;
324
    protected static String anyFullName = "(" + anyBotanicFullName + "|" + anyZooFullName + ")";
325
    protected static String abbrevHybridGenus = "([A-Z](\\.\\s*|\\s+))";
326
    protected static String abbrevHybridSecondPartWithSpecies = abbrevHybridGenus + "?" + nonCapitalEpiWord + "(" + oWs + infraSpeciesMarkerNoNotho + oWs + nonCapitalEpiWord + ")?";  //#5983 first step but still to strict
327
    protected static String abbrevHybridSecondPartOnlyInfraSpecies = infraSpeciesMarkerNoNotho + oWs + nonCapitalEpiWord;
328
    protected static String abbrevHybridSecondPart = "(" + abbrevHybridSecondPartWithSpecies + "|" + abbrevHybridSecondPartOnlyInfraSpecies + ")";
329

    
330
    protected static String hybridSecondPart = "(" + anyFullName  + "|" +  anyBotanicName + "|" + anyZooName + "|" + abbrevHybridSecondPart + ")";
331
    protected static String hybridFullName = "(" + anyFullName  + "|" +  anyBotanicName + "|" + anyZooName + ")" + hybridFormularSeparator + hybridSecondPart ;
332

    
333
    //Pattern
334
    protected static Pattern oWsPattern = Pattern.compile(oWs);
335
    protected static Pattern finalTeamSplitterPattern = Pattern.compile(finalTeamSplitter);
336
    protected static Pattern anyCultivarNamePattern = Pattern.compile(anyCultivarName);
337
    protected static Pattern anyCultivarNameUnorderedPattern = Pattern.compile(anyCultivarNameUnordered);
338

    
339
    protected static Pattern genusOrSupraGenusPattern = Pattern.compile(pStart + genusOrSupraGenus + facultFullAuthorString2 + end);
340
    protected static Pattern infraGenusPattern = Pattern.compile(pStart + infraGenus + facultFullAuthorString2 + end);
341
    protected static Pattern aggrOrGroupPattern = Pattern.compile(pStart + aggrOrGroup + fWs + end); //aggr. or group has no author string
342
    protected static Pattern speciesPattern = Pattern.compile(pStart + species + facultFullAuthorString2 + end);
343
    protected static Pattern speciesWithInfraGenPattern = Pattern.compile(pStart + speciesWithInfraGen + facultFullAuthorString2 + end);
344
    protected static Pattern infraSpeciesPattern = Pattern.compile(pStart + infraSpecies + facultFullAuthorString2 + end);
345
    protected static Pattern zooInfraSpeciesPattern = Pattern.compile(pStart + zooInfraSpecies + facultFullAuthorString2 + end);
346
    protected static Pattern oldInfraSpeciesPattern = Pattern.compile(pStart + oldInfraSpecies + facultFullAuthorString2 + end);
347
    protected static Pattern autonymPattern = Pattern.compile(pStart + autonym + fWs + end);
348
    protected static Pattern genusAutonymPattern = Pattern.compile(pStart + genusAutonym + fWs + end);
349
    protected static Pattern hybridFormulaPattern = Pattern.compile(pStart + hybridFullName + fWs + end);
350

    
351
    protected static Pattern botanicBasionymPattern = Pattern.compile(botanicBasionymAuthor);
352
    protected static Pattern zooBasionymPattern = Pattern.compile(zooBasionymAuthor);
353
    protected static Pattern basionymPattern = Pattern.compile(basionymAuthor);
354

    
355
    protected static Pattern zooAuthorPattern = Pattern.compile(zooAuthorTeam);
356
    protected static Pattern zooAuthorAddidtionPattern = Pattern.compile(zooAuthorAddidtion);
357

    
358
    protected static Pattern exAuthorPattern = Pattern.compile(oWs + exString);
359

    
360
    protected static Pattern fullBotanicAuthorStringPattern = Pattern.compile(fullBotanicAuthorString);
361
    protected static Pattern fullZooAuthorStringPattern = Pattern.compile(fullZooAuthorString);
362
    protected static Pattern fullAuthorStringPattern = Pattern.compile(fullAuthorString);
363

    
364
    protected static Pattern anyBotanicFullNamePattern = Pattern.compile(anyBotanicFullName);
365
    protected static Pattern anyCultivarFullNamePattern = Pattern.compile(anyCultivarFullName);
366
    protected static Pattern anyZooFullNamePattern = Pattern.compile(anyZooFullName);
367

    
368
    protected static Pattern spNovPattern = Pattern.compile(spNov);
369
}
(6-6/9)