Solved an issue where a reference with an uppercase word (e.g. SSSR) was not parsable
[cdmlib.git] / cdmlib-model / src / main / java / eu / etaxonomy / cdm / strategy / parser / NonViralNameParserImplRegExBase.java
1 /**
2 * Copyright (C) 2009 EDIT
3 * European Distributed Institute of Taxonomy
4 * http://www.e-taxonomy.eu
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * See LICENSE.TXT at the top of this package for the full license terms.
8 */
9
10 package eu.etaxonomy.cdm.strategy.parser;
11
12 import java.util.regex.Pattern;
13
14 import org.apache.log4j.Logger;
15
16
17 /**
18 * This class is a base class that separates regex parts of the parser from methods
19 * @author a.mueller
20 *
21 */
22 public abstract class NonViralNameParserImplRegExBase {
23 @SuppressWarnings("unused")
24 private static final Logger logger = Logger.getLogger(NonViralNameParserImplRegExBase.class);
25
26 // good intro: http://java.sun.com/docs/books/tutorial/essential/regex/index.html
27
28 //splitter
29 protected static String epiSplitter = "(\\s+|\\(|\\))"; //( ' '+| '(' | ')' )
30 protected static Pattern pattern = Pattern.compile(epiSplitter);
31
32 public static final String hybridSign = "\u00D7";
33
34 //some useful non-terminals
35 protected static String pStart = "^";
36 protected static String end = "$";
37 protected static String anyEnd = ".*" + end;
38 protected static String oWs = "\\s+"; //obligatory whitespaces
39 protected static String fWs = "\\s*"; //facultative whitespcace
40
41 protected static String capitalWord = "\\p{javaUpperCase}\\p{javaLowerCase}*";
42 protected static String nonCapitalWord = "\\p{javaLowerCase}+";
43 protected static String word = "(" + capitalWord + "|" + nonCapitalWord + ")"; //word (capital or non-capital) with no '.' at the end
44 protected static String uppercaseWord = "\\p{javaUpperCase}+";
45
46 protected static String capitalDotWord = capitalWord + "\\.?"; //capitalWord with facultativ '.' at the end
47 protected static String nonCapitalDotWord = nonCapitalWord + "\\.?"; //nonCapitalWord with facultativ '.' at the end
48 protected static String dotWord = "(" + capitalWord + "|" + nonCapitalWord + ")\\.?"; //word (capital or non-capital) with facultativ '.' at the end
49 protected static String obligateDotWord = "(" + capitalWord + "|" + nonCapitalWord + ")\\.+"; //word (capital or non-capital) with obligate '.' at the end
50
51 //Words used in an epethiton for a TaxonName
52 protected static String nonCapitalEpiWord = "[a-z\u00EF\\-]+";
53 protected static String capitalEpiWord = "[A-Z]"+ nonCapitalEpiWord;
54
55
56 //years
57 protected static String month = "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)";
58 protected static String singleYear = "\\b" + "(?:17|18|19|20)" + "\\d{2}" + "\\b"; // word boundary followed by either 17,18,19, or 20 (not captured) followed by 2 digits
59 protected static String yearPhrase = singleYear + "("+ fWs + "-" + fWs + singleYear + ")?" ;
60 //+ "(" + month + ")?)" ; // optional month
61
62 //seperator
63 protected static String yearSeperator = "\\." + oWs;
64 protected static String detailSeparator = ":" + oWs;
65 protected static String referenceSeparator1 = "," + oWs ;
66 protected static String inReferenceSeparator = oWs + "in" + oWs;
67 protected static String referenceSeperator = "(" + referenceSeparator1 +"|" + inReferenceSeparator + ")" ;
68 protected static String referenceAuthorSeparator = ","+ oWs;
69 protected static String volumeSeparator = oWs ;
70 protected static String referenceEnd = "\\.";
71
72
73 //status
74 protected static String status = "";
75
76 //marker
77 protected static String InfraGenusMarker = "(subgen.|subg.|sect.|subsect.|ser.|subser.|t.infgen.)";
78 protected static String aggrOrGroupMarker = "(aggr.|agg.|group)";
79 protected static String infraSpeciesMarker = "(subsp.|convar.|var.|subvar.|f.|subf.|f.spec.|tax." + fWs + "infrasp.)";
80 protected static String oldInfraSpeciesMarker = "(prol.|proles|race|taxon|sublusus)";
81
82
83 //AuthorString
84 protected static String authorPart = "(" + "(d'|D'|L'|'t\\s)?" + capitalDotWord + "('" + nonCapitalDotWord + ")?" + "|da|de(n|l|\\sla)?)" ;
85 protected static String author = "(" + authorPart + "(" + fWs + "|-)" + ")+" + "(f.|fil.|secundus)?";
86 protected static String teamSplitter = fWs + "(&)" + fWs;
87 protected static String authorTeam = fWs + "(" + author + teamSplitter + ")*" + author + "(" + teamSplitter + "al.)?" + fWs;
88 protected static String exString = "(ex.?)";
89 protected static String authorAndExTeam = "(" + authorTeam + oWs + exString + oWs + ")?" + authorTeam;
90 protected static String basStart = "\\(";
91 protected static String basEnd = "\\)";
92 protected static String botanicBasionymAuthor = basStart + "(" + authorAndExTeam + ")" + basEnd; // '(' and ')' is for evaluation with RE.paren(x)
93 protected static String fullBotanicAuthorString = fWs + "((" + botanicBasionymAuthor +")?" + fWs + authorAndExTeam + "|" + botanicBasionymAuthor +")"+ fWs;
94 protected static String facultFullBotanicAuthorString = "(" + fullBotanicAuthorString + ")?" ;
95
96 //Zoo. Author
97 //TODO does zoo author have ex-Author?
98 protected static String zooAuthorYearSeperator = ",";
99 protected static String zooAuthorAddidtion = fWs + zooAuthorYearSeperator + fWs + singleYear;
100 protected static String zooAuthorTeam = authorTeam + zooAuthorAddidtion;
101 protected static String zooBasionymAuthor = basStart + "(" + zooAuthorTeam + ")" + basEnd;
102 protected static String fullZooAuthorString = fWs + "((" + zooBasionymAuthor +")?" + fWs + zooAuthorTeam + "|" + zooBasionymAuthor +")"+ fWs;
103 protected static String facultFullZooAuthorString = "(" + fullZooAuthorString + ")?" ;
104
105 protected static String facultFullAuthorString2 = "(" + facultFullBotanicAuthorString + "|" + facultFullZooAuthorString + ")";
106
107 protected static String basionymAuthor = "(" + botanicBasionymAuthor + "|" + zooBasionymAuthor+ ")";
108 protected static String fullAuthorString = "(" + fullBotanicAuthorString + "|" + fullZooAuthorString+ ")";
109
110 //details
111 //TODO still very simple
112
113
114 protected static String nr2 = "\\d{1,2}";
115 protected static String nr4 = "\\d{1,4}";
116 protected static String nr5 = "\\d{1,5}";
117
118
119 protected static String pPage = nr5 + "[a-z]?";
120 protected static String pStrNo = "n\u00B0" + fWs + "(" + nr4 + ")";
121
122 protected static String pBracketNr = "\\[" + nr4 + "\\]";
123 protected static String pFolBracket = "\\[fol\\." + fWs + "\\d{1,2}(-\\d{1,2})?\\]";
124
125 protected static String pStrTab = "tab\\." + fWs + nr4 + "(" + fWs + "(B|\u00DF|\\(\\d{1,3}\\)))?";
126 protected static String pFig = "fig." + fWs + nr4 + "[a-z]?";
127 protected static String pFigs = pFig + "(-" + nr4 + ")?";
128 //static String pTabFig = pStrTab + "(," + fWs + pFigs + ")?";
129 protected static String pTabFig = "(" + pStrTab + "|" + pFigs + ")";
130
131 //e.g.: p455; p.455; pp455-456; pp.455-456; pp.455,456; 455, 456; pages 456-457; pages 456,567
132 protected static String pSinglePages = "(p\\.?)?" + fWs + pPage + "(," + pTabFig +")?";
133 protected static String pMultiPages = "(pp\\.?|pages)?" + fWs + pPage + fWs + "(-|,)" +fWs + pPage ;
134 //static String pPages = pPage + "(," + fWs + "(" + pPage + "|" + pTabFig + ")" + ")?";
135 protected static String pPages = "(" + pSinglePages +"|" + pMultiPages +")";
136
137
138 protected static String pCouv = "couv\\." + fWs + "\\d{1,3}";
139
140 protected static String pTabSpecial = "tab\\." + fWs + "(ad" + fWs + "\\d{1,3}|alphab)";
141 protected static String pPageSpecial = nr4 + fWs + "(in obs|, Expl\\. Tab)";
142 protected static String pSpecialGardDict = capitalWord + oWs + "n\u00B0" + oWs + "\\d{1,2}";
143 //TODO
144 // protected static String pSpecialDetail = "(in err|in tab|sine pag|add\\. & emend|Emend|""\\d{3}"" \\[\\d{3}\\])";
145 // protected static String pSpecialDetail = "(in err|in tab|sine pag|add\\. & emend|Emend|""\\d{3}"" \\[\\d{3}\\])";
146 protected static String pSpecialDetail = "(in err|in tab|sine pag|add\\.)";
147
148
149 // Const romI = "[Ii]{0,3}"
150 // Const romX = "[Xx]{0,3}"
151 // Const romC = "[Cc]{0,3}"
152 // Const romM = "[Mm]{0,3}"
153 // ' roman numbers
154 // ' !! includes empty string: ""
155 // romOne = "([Vv]?" & romI & or_ & "(IV|iv)" & or_ & "(IX|ix)" & ")"
156 // romTen = "([Ll]?" & romX & or_ & "(XL|xl)" & or_ & "(XC|xc)" & ")"
157 // romHun = "([Dd]?" & romC & or_ & "(CD|cd)" & or_ & "(CM|cm)" & ")"
158 // romNr = "(?=[MDCLXVImdclxvi])(((" & romM & ")?" & romHun & ")?" & romTen & ")?" & romOne
159 protected static String pRomNr = "ljfweffaflas"; //TODO rom number have to be tested first
160
161 protected static String pDetailAlternatives = "(" + pPages + "|" + pPageSpecial + "|" + pStrNo + "|" + pBracketNr +
162 "|" + pTabFig + "|" + pTabSpecial + "|" + pFolBracket + "|" + pCouv + "|" + pRomNr + "|" +
163 pSpecialGardDict + "|" + pSpecialDetail + ")";
164
165 protected static String detail = pDetailAlternatives;
166
167 //reference
168 protected static String volume = nr4 + "(\\("+ nr4 + "\\))?";
169 //this line caused problem https://dev.e-taxonomy.eu/trac/ticket/1556 in its original form: "([\u005E:\\.]" + fWs + ")";
170 protected static String anySepChar = "([\u005E:a-zA-Z]" + fWs + ")"; //all characters except for the detail separator, a stricter version would be [,\\-\\&] and some other characters
171 // protected static String anySepChar = "([,\\-\\&\\.\\+\\']" + fWs + ")";
172
173 protected static int authorSeparatorMaxPosition = 4; //Author may have a maximum of 4 words
174 protected static String pTitleWordSeparator = "(\\."+ fWs+"|" + oWs + ")";
175 protected static String pSeriesPart = ",?" + fWs + "[sS]er(\\.)?" + oWs + "\\d{1,2},?";
176 protected static String referenceTitleFirstPart = "(" + word + pTitleWordSeparator + ")";
177 protected static String referenceTitle = referenceTitleFirstPart + "*" + "("+ dotWord + "|" + uppercaseWord + "|" + pSeriesPart + ")"; //reference title may have words seperated by whitespace or dot. The last word may not have a whitespace at the end. There must be at least one word
178 protected static String referenceTitleWithSepCharacters = "(((" + referenceTitle +"|\\(.+\\))" + anySepChar + ")*" + referenceTitle + ")"; //,?
179 //TODO test performance ??
180 protected static String referenceTitleWithSepCharactersAndBrackets = referenceTitleWithSepCharacters + fWs + "(\\(" + referenceTitleWithSepCharacters + "\\)"+fWs+ ")?(" + referenceTitleWithSepCharacters +")?" ;
181
182 protected static String referenceTitleWithoutAuthor = "(" + referenceTitleFirstPart + ")" + "{"+ (authorSeparatorMaxPosition -1) +",}" + dotWord +
183 anySepChar + referenceTitleWithSepCharactersAndBrackets ; //separators exist and first separator appears at position authorSeparatorMaxPosition or later
184
185 protected static String editionSeparator = "(" + oWs + "|," + fWs + ")ed\\.?" + oWs; //
186 protected static String pEdition = nr2;
187
188 protected static String pVolPart = volumeSeparator + volume;
189 protected static String pEditionPart = editionSeparator + pEdition;
190 protected static String pEditionVolPart = editionSeparator + pEdition + fWs + "," + volumeSeparator + volume;
191 protected static String pEditionVolAlternative = "(" + pEditionPart + "|" + pVolPart + "|" + pEditionVolPart + ")?";
192
193 protected static String pVolRefTitle = referenceTitle + "(" + pVolPart + ")?";
194 protected static String softEditionVolRefTitle = referenceTitleWithSepCharactersAndBrackets + pEditionVolAlternative;
195 protected static String softVolNoAuthorRefTitle = referenceTitleWithoutAuthor + "(" + volumeSeparator + volume + ")?";
196
197 protected static String pBookReference = softEditionVolRefTitle;
198 protected static String pBookSectionReference = authorTeam + referenceAuthorSeparator + softEditionVolRefTitle;
199 protected static String pArticleReference = pVolRefTitle ;
200 protected static String pSoftArticleReference = softVolNoAuthorRefTitle ;
201
202
203 protected static String pReferenceSineDetail = "(" + pArticleReference + "|" + pBookSectionReference + "|" + pBookReference + ")";
204
205
206 protected static String pReference = pReferenceSineDetail + detailSeparator + detail +
207 yearSeperator + yearPhrase + "(" + referenceEnd + ")?";
208
209 //static String strictBook = referenc
210
211
212
213 protected static Pattern referencePattern = Pattern.compile(pReference);
214 protected static Pattern referenceSineDetailPattern = Pattern.compile(pReferenceSineDetail);
215
216 protected static String pNomStatusNom = "nom\\." + fWs + "(superfl\\.|nud\\.|illeg\\.|inval\\.|cons\\.|alternativ\\.|subnud.|"+
217 "rej\\.|rej\\."+ fWs + "prop\\.|provis\\.)";
218 protected static String pNomStatusOrthVar = "orth\\." + fWs + "var\\.";
219 protected static String pNomStatus = "(" + pNomStatusNom + "|" + pNomStatusOrthVar + ")";
220 protected static String pNomStatusPhrase1 = "," + fWs + pNomStatus;
221 protected static String pNomStatusPhrase2 = "\\[" + fWs + pNomStatus + "\\]";
222
223 protected static String pNomStatusPhrase = "(?:" + pNomStatusPhrase1 + "|" + pNomStatusPhrase2 + ")";
224
225 // Soraya
226 //opus utique oppr.
227 //pro syn.
228 //provisional synonym
229 //fossil name
230
231
232
233 //cultivars and hybrids
234 protected static String cultivar = oWs + "'..+'"; //Achtung mit Hochkomma in AuthorNamen
235 protected static String cultivarMarker = oWs + "(cv.|')";
236 protected static String hybridPart = "((x|X)" + oWs + "|"+hybridSign+"|notho)";
237 protected static String hybridFull = "(" +oWs +"|"+ pStart +")" + hybridPart;
238
239
240 // Name String
241 protected static String genusOrSupraGenus = "("+hybridFull+")?" + capitalEpiWord;
242 protected static String infraGenus = capitalEpiWord + oWs + InfraGenusMarker + oWs + capitalEpiWord;
243 protected static String aggrOrGroup = capitalEpiWord + oWs + nonCapitalEpiWord + oWs + aggrOrGroupMarker;
244 protected static String species = genusOrSupraGenus + oWs + "("+hybridPart+")?" + nonCapitalEpiWord;
245 protected static String infraSpecies = species + oWs + infraSpeciesMarker + oWs + "("+hybridPart+")?" + nonCapitalEpiWord;
246 protected static String oldInfraSpecies = capitalEpiWord + oWs + nonCapitalEpiWord + oWs + oldInfraSpeciesMarker + oWs + nonCapitalEpiWord;
247 protected static String autonym = capitalEpiWord + oWs + "(" + nonCapitalEpiWord +")" + oWs + fullBotanicAuthorString + oWs + infraSpeciesMarker + oWs + "\\1"; //2-nd word and last word are the same
248 //autonym pattern used within anyBotanicalFullName pattern
249 protected static String autonym2 = capitalEpiWord + oWs + "(" + nonCapitalEpiWord +")" + oWs + fullBotanicAuthorString + oWs + infraSpeciesMarker + oWs + "\\2"; //2-nd word and last word are the same
250
251 protected static String anyBotanicName = "(" + genusOrSupraGenus + "|" + infraGenus + "|" + aggrOrGroup + "|" + species + "|" +
252 infraSpecies + "|" + infraSpecies + "|" + oldInfraSpecies + "|" + autonym + ")+";
253 protected static String anyZooName = "(" + genusOrSupraGenus + "|" + infraGenus + "|" + aggrOrGroup + "|" + species + "|" +
254 infraSpecies + "|" + infraSpecies + "|" + oldInfraSpecies + ")+";
255 protected static String anyBotanicFullName = "(" + autonym2 + "|" + anyBotanicName + oWs + fullBotanicAuthorString + ")" ;
256 protected static String anyZooFullName = anyZooName + oWs + fullZooAuthorString ;
257 protected static String anyFullName = "(" + anyBotanicFullName + "|" + anyZooFullName + ")";
258
259 //Pattern
260 protected static Pattern oWsPattern = Pattern.compile(oWs);
261 protected static Pattern teamSplitterPattern = Pattern.compile(teamSplitter);
262 protected static Pattern cultivarPattern = Pattern.compile(cultivar);
263 protected static Pattern cultivarMarkerPattern = Pattern.compile(cultivarMarker);
264 protected static Pattern hybridPattern = Pattern.compile(hybridFull);
265
266 protected static Pattern genusOrSupraGenusPattern = Pattern.compile(pStart + genusOrSupraGenus + facultFullAuthorString2 + end);
267 protected static Pattern infraGenusPattern = Pattern.compile(pStart + infraGenus + facultFullAuthorString2 + end);
268 protected static Pattern aggrOrGroupPattern = Pattern.compile(pStart + aggrOrGroup + fWs + end); //aggr. or group has no author string
269 protected static Pattern speciesPattern = Pattern.compile(pStart + species + facultFullAuthorString2 + end);
270 protected static Pattern infraSpeciesPattern = Pattern.compile(pStart + infraSpecies + facultFullAuthorString2 + end);
271 protected static Pattern oldInfraSpeciesPattern = Pattern.compile(pStart + oldInfraSpecies + facultFullAuthorString2 + end);
272 protected static Pattern autonymPattern = Pattern.compile(pStart + autonym + fWs + end);
273
274 protected static Pattern botanicBasionymPattern = Pattern.compile(botanicBasionymAuthor);
275 protected static Pattern zooBasionymPattern = Pattern.compile(zooBasionymAuthor);
276 protected static Pattern basionymPattern = Pattern.compile(basionymAuthor);
277
278 protected static Pattern zooAuthorPattern = Pattern.compile(zooAuthorTeam);
279 protected static Pattern zooAuthorAddidtionPattern = Pattern.compile(zooAuthorAddidtion);
280
281 protected static Pattern exAuthorPattern = Pattern.compile(oWs + exString);
282
283 protected static Pattern fullBotanicAuthorStringPattern = Pattern.compile(fullBotanicAuthorString);
284 protected static Pattern fullZooAuthorStringPattern = Pattern.compile(fullZooAuthorString);
285 protected static Pattern fullAuthorStringPattern = Pattern.compile(fullAuthorString);
286
287 protected static Pattern anyBotanicFullNamePattern = Pattern.compile(anyBotanicFullName);
288 protected static Pattern anyZooFullNamePattern = Pattern.compile(anyZooFullName);
289
290
291 }