/**
* Copyright (C) 2009 EDIT
* European Distributed Institute of Taxonomy
* http://www.e-taxonomy.eu
*
* The contents of this file are subject to the Mozilla Public License Version 1.1
* See LICENSE.TXT at the top of this package for the full license terms.
*/
10 package eu
.etaxonomy
.cdm
.strategy
.parser
;
12 import java
.util
.regex
.Pattern
;
14 import org
.apache
.log4j
.Logger
;
/**
 * This class is a base class that separates the regex parts of the parser from its methods.
 */
22 public abstract class NonViralNameParserImplRegExBase
{
23 @SuppressWarnings("unused")
24 private static final Logger logger
= Logger
.getLogger(NonViralNameParserImplRegExBase
.class);
26 // good intro: http://java.sun.com/docs/books/tutorial/essential/regex/index.html
29 protected static String epiSplitter
= "(\\s+|\\(|\\))"; //( ' '+| '(' | ')' )
30 protected static Pattern pattern
= Pattern
.compile(epiSplitter
);
32 public static final String hybridSign
= "\u00D7";
34 //some useful non-terminals
35 protected static String pStart
= "^";
36 protected static String end
= "$";
37 protected static String anyEnd
= ".*" + end
;
38 protected static String oWs
= "\\s+"; //obligatory whitespaces
39 protected static String fWs
= "\\s*"; //facultative whitespcace
41 protected static String capitalWord
= "\\p{javaUpperCase}\\p{javaLowerCase}*";
42 protected static String nonCapitalWord
= "\\p{javaLowerCase}+";
43 protected static String word
= "(" + capitalWord
+ "|" + nonCapitalWord
+ ")"; //word (capital or non-capital) with no '.' at the end
44 protected static String uppercaseWord
= "\\p{javaUpperCase}+";
46 protected static String capitalDotWord
= capitalWord
+ "\\.?"; //capitalWord with facultativ '.' at the end
47 protected static String nonCapitalDotWord
= nonCapitalWord
+ "\\.?"; //nonCapitalWord with facultativ '.' at the end
48 protected static String dotWord
= "(" + capitalWord
+ "|" + nonCapitalWord
+ ")\\.?"; //word (capital or non-capital) with facultativ '.' at the end
49 protected static String obligateDotWord
= "(" + capitalWord
+ "|" + nonCapitalWord
+ ")\\.+"; //word (capital or non-capital) with obligate '.' at the end
51 //Words used in an epethiton for a TaxonName
52 protected static String nonCapitalEpiWord
= "[a-z\u00EF\\-]+";
53 protected static String capitalEpiWord
= "[A-Z]"+ nonCapitalEpiWord
;
57 protected static String month
= "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)";
58 protected static String singleYear
= "\\b" + "(?:17|18|19|20)" + "\\d{2}" + "\\b"; // word boundary followed by either 17,18,19, or 20 (not captured) followed by 2 digits
59 protected static String yearPhrase
= singleYear
+ "("+ fWs
+ "-" + fWs
+ singleYear
+ ")?" ;
60 //+ "(" + month + ")?)" ; // optional month
63 protected static String yearSeperator
= "\\." + oWs
;
64 protected static String detailSeparator
= ":" + oWs
;
65 protected static String referenceSeparator1
= "," + oWs
;
66 protected static String inReferenceSeparator
= oWs
+ "in" + oWs
;
67 protected static String referenceSeperator
= "(" + referenceSeparator1
+"|" + inReferenceSeparator
+ ")" ;
68 protected static String referenceAuthorSeparator
= ","+ oWs
;
69 protected static String volumeSeparator
= oWs
;
70 protected static String referenceEnd
= "\\.";
74 protected static String status
= "";
77 protected static String InfraGenusMarker
= "(subgen.|subg.|sect.|subsect.|ser.|subser.|t.infgen.)";
78 protected static String aggrOrGroupMarker
= "(aggr.|agg.|group)";
79 protected static String infraSpeciesMarker
= "(subsp.|convar.|var.|subvar.|f.|subf.|f.spec.|tax." + fWs
+ "infrasp.)";
80 protected static String oldInfraSpeciesMarker
= "(prol.|proles|race|taxon|sublusus)";
84 protected static String authorPart
= "(" + "(d'|D'|L'|'t\\s)?" + capitalDotWord
+ "('" + nonCapitalDotWord
+ ")?" + "|da|de(n|l|\\sla)?)" ;
85 protected static String author
= "(" + authorPart
+ "(" + fWs
+ "|-)" + ")+" + "(f.|fil.|secundus)?";
86 protected static String teamSplitter
= fWs
+ "(&)" + fWs
;
87 protected static String authorTeam
= fWs
+ "(" + author
+ teamSplitter
+ ")*" + author
+ "(" + teamSplitter
+ "al.)?" + fWs
;
88 protected static String exString
= "(ex.?)";
89 protected static String authorAndExTeam
= "(" + authorTeam
+ oWs
+ exString
+ oWs
+ ")?" + authorTeam
;
90 protected static String basStart
= "\\(";
91 protected static String basEnd
= "\\)";
92 protected static String botanicBasionymAuthor
= basStart
+ "(" + authorAndExTeam
+ ")" + basEnd
; // '(' and ')' is for evaluation with RE.paren(x)
93 protected static String fullBotanicAuthorString
= fWs
+ "((" + botanicBasionymAuthor
+")?" + fWs
+ authorAndExTeam
+ "|" + botanicBasionymAuthor
+")"+ fWs
;
94 protected static String facultFullBotanicAuthorString
= "(" + fullBotanicAuthorString
+ ")?" ;
97 //TODO does zoo author have ex-Author?
98 protected static String zooAuthorYearSeperator
= ",";
99 protected static String zooAuthorAddidtion
= fWs
+ zooAuthorYearSeperator
+ fWs
+ singleYear
;
100 protected static String zooAuthorTeam
= authorTeam
+ zooAuthorAddidtion
;
101 protected static String zooBasionymAuthor
= basStart
+ "(" + zooAuthorTeam
+ ")" + basEnd
;
102 protected static String fullZooAuthorString
= fWs
+ "((" + zooBasionymAuthor
+")?" + fWs
+ zooAuthorTeam
+ "|" + zooBasionymAuthor
+")"+ fWs
;
103 protected static String facultFullZooAuthorString
= "(" + fullZooAuthorString
+ ")?" ;
105 protected static String facultFullAuthorString2
= "(" + facultFullBotanicAuthorString
+ "|" + facultFullZooAuthorString
+ ")";
107 protected static String basionymAuthor
= "(" + botanicBasionymAuthor
+ "|" + zooBasionymAuthor
+ ")";
108 protected static String fullAuthorString
= "(" + fullBotanicAuthorString
+ "|" + fullZooAuthorString
+ ")";
111 //TODO still very simple
114 protected static String nr2
= "\\d{1,2}";
115 protected static String nr4
= "\\d{1,4}";
116 protected static String nr5
= "\\d{1,5}";
119 protected static String pPage
= nr5
+ "[a-z]?";
120 protected static String pStrNo
= "n\u00B0" + fWs
+ "(" + nr4
+ ")";
122 protected static String pBracketNr
= "\\[" + nr4
+ "\\]";
123 protected static String pFolBracket
= "\\[fol\\." + fWs
+ "\\d{1,2}(-\\d{1,2})?\\]";
125 protected static String pStrTab
= "tab\\." + fWs
+ nr4
+ "(" + fWs
+ "(B|\u00DF|\\(\\d{1,3}\\)))?";
126 protected static String pFig
= "fig." + fWs
+ nr4
+ "[a-z]?";
127 protected static String pFigs
= pFig
+ "(-" + nr4
+ ")?";
128 //static String pTabFig = pStrTab + "(," + fWs + pFigs + ")?";
129 protected static String pTabFig
= "(" + pStrTab
+ "|" + pFigs
+ ")";
131 //e.g.: p455; p.455; pp455-456; pp.455-456; pp.455,456; 455, 456; pages 456-457; pages 456,567
132 protected static String pSinglePages
= "(p\\.?)?" + fWs
+ pPage
+ "(," + pTabFig
+")?";
133 protected static String pMultiPages
= "(pp\\.?|pages)?" + fWs
+ pPage
+ fWs
+ "(-|,)" +fWs
+ pPage
;
134 //static String pPages = pPage + "(," + fWs + "(" + pPage + "|" + pTabFig + ")" + ")?";
135 protected static String pPages
= "(" + pSinglePages
+"|" + pMultiPages
+")";
138 protected static String pCouv
= "couv\\." + fWs
+ "\\d{1,3}";
140 protected static String pTabSpecial
= "tab\\." + fWs
+ "(ad" + fWs
+ "\\d{1,3}|alphab)";
141 protected static String pPageSpecial
= nr4
+ fWs
+ "(in obs|, Expl\\. Tab)";
142 protected static String pSpecialGardDict
= capitalWord
+ oWs
+ "n\u00B0" + oWs
+ "\\d{1,2}";
144 // protected static String pSpecialDetail = "(in err|in tab|sine pag|add\\. & emend|Emend|""\\d{3}"" \\[\\d{3}\\])";
145 // protected static String pSpecialDetail = "(in err|in tab|sine pag|add\\. & emend|Emend|""\\d{3}"" \\[\\d{3}\\])";
146 protected static String pSpecialDetail
= "(in err|in tab|sine pag|add\\.)";
149 // Const romI = "[Ii]{0,3}"
150 // Const romX = "[Xx]{0,3}"
151 // Const romC = "[Cc]{0,3}"
152 // Const romM = "[Mm]{0,3}"
154 // ' !! includes empty string: ""
155 // romOne = "([Vv]?" & romI & or_ & "(IV|iv)" & or_ & "(IX|ix)" & ")"
156 // romTen = "([Ll]?" & romX & or_ & "(XL|xl)" & or_ & "(XC|xc)" & ")"
157 // romHun = "([Dd]?" & romC & or_ & "(CD|cd)" & or_ & "(CM|cm)" & ")"
158 // romNr = "(?=[MDCLXVImdclxvi])(((" & romM & ")?" & romHun & ")?" & romTen & ")?" & romOne
159 protected static String pRomNr
= "ljfweffaflas"; //TODO rom number have to be tested first
161 protected static String pDetailAlternatives
= "(" + pPages
+ "|" + pPageSpecial
+ "|" + pStrNo
+ "|" + pBracketNr
+
162 "|" + pTabFig
+ "|" + pTabSpecial
+ "|" + pFolBracket
+ "|" + pCouv
+ "|" + pRomNr
+ "|" +
163 pSpecialGardDict
+ "|" + pSpecialDetail
+ ")";
165 protected static String detail
= pDetailAlternatives
;
168 protected static String volume
= nr4
+ "(\\("+ nr4
+ "\\))?";
169 //this line caused problem https://dev.e-taxonomy.eu/trac/ticket/1556 in its original form: "([\u005E:\\.]" + fWs + ")";
170 protected static String anySepChar
= "([\u005E:a-zA-Z]" + fWs
+ ")"; //all characters except for the detail separator, a stricter version would be [,\\-\\&] and some other characters
171 // protected static String anySepChar = "([,\\-\\&\\.\\+\\']" + fWs + ")";
173 protected static int authorSeparatorMaxPosition
= 4; //Author may have a maximum of 4 words
174 protected static String pTitleWordSeparator
= "(\\."+ fWs
+"|" + oWs
+ ")";
175 protected static String pSeriesPart
= ",?" + fWs
+ "[sS]er(\\.)?" + oWs
+ "\\d{1,2},?";
176 protected static String referenceTitleFirstPart
= "(" + word
+ pTitleWordSeparator
+ ")";
177 protected static String referenceTitle
= referenceTitleFirstPart
+ "*" + "("+ dotWord
+ "|" + uppercaseWord
+ "|" + pSeriesPart
+ ")"; //reference title may have words seperated by whitespace or dot. The last word may not have a whitespace at the end. There must be at least one word
178 protected static String referenceTitleWithSepCharacters
= "(((" + referenceTitle
+"|\\(.+\\))" + anySepChar
+ ")*" + referenceTitle
+ ")"; //,?
179 //TODO test performance ??
180 protected static String referenceTitleWithSepCharactersAndBrackets
= referenceTitleWithSepCharacters
+ fWs
+ "(\\(" + referenceTitleWithSepCharacters
+ "\\)"+fWs
+ ")?(" + referenceTitleWithSepCharacters
+")?" ;
182 protected static String referenceTitleWithoutAuthor
= "(" + referenceTitleFirstPart
+ ")" + "{"+ (authorSeparatorMaxPosition
-1) +",}" + dotWord
+
183 anySepChar
+ referenceTitleWithSepCharactersAndBrackets
; //separators exist and first separator appears at position authorSeparatorMaxPosition or later
185 protected static String editionSeparator
= "(" + oWs
+ "|," + fWs
+ ")ed\\.?" + oWs
; //
186 protected static String pEdition
= nr2
;
188 protected static String pVolPart
= volumeSeparator
+ volume
;
189 protected static String pEditionPart
= editionSeparator
+ pEdition
;
190 protected static String pEditionVolPart
= editionSeparator
+ pEdition
+ fWs
+ "," + volumeSeparator
+ volume
;
191 protected static String pEditionVolAlternative
= "(" + pEditionPart
+ "|" + pVolPart
+ "|" + pEditionVolPart
+ ")?";
193 protected static String pVolRefTitle
= referenceTitle
+ "(" + pVolPart
+ ")?";
194 protected static String softEditionVolRefTitle
= referenceTitleWithSepCharactersAndBrackets
+ pEditionVolAlternative
;
195 protected static String softVolNoAuthorRefTitle
= referenceTitleWithoutAuthor
+ "(" + volumeSeparator
+ volume
+ ")?";
197 protected static String pBookReference
= softEditionVolRefTitle
;
198 protected static String pBookSectionReference
= authorTeam
+ referenceAuthorSeparator
+ softEditionVolRefTitle
;
199 protected static String pArticleReference
= pVolRefTitle
;
200 protected static String pSoftArticleReference
= softVolNoAuthorRefTitle
;
203 protected static String pReferenceSineDetail
= "(" + pArticleReference
+ "|" + pBookSectionReference
+ "|" + pBookReference
+ ")";
206 protected static String pReference
= pReferenceSineDetail
+ detailSeparator
+ detail
+
207 yearSeperator
+ yearPhrase
+ "(" + referenceEnd
+ ")?";
209 //static String strictBook = referenc
213 protected static Pattern referencePattern
= Pattern
.compile(pReference
);
214 protected static Pattern referenceSineDetailPattern
= Pattern
.compile(pReferenceSineDetail
);
216 protected static String pNomStatusNom
= "nom\\." + fWs
+ "(superfl\\.|nud\\.|illeg\\.|inval\\.|cons\\.|alternativ\\.|subnud.|"+
217 "rej\\.|rej\\."+ fWs
+ "prop\\.|provis\\.)";
218 protected static String pNomStatusOrthVar
= "orth\\." + fWs
+ "var\\.";
219 protected static String pNomStatus
= "(" + pNomStatusNom
+ "|" + pNomStatusOrthVar
+ ")";
220 protected static String pNomStatusPhrase1
= "," + fWs
+ pNomStatus
;
221 protected static String pNomStatusPhrase2
= "\\[" + fWs
+ pNomStatus
+ "\\]";
223 protected static String pNomStatusPhrase
= "(?:" + pNomStatusPhrase1
+ "|" + pNomStatusPhrase2
+ ")";
228 //provisional synonym
233 //cultivars and hybrids
234 protected static String cultivar
= oWs
+ "'..+'"; //Achtung mit Hochkomma in AuthorNamen
235 protected static String cultivarMarker
= oWs
+ "(cv.|')";
236 protected static String hybridPart
= "((x|X)" + oWs
+ "|"+hybridSign
+"|notho)";
237 protected static String hybridFull
= "(" +oWs
+"|"+ pStart
+")" + hybridPart
;
241 protected static String genusOrSupraGenus
= "("+hybridFull
+")?" + capitalEpiWord
;
242 protected static String infraGenus
= capitalEpiWord
+ oWs
+ InfraGenusMarker
+ oWs
+ capitalEpiWord
;
243 protected static String aggrOrGroup
= capitalEpiWord
+ oWs
+ nonCapitalEpiWord
+ oWs
+ aggrOrGroupMarker
;
244 protected static String species
= genusOrSupraGenus
+ oWs
+ "("+hybridPart
+")?" + nonCapitalEpiWord
;
245 protected static String infraSpecies
= species
+ oWs
+ infraSpeciesMarker
+ oWs
+ "("+hybridPart
+")?" + nonCapitalEpiWord
;
246 protected static String oldInfraSpecies
= capitalEpiWord
+ oWs
+ nonCapitalEpiWord
+ oWs
+ oldInfraSpeciesMarker
+ oWs
+ nonCapitalEpiWord
;
247 protected static String autonym
= capitalEpiWord
+ oWs
+ "(" + nonCapitalEpiWord
+")" + oWs
+ fullBotanicAuthorString
+ oWs
+ infraSpeciesMarker
+ oWs
+ "\\1"; //2-nd word and last word are the same
248 //autonym pattern used within anyBotanicalFullName pattern
249 protected static String autonym2
= capitalEpiWord
+ oWs
+ "(" + nonCapitalEpiWord
+")" + oWs
+ fullBotanicAuthorString
+ oWs
+ infraSpeciesMarker
+ oWs
+ "\\2"; //2-nd word and last word are the same
251 protected static String anyBotanicName
= "(" + genusOrSupraGenus
+ "|" + infraGenus
+ "|" + aggrOrGroup
+ "|" + species
+ "|" +
252 infraSpecies
+ "|" + infraSpecies
+ "|" + oldInfraSpecies
+ "|" + autonym
+ ")+";
253 protected static String anyZooName
= "(" + genusOrSupraGenus
+ "|" + infraGenus
+ "|" + aggrOrGroup
+ "|" + species
+ "|" +
254 infraSpecies
+ "|" + infraSpecies
+ "|" + oldInfraSpecies
+ ")+";
255 protected static String anyBotanicFullName
= "(" + autonym2
+ "|" + anyBotanicName
+ oWs
+ fullBotanicAuthorString
+ ")" ;
256 protected static String anyZooFullName
= anyZooName
+ oWs
+ fullZooAuthorString
;
257 protected static String anyFullName
= "(" + anyBotanicFullName
+ "|" + anyZooFullName
+ ")";
260 protected static Pattern oWsPattern
= Pattern
.compile(oWs
);
261 protected static Pattern teamSplitterPattern
= Pattern
.compile(teamSplitter
);
262 protected static Pattern cultivarPattern
= Pattern
.compile(cultivar
);
263 protected static Pattern cultivarMarkerPattern
= Pattern
.compile(cultivarMarker
);
264 protected static Pattern hybridPattern
= Pattern
.compile(hybridFull
);
266 protected static Pattern genusOrSupraGenusPattern
= Pattern
.compile(pStart
+ genusOrSupraGenus
+ facultFullAuthorString2
+ end
);
267 protected static Pattern infraGenusPattern
= Pattern
.compile(pStart
+ infraGenus
+ facultFullAuthorString2
+ end
);
268 protected static Pattern aggrOrGroupPattern
= Pattern
.compile(pStart
+ aggrOrGroup
+ fWs
+ end
); //aggr. or group has no author string
269 protected static Pattern speciesPattern
= Pattern
.compile(pStart
+ species
+ facultFullAuthorString2
+ end
);
270 protected static Pattern infraSpeciesPattern
= Pattern
.compile(pStart
+ infraSpecies
+ facultFullAuthorString2
+ end
);
271 protected static Pattern oldInfraSpeciesPattern
= Pattern
.compile(pStart
+ oldInfraSpecies
+ facultFullAuthorString2
+ end
);
272 protected static Pattern autonymPattern
= Pattern
.compile(pStart
+ autonym
+ fWs
+ end
);
274 protected static Pattern botanicBasionymPattern
= Pattern
.compile(botanicBasionymAuthor
);
275 protected static Pattern zooBasionymPattern
= Pattern
.compile(zooBasionymAuthor
);
276 protected static Pattern basionymPattern
= Pattern
.compile(basionymAuthor
);
278 protected static Pattern zooAuthorPattern
= Pattern
.compile(zooAuthorTeam
);
279 protected static Pattern zooAuthorAddidtionPattern
= Pattern
.compile(zooAuthorAddidtion
);
281 protected static Pattern exAuthorPattern
= Pattern
.compile(oWs
+ exString
);
283 protected static Pattern fullBotanicAuthorStringPattern
= Pattern
.compile(fullBotanicAuthorString
);
284 protected static Pattern fullZooAuthorStringPattern
= Pattern
.compile(fullZooAuthorString
);
285 protected static Pattern fullAuthorStringPattern
= Pattern
.compile(fullAuthorString
);
287 protected static Pattern anyBotanicFullNamePattern
= Pattern
.compile(anyBotanicFullName
);
288 protected static Pattern anyZooFullNamePattern
= Pattern
.compile(anyZooFullName
);