Revision 4f5219a9
Added by Andreas Müller almost 8 years ago
cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/parser/NonViralNameParserImplRegExBase.java | ||
---|---|---|
45 | 45 |
//one or more lowercase letters
protected static String nonCapitalWord = "\\p{javaLowerCase}+"; |
46 | 46 |
protected static String word = "(" + capitalWord + "|" + nonCapitalWord + ")"; //word (capital or non-capital) with no '.' at the end |
47 | 47 |
//two or more uppercase letters
protected static String uppercaseWord = "\\p{javaUpperCase}{2,}"; |
48 |
//word optionally followed by an apostrophe and zero or more lowercase letters
protected static String apostrophWord = word + "('\\p{javaLowerCase}*)?"; |
|
48 | 49 |
|
49 | 50 |
protected static String capitalDotWord = capitalWord + "\\.?"; //capitalWord with optional '.' at the end |
50 | 51 |
protected static String capital2charDotWord = "(" + capital2LetterWord + "\\.?|\\p{javaUpperCase}\\.)"; //capitalWord with optional '.' but minimum 2 characters (a single capital letter like 'L' is not allowed unless it is followed by '.') |
52 |
protected static String twoCapitalDotWord = "\\p{javaUpperCase}{2}\\."; //exactly two uppercase letters followed by a mandatory '.', e.g. NY. |
|
53 |
|
|
51 | 54 |
protected static String nonCapitalDotWord = nonCapitalWord + "\\.?"; //nonCapitalWord with optional '.' at the end |
52 | 55 |
protected static String dotWord = "(" + capitalWord + "|" + nonCapitalWord + ")\\.?"; //word (capital or non-capital) with optional '.' at the end |
53 | 56 |
protected static String obligateDotWord = "(" + capitalWord + "|" + nonCapitalWord + ")\\.+"; //word (capital or non-capital) with mandatory '.' at the end (one or more dots are accepted) |
... | ... | |
127 | 130 |
protected static String pBracketNr = "\\[" + nr4 + "\\]"; |
128 | 131 |
protected static String pFolBracket = "\\[fol\\." + fWs + "\\d{1,2}(-\\d{1,2})?\\]"; |
129 | 132 |
|
130 |
protected static String pStrTab = "tab\\." + fWs + nr4 + "(" + fWs + "(B|\u00DF|\\(\\d{1,3}\\)))?"; |
|
131 |
protected static String pFig = "fig\\." + fWs + nr4 + "[a-z]?"; |
|
132 |
protected static String pFigs = pFig + "(-" + nr4 + ")?"; |
|
133 |
//'tab' detail: "t.", "tab." or "tab" followed by a whitespace character (first letter in either case), then fWs and nr4,
//optionally followed by fWs and one of: 'B', the sharp s (U+00DF), or a 1-3 digit number in parentheses.
//NOTE(review): fWs and nr4 are defined outside this view - presumably optional whitespace and a number of up to 4 digits; confirm
protected static String pStrTab = "[tT]((ab)?\\.|ab\\s)" + fWs + nr4 + "(" + fWs + "(B|\u00DF|\\(\\d{1,3}\\)))?"; |
|
134 |
//'fig' detail: "f.", "fig." or "fig" followed by a whitespace character (first letter in either case), then fWs and nr4,
//optionally followed by one ASCII letter, which may itself be followed by a hyphen, en dash (U+2013) or comma,
//optional whitespace and a second ASCII letter
protected static String pFig = "[fF]((ig)?\\.|ig\\s)" + fWs + nr4 + "([a-zA-Z]([-\u2013,]\\s*[a-zA-Z])?)?"; |
|
135 |
//pFig optionally extended to a range: a hyphen or en dash (U+2013) directly followed by a second nr4
protected static String pFigs = pFig + "([-\u2013]" + nr4 + ")?"; |
|
136 |
//'pl' detail: "p.", "pl." or "pl" followed by a whitespace character (first letter in either case), then fWs and nr4,
//optionally followed by one ASCII letter, which may itself be followed by a hyphen, en dash (U+2013) or comma,
//optional whitespace and a second ASCII letter.
//NOTE(review): the prefix "p." is also accepted as the page prefix in pSinglePages - confirm that this overlap is intended
protected static String pPlate = "[pP]((l)?\\.|l\\s)" + fWs + nr4 + "([a-zA-Z]([-\u2013,]\\s*[a-zA-Z])?)?"; |
|
137 |
|
|
138 |
|
|
133 | 139 |
//static String pTabFig = pStrTab + "(," + fWs + pFigs + ")?"; |
134 |
protected static String pTabFig = "(" + pStrTab + "|" + pFigs + ")";
|
|
140 |
//any one of the three detail kinds: pStrTab, pFigs or pPlate
protected static String pTabFigPl = "(" + pStrTab + "|" + pFigs + "|" + pPlate + ")";
|
|
135 | 141 |
|
136 | 142 |
//e.g.: p455; p.455; pp455-456; pp.455-456; pp.455,456; 455, 456; pages 456-457; pages 456,567 |
137 |
protected static String pSinglePages = "(p\\.?)?" + fWs + pPage + "(," + pTabFig +")?"; |
|
138 |
protected static String pMultiPages = "(pp\\.?|pages)?" + fWs + pPage + fWs + "(-|,)" +fWs + pPage ;
|
|
143 |
//single page: optional prefix "p" or "p.", then fWs and pPage, optionally followed directly by ',' and a pTabFigPl detail
protected static String pSinglePages = "(p\\.?)?" + fWs + pPage + "(," + pTabFigPl +")?";
|
|
144 |
//two pages: optional prefix "pp", "pp." or "pages", then two pPage separated by a hyphen, en dash (U+2013) or comma; fWs is allowed around the separator
protected static String pMultiPages = "(pp\\.?|pages)?" + fWs + pPage + fWs + "[-\u2013,]" +fWs + pPage ;
|
|
139 | 145 |
//static String pPages = pPage + "(," + fWs + "(" + pPage + "|" + pTabFig + ")" + ")?"; |
140 | 146 |
protected static String pPages = "(" + pSinglePages +"|" + pMultiPages +")"; |
147 |
//pPages followed by ',' or '.', then fWs and a pTabFigPl detail
protected static String pPagesTabFig = pPages +"[,\\.]" + fWs + pTabFigPl; |
|
148 |
|
|
141 | 149 |
|
142 | 150 |
|
143 | 151 |
protected static String pCouv = "couv\\." + fWs + "\\d{1,3}"; |
... | ... | |
164 | 172 |
protected static String pRomNr = "ljfweffaflas"; //TODO Roman numbers have to be tested first; NOTE(review): the current value looks like a dummy string that will not match real input - confirm |
165 | 173 |
|
166 | 174 |
protected static String pDetailAlternatives = "(" + pPages + "|" + pPageSpecial + "|" + pStrNo + "|" + pBracketNr + |
167 |
"|" + pTabFig + "|" + pTabSpecial + "|" + pFolBracket + "|" + pCouv + "|" + pRomNr + "|" + |
|
168 |
pSpecialGardDict + "|" + pSpecialDetail + ")"; |
|
175 |
"|" + pTabFigPl + "|" + pTabSpecial + "|" + pFolBracket + "|" + pCouv + "|" + pRomNr + "|" +
|
|
176 |
pSpecialGardDict + "|" + pSpecialDetail + "|" + pPagesTabFig + ")";
|
|
169 | 177 |
|
170 | 178 |
protected static String detail = pDetailAlternatives; |
171 | 179 |
|
172 | 180 |
//reference |
173 |
protected static String volume = nr4 + "[a-z]?" + "(\\("+ nr4 + "(-"+nr4+")?\\))?";
|
|
181 |
//volume: nr4 with an optional lowercase ASCII letter, then fWs, optionally followed by a parenthesised nr4 or
//nr4 range (hyphen or en dash, U+2013), optionally followed by the literal "(Suppl.)" or "(Beibl.)"
protected static String volume = nr4 + "[a-z]?" + fWs + "(\\("+ nr4 + "([-\u2013]" + nr4 + ")?\\))?" + "(\\((Suppl|Beibl)\\.\\))?";
|
|
174 | 182 |
//this line caused problem https://dev.e-taxonomy.eu/trac/ticket/1556 in its original form: "([\u005E:\\.]" + fWs + ")"; |
175 | 183 |
protected static String anySepChar = "([\u005E:a-zA-Z]" + fWs + ")"; //the unicode escape U+005E is '^', so this is a negated character class: any single character except ':' (presumably the detail separator) and ASCII letters, followed by fWs; a stricter version would be [,\\-\\&] and some other characters |
176 | 184 |
// protected static String anySepChar = "([,\\-\\&\\.\\+\\']" + fWs + ")"; |
... | ... | |
178 | 186 |
protected static int authorSeparatorMaxPosition = 4; //Author may have a maximum of 4 words |
179 | 187 |
protected static String pTitleWordSeparator = "(\\."+ fWs+"|" + oWs + ")"; |
180 | 188 |
protected static String pSeriesPart = ",?" + fWs + "[sS]er(\\.)?" + oWs + "\\d{1,2},?"; |
181 |
protected static String referenceTitleFirstPart = "(" + word + pTitleWordSeparator + ")"; |
|
189 |
|
|
190 |
//one leading part of a reference title: either an apostrophWord followed by pTitleWordSeparator, or a twoCapitalDotWord followed by fWs
protected static String referenceTitleFirstPart = "(" + apostrophWord + pTitleWordSeparator + "|" + twoCapitalDotWord + fWs + ")"; |
|
182 | 191 |
protected static String referenceTitle = referenceTitleFirstPart + "*" + "("+ dotWord + "|" + uppercaseWord + "|" + pSeriesPart + ")"; //reference title may have words separated by whitespace or dot. The last word may not have a whitespace at the end. There must be at least one word |
183 | 192 |
protected static String referenceTitleWithSepCharacters = "(((" + referenceTitle +"|\\(.+\\))" + anySepChar + ")*" + referenceTitle + ")"; //,? |
184 | 193 |
//TODO test performance ?? |
Also available in: Unified diff
#5909 Improve referenced name parser