Project

General

Profile

« Previous | Next » 

Revision 4f5219a9

Added by Andreas Müller almost 8 years ago

#5909 Improve referenced name parser

View differences:

cdmlib-model/src/main/java/eu/etaxonomy/cdm/strategy/parser/NonViralNameParserImplRegExBase.java
45 45
    protected static String nonCapitalWord = "\\p{javaLowerCase}+";
46 46
    protected static String word = "(" + capitalWord + "|" + nonCapitalWord + ")"; //word (capital or non-capital) with no '.' at the end
47 47
    protected static String uppercaseWord = "\\p{javaUpperCase}{2,}";
48
    protected static String apostrophWord = word + "('\\p{javaLowerCase}*)?";
48 49

  
49 50
    protected static String capitalDotWord = capitalWord + "\\.?"; //capitalWord with optional '.' at the end
50 51
    protected static String capital2charDotWord = "(" + capital2LetterWord + "\\.?|\\p{javaUpperCase}\\.)"; //capitalWord with optional '.' but minimum 2 characters (a single capital letter like 'L' without a '.' is not allowed)
52
    protected static String twoCapitalDotWord = "\\p{javaUpperCase}{2}\\.";   //e.g. NY.
53

  
51 54
    protected static String nonCapitalDotWord = nonCapitalWord + "\\.?"; //nonCapitalWord with optional '.' at the end
52 55
    protected static String dotWord = "(" + capitalWord + "|" + nonCapitalWord + ")\\.?"; //word (capital or non-capital) with optional '.' at the end
53 56
    protected static String obligateDotWord = "(" + capitalWord + "|" + nonCapitalWord + ")\\.+"; //word (capital or non-capital) with obligatory '.' at the end
......
127 130
    protected static String pBracketNr = "\\[" + nr4 + "\\]";
128 131
    protected static String pFolBracket = "\\[fol\\." + fWs + "\\d{1,2}(-\\d{1,2})?\\]";
129 132

  
130
    protected static String pStrTab = "tab\\." + fWs + nr4 + "(" + fWs + "(B|\u00DF|\\(\\d{1,3}\\)))?";
131
    protected static String pFig = "fig\\." + fWs + nr4 + "[a-z]?";
132
    protected static String pFigs = pFig + "(-" + nr4 + ")?";
133
    protected static String pStrTab = "[tT]((ab)?\\.|ab\\s)" + fWs + nr4 + "(" + fWs + "(B|\u00DF|\\(\\d{1,3}\\)))?";
134
    protected static String pFig = "[fF]((ig)?\\.|ig\\s)" + fWs + nr4 + "([a-zA-Z]([-\u2013,]\\s*[a-zA-Z])?)?";
135
    protected static String pFigs = pFig + "([-\u2013]" + nr4 + ")?";
136
    protected static String pPlate = "[pP]((l)?\\.|l\\s)" + fWs + nr4 + "([a-zA-Z]([-\u2013,]\\s*[a-zA-Z])?)?";
137

  
138

  
133 139
    //static String pTabFig = pStrTab + "(," + fWs + pFigs + ")?";
134
    protected static String pTabFig = "(" + pStrTab + "|" + pFigs + ")";
140
    protected static String pTabFigPl = "(" + pStrTab + "|" + pFigs + "|" +  pPlate + ")";
135 141

  
136 142
    //e.g.: p455; p.455; pp455-456; pp.455-456; pp.455,456; 455, 456; pages 456-457; pages 456,567
137
    protected static String pSinglePages = "(p\\.?)?" + fWs + pPage + "(," + pTabFig +")?";
138
    protected static String pMultiPages = "(pp\\.?|pages)?" + fWs + pPage + fWs + "(-|,)" +fWs + pPage ;
143
    protected static String pSinglePages = "(p\\.?)?" + fWs + pPage + "(," + pTabFigPl +")?";
144
    protected static String pMultiPages = "(pp\\.?|pages)?" + fWs + pPage + fWs + "[-\u2013,]" +fWs + pPage ;
139 145
    //static String pPages = pPage + "(," + fWs + "(" + pPage + "|" + pTabFig + ")" + ")?";
140 146
    protected static String pPages = "(" + pSinglePages +"|" + pMultiPages +")";
147
    protected static String pPagesTabFig = pPages +"[,\\.]" + fWs + pTabFigPl;
148

  
141 149

  
142 150

  
143 151
    protected static String pCouv = "couv\\." + fWs + "\\d{1,3}";
......
164 172
    protected static String pRomNr = "ljfweffaflas"; //TODO Roman numerals have to be tested first
165 173

  
166 174
    protected static String pDetailAlternatives = "(" + pPages + "|" + pPageSpecial + "|" + pStrNo + "|" + pBracketNr +
167
    			"|" + pTabFig + "|" + pTabSpecial + "|" + pFolBracket + "|" + pCouv + "|" + pRomNr + "|" +
168
    			pSpecialGardDict + "|" + pSpecialDetail + ")";
175
    			"|" + pTabFigPl + "|" + pTabSpecial + "|" + pFolBracket + "|" + pCouv + "|" + pRomNr + "|" +
176
    			pSpecialGardDict + "|" + pSpecialDetail + "|" + pPagesTabFig + ")";
169 177

  
170 178
    protected static String detail = pDetailAlternatives;
171 179

  
172 180
    //reference
173
    protected static String volume = nr4 + "[a-z]?" + "(\\("+ nr4  + "(-"+nr4+")?\\))?";
181
    protected static String volume = nr4 + "[a-z]?" + fWs + "(\\("+ nr4  + "([-\u2013]" + nr4 + ")?\\))?" + "(\\((Suppl|Beibl)\\.\\))?";
174 182
    //this line caused problem https://dev.e-taxonomy.eu/trac/ticket/1556 in its original form: "([\u005E:\\.]" + fWs + ")";
175 183
    protected static String anySepChar = "([\u005E:a-zA-Z]" + fWs + ")"; //all characters except for the detail separator, a stricter version would be [,\\-\\&] and some other characters
176 184
//  protected static String anySepChar = "([,\\-\\&\\.\\+\\']" + fWs + ")";
......
178 186
    protected static int authorSeparatorMaxPosition = 4;  //Author may have a maximum of 4 words
179 187
    protected static String pTitleWordSeparator = "(\\."+ fWs+"|" + oWs + ")";
180 188
    protected static String pSeriesPart = ",?" + fWs + "[sS]er(\\.)?" + oWs + "\\d{1,2},?";
181
    protected static String referenceTitleFirstPart = "(" + word + pTitleWordSeparator + ")";
189

  
190
    protected static String referenceTitleFirstPart = "(" + apostrophWord + pTitleWordSeparator + "|" + twoCapitalDotWord + fWs + ")";
182 191
    protected static String referenceTitle = referenceTitleFirstPart + "*" + "("+ dotWord + "|" + uppercaseWord + "|" + pSeriesPart + ")";  //reference title may have words separated by whitespace or dot. The last word must not have whitespace at the end. There must be at least one word
183 192
    protected static String referenceTitleWithSepCharacters = "(((" + referenceTitle +"|\\(.+\\))"  + anySepChar + ")*" + referenceTitle + ")"; //,?
184 193
    //TODO test performance ??

Also available in: Unified diff