|
Removed
Link Here
|
| 1 |
/** |
| 2 |
* <copyright> |
| 3 |
* |
| 4 |
* Copyright (c) 2004-2005 IBM Corporation and others. |
| 5 |
* All rights reserved. This program and the accompanying materials |
| 6 |
* are made available under the terms of the Eclipse Public License v1.0 |
| 7 |
* which accompanies this distribution, and is available at |
| 8 |
* http://www.eclipse.org/legal/epl-v10.html |
| 9 |
* |
| 10 |
* Contributors: |
| 11 |
* IBM - Initial API and implementation |
| 12 |
* |
| 13 |
* </copyright> |
| 14 |
* |
| 15 |
* $Id: RegEx.java,v 1.8.4.1 2007/08/14 18:18:44 emerks Exp $ |
| 16 |
* |
| 17 |
* --------------------------------------------------------------------- |
| 18 |
* |
| 19 |
* The Apache Software License, Version 1.1 |
| 20 |
* |
| 21 |
* |
| 22 |
* Copyright (c) 1999-2004 The Apache Software Foundation. All rights |
| 23 |
* reserved. |
| 24 |
* |
| 25 |
* Redistribution and use in source and binary forms, with or without |
| 26 |
* modification, are permitted provided that the following conditions |
| 27 |
* are met: |
| 28 |
* |
| 29 |
* 1. Redistributions of source code must retain the above copyright |
| 30 |
* notice, this list of conditions and the following disclaimer. |
| 31 |
* |
| 32 |
* 2. Redistributions in binary form must reproduce the above copyright |
| 33 |
* notice, this list of conditions and the following disclaimer in |
| 34 |
* the documentation and/or other materials provided with the |
| 35 |
* distribution. |
| 36 |
* |
| 37 |
* 3. The end-user documentation included with the redistribution, |
| 38 |
* if any, must include the following acknowledgment: |
| 39 |
* "This product includes software developed by the |
| 40 |
* Apache Software Foundation (http://www.apache.org/)." |
| 41 |
* Alternately, this acknowledgment may appear in the software itself, |
| 42 |
* if and wherever such third-party acknowledgments normally appear. |
| 43 |
* |
| 44 |
* 4. The names "Xerces" and "Apache Software Foundation" must |
| 45 |
* not be used to endorse or promote products derived from this |
| 46 |
* software without prior written permission. For written |
| 47 |
* permission, please contact apache@apache.org. |
| 48 |
* |
| 49 |
* 5. Products derived from this software may not be called "Apache", |
| 50 |
* nor may "Apache" appear in their name, without prior written |
| 51 |
* permission of the Apache Software Foundation. |
| 52 |
* |
| 53 |
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
| 54 |
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| 55 |
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| 56 |
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
| 57 |
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 58 |
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 59 |
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
| 60 |
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| 61 |
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 62 |
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| 63 |
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 64 |
* SUCH DAMAGE. |
| 65 |
* ==================================================================== |
| 66 |
* |
| 67 |
* This software consists of voluntary contributions made by many |
| 68 |
* individuals on behalf of the Apache Software Foundation and was |
| 69 |
* originally based on software copyright (c) 1999-2003, International |
| 70 |
* Business Machines, Inc., http://www.apache.org. For more |
| 71 |
* information on the Apache Software Foundation, please see |
| 72 |
* <http://www.apache.org/>. |
| 73 |
*/ |
| 74 |
|
| 75 |
package org.eclipse.emf.ecore.xml.type.internal; |
| 76 |
|
| 77 |
|
| 78 |
import java.text.CharacterIterator; |
| 79 |
import java.util.Hashtable; |
| 80 |
import java.util.Locale; |
| 81 |
import java.util.ResourceBundle; |
| 82 |
import java.util.Vector; |
| 83 |
|
| 84 |
import org.eclipse.emf.ecore.plugin.EcorePlugin; |
| 85 |
|
| 86 |
/** |
| 87 |
* NOTE: this class is for internal use only. |
| 88 |
*/ |
| 89 |
public final class RegEx |
| 90 |
{ |
| 91 |
static class BMPattern |
| 92 |
{ |
| 93 |
char[] pattern; |
| 94 |
|
| 95 |
int[] shiftTable; |
| 96 |
|
| 97 |
boolean ignoreCase; |
| 98 |
|
| 99 |
public BMPattern(String pat, boolean ignoreCase) |
| 100 |
{ |
| 101 |
this(pat, 256, ignoreCase); |
| 102 |
} |
| 103 |
|
| 104 |
public BMPattern(String pat, int tableSize, boolean ignoreCase) |
| 105 |
{ |
| 106 |
this.pattern = pat.toCharArray(); |
| 107 |
this.shiftTable = new int [tableSize]; |
| 108 |
this.ignoreCase = ignoreCase; |
| 109 |
int length = pattern.length; |
| 110 |
for (int i = 0; i < this.shiftTable.length; i++) |
| 111 |
this.shiftTable[i] = length; |
| 112 |
for (int i = 0; i < length; i++) |
| 113 |
{ |
| 114 |
char ch = this.pattern[i]; |
| 115 |
int diff = length - i - 1; |
| 116 |
int index = ch % this.shiftTable.length; |
| 117 |
if (diff < this.shiftTable[index]) |
| 118 |
this.shiftTable[index] = diff; |
| 119 |
if (this.ignoreCase) |
| 120 |
{ |
| 121 |
ch = Character.toUpperCase(ch); |
| 122 |
index = ch % this.shiftTable.length; |
| 123 |
if (diff < this.shiftTable[index]) |
| 124 |
this.shiftTable[index] = diff; |
| 125 |
ch = Character.toLowerCase(ch); |
| 126 |
index = ch % this.shiftTable.length; |
| 127 |
if (diff < this.shiftTable[index]) |
| 128 |
this.shiftTable[index] = diff; |
| 129 |
} |
| 130 |
} |
| 131 |
} |
| 132 |
|
| 133 |
/** |
| 134 |
* |
| 135 |
* @return -1 if <var>iterator</var> does not contain this pattern. |
| 136 |
*/ |
| 137 |
public int matches(CharacterIterator iterator, int start, int limit) |
| 138 |
{ |
| 139 |
if (this.ignoreCase) |
| 140 |
return this.matchesIgnoreCase(iterator, start, limit); |
| 141 |
int plength = this.pattern.length; |
| 142 |
if (plength == 0) |
| 143 |
return start; |
| 144 |
int index = start + plength; |
| 145 |
while (index <= limit) |
| 146 |
{ |
| 147 |
int pindex = plength; |
| 148 |
int nindex = index + 1; |
| 149 |
char ch; |
| 150 |
do |
| 151 |
{ |
| 152 |
if ((ch = iterator.setIndex(--index)) != this.pattern[--pindex]) |
| 153 |
break; |
| 154 |
if (pindex == 0) |
| 155 |
return index; |
| 156 |
} |
| 157 |
while (pindex > 0); |
| 158 |
index += this.shiftTable[ch % this.shiftTable.length] + 1; |
| 159 |
if (index < nindex) |
| 160 |
index = nindex; |
| 161 |
} |
| 162 |
return -1; |
| 163 |
} |
| 164 |
|
| 165 |
/** |
| 166 |
* |
| 167 |
* @return -1 if <var>str</var> does not contain this pattern. |
| 168 |
*/ |
| 169 |
public int matches(String str, int start, int limit) |
| 170 |
{ |
| 171 |
if (this.ignoreCase) |
| 172 |
return this.matchesIgnoreCase(str, start, limit); |
| 173 |
int plength = this.pattern.length; |
| 174 |
if (plength == 0) |
| 175 |
return start; |
| 176 |
int index = start + plength; |
| 177 |
while (index <= limit) |
| 178 |
{ |
| 179 |
//System.err.println("Starts at "+index); |
| 180 |
int pindex = plength; |
| 181 |
int nindex = index + 1; |
| 182 |
char ch; |
| 183 |
do |
| 184 |
{ |
| 185 |
if ((ch = str.charAt(--index)) != this.pattern[--pindex]) |
| 186 |
break; |
| 187 |
if (pindex == 0) |
| 188 |
return index; |
| 189 |
} |
| 190 |
while (pindex > 0); |
| 191 |
index += this.shiftTable[ch % this.shiftTable.length] + 1; |
| 192 |
if (index < nindex) |
| 193 |
index = nindex; |
| 194 |
} |
| 195 |
return -1; |
| 196 |
} |
| 197 |
|
| 198 |
/** |
| 199 |
* |
| 200 |
* @return -1 if <var>chars</char> does not contain this pattern. |
| 201 |
*/ |
| 202 |
public int matches(char[] chars, int start, int limit) |
| 203 |
{ |
| 204 |
if (this.ignoreCase) |
| 205 |
return this.matchesIgnoreCase(chars, start, limit); |
| 206 |
int plength = this.pattern.length; |
| 207 |
if (plength == 0) |
| 208 |
return start; |
| 209 |
int index = start + plength; |
| 210 |
while (index <= limit) |
| 211 |
{ |
| 212 |
//System.err.println("Starts at "+index); |
| 213 |
int pindex = plength; |
| 214 |
int nindex = index + 1; |
| 215 |
char ch; |
| 216 |
do |
| 217 |
{ |
| 218 |
if ((ch = chars[--index]) != this.pattern[--pindex]) |
| 219 |
break; |
| 220 |
if (pindex == 0) |
| 221 |
return index; |
| 222 |
} |
| 223 |
while (pindex > 0); |
| 224 |
index += this.shiftTable[ch % this.shiftTable.length] + 1; |
| 225 |
if (index < nindex) |
| 226 |
index = nindex; |
| 227 |
} |
| 228 |
return -1; |
| 229 |
} |
| 230 |
|
| 231 |
int matchesIgnoreCase(CharacterIterator iterator, int start, int limit) |
| 232 |
{ |
| 233 |
int plength = this.pattern.length; |
| 234 |
if (plength == 0) |
| 235 |
return start; |
| 236 |
int index = start + plength; |
| 237 |
while (index <= limit) |
| 238 |
{ |
| 239 |
int pindex = plength; |
| 240 |
int nindex = index + 1; |
| 241 |
char ch; |
| 242 |
do |
| 243 |
{ |
| 244 |
char ch1 = ch = iterator.setIndex(--index); |
| 245 |
char ch2 = this.pattern[--pindex]; |
| 246 |
if (ch1 != ch2) |
| 247 |
{ |
| 248 |
ch1 = Character.toUpperCase(ch1); |
| 249 |
ch2 = Character.toUpperCase(ch2); |
| 250 |
if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) |
| 251 |
break; |
| 252 |
} |
| 253 |
if (pindex == 0) |
| 254 |
return index; |
| 255 |
} |
| 256 |
while (pindex > 0); |
| 257 |
index += this.shiftTable[ch % this.shiftTable.length] + 1; |
| 258 |
if (index < nindex) |
| 259 |
index = nindex; |
| 260 |
} |
| 261 |
return -1; |
| 262 |
} |
| 263 |
|
| 264 |
int matchesIgnoreCase(String text, int start, int limit) |
| 265 |
{ |
| 266 |
int plength = this.pattern.length; |
| 267 |
if (plength == 0) |
| 268 |
return start; |
| 269 |
int index = start + plength; |
| 270 |
while (index <= limit) |
| 271 |
{ |
| 272 |
int pindex = plength; |
| 273 |
int nindex = index + 1; |
| 274 |
char ch; |
| 275 |
do |
| 276 |
{ |
| 277 |
char ch1 = ch = text.charAt(--index); |
| 278 |
char ch2 = this.pattern[--pindex]; |
| 279 |
if (ch1 != ch2) |
| 280 |
{ |
| 281 |
ch1 = Character.toUpperCase(ch1); |
| 282 |
ch2 = Character.toUpperCase(ch2); |
| 283 |
if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) |
| 284 |
break; |
| 285 |
} |
| 286 |
if (pindex == 0) |
| 287 |
return index; |
| 288 |
} |
| 289 |
while (pindex > 0); |
| 290 |
index += this.shiftTable[ch % this.shiftTable.length] + 1; |
| 291 |
if (index < nindex) |
| 292 |
index = nindex; |
| 293 |
} |
| 294 |
return -1; |
| 295 |
} |
| 296 |
|
| 297 |
int matchesIgnoreCase(char[] chars, int start, int limit) |
| 298 |
{ |
| 299 |
int plength = this.pattern.length; |
| 300 |
if (plength == 0) |
| 301 |
return start; |
| 302 |
int index = start + plength; |
| 303 |
while (index <= limit) |
| 304 |
{ |
| 305 |
int pindex = plength; |
| 306 |
int nindex = index + 1; |
| 307 |
char ch; |
| 308 |
do |
| 309 |
{ |
| 310 |
char ch1 = ch = chars[--index]; |
| 311 |
char ch2 = this.pattern[--pindex]; |
| 312 |
if (ch1 != ch2) |
| 313 |
{ |
| 314 |
ch1 = Character.toUpperCase(ch1); |
| 315 |
ch2 = Character.toUpperCase(ch2); |
| 316 |
if (ch1 != ch2 && Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) |
| 317 |
break; |
| 318 |
} |
| 319 |
if (pindex == 0) |
| 320 |
return index; |
| 321 |
} |
| 322 |
while (pindex > 0); |
| 323 |
index += this.shiftTable[ch % this.shiftTable.length] + 1; |
| 324 |
if (index < nindex) |
| 325 |
index = nindex; |
| 326 |
} |
| 327 |
return -1; |
| 328 |
} |
| 329 |
/* |
| 330 |
public static void main(String[] argv) { |
| 331 |
try { |
| 332 |
int[] shiftTable = new int[256]; |
| 333 |
initializeBoyerMoore(argv[0], shiftTable, true); |
| 334 |
int o = -1; |
| 335 |
CharacterIterator ite = new java.text.StringCharacterIterator(argv[1]); |
| 336 |
long start = System.currentTimeMillis(); |
| 337 |
//for (int i = 0; i < 10000; i ++) |
| 338 |
o = searchIgnoreCasesWithBoyerMoore(ite, 0, argv[0], shiftTable); |
| 339 |
start = System.currentTimeMillis()-start; |
| 340 |
System.out.println("Result: "+o+", Elapsed: "+start); |
| 341 |
} catch (Exception ex) { |
| 342 |
ex.printStackTrace(); |
| 343 |
} |
| 344 |
}*/ |
| 345 |
} |
| 346 |
|
| 347 |
public static class Match implements Cloneable { |
| 348 |
int[] beginpos = null; |
| 349 |
int[] endpos = null; |
| 350 |
int nofgroups = 0; |
| 351 |
|
| 352 |
CharacterIterator ciSource = null; |
| 353 |
String strSource = null; |
| 354 |
char[] charSource = null; |
| 355 |
|
| 356 |
/** |
| 357 |
* Creates an instance. |
| 358 |
*/ |
| 359 |
public Match() { |
| 360 |
} |
| 361 |
|
| 362 |
/** |
| 363 |
* |
| 364 |
*/ |
| 365 |
public synchronized Object clone() { |
| 366 |
Match ma = new Match(); |
| 367 |
if (this.nofgroups > 0) { |
| 368 |
ma.setNumberOfGroups(this.nofgroups); |
| 369 |
if (this.ciSource != null) ma.setSource(this.ciSource); |
| 370 |
if (this.strSource != null) ma.setSource(this.strSource); |
| 371 |
for (int i = 0; i < this.nofgroups; i ++) { |
| 372 |
ma.setBeginning(i, this.getBeginning(i)); |
| 373 |
ma.setEnd(i, this.getEnd(i)); |
| 374 |
} |
| 375 |
} |
| 376 |
return ma; |
| 377 |
} |
| 378 |
|
| 379 |
/** |
| 380 |
* |
| 381 |
*/ |
| 382 |
protected void setNumberOfGroups(int n) { |
| 383 |
int oldn = this.nofgroups; |
| 384 |
this.nofgroups = n; |
| 385 |
if (oldn <= 0 |
| 386 |
|| oldn < n || n*2 < oldn) { |
| 387 |
this.beginpos = new int[n]; |
| 388 |
this.endpos = new int[n]; |
| 389 |
} |
| 390 |
for (int i = 0; i < n; i ++) { |
| 391 |
this.beginpos[i] = -1; |
| 392 |
this.endpos[i] = -1; |
| 393 |
} |
| 394 |
} |
| 395 |
|
| 396 |
/** |
| 397 |
* |
| 398 |
*/ |
| 399 |
protected void setSource(CharacterIterator ci) { |
| 400 |
this.ciSource = ci; |
| 401 |
this.strSource = null; |
| 402 |
this.charSource = null; |
| 403 |
} |
| 404 |
/** |
| 405 |
* |
| 406 |
*/ |
| 407 |
protected void setSource(String str) { |
| 408 |
this.ciSource = null; |
| 409 |
this.strSource = str; |
| 410 |
this.charSource = null; |
| 411 |
} |
| 412 |
/** |
| 413 |
* |
| 414 |
*/ |
| 415 |
protected void setSource(char[] chars) { |
| 416 |
this.ciSource = null; |
| 417 |
this.strSource = null; |
| 418 |
this.charSource = chars; |
| 419 |
} |
| 420 |
|
| 421 |
/** |
| 422 |
* |
| 423 |
*/ |
| 424 |
protected void setBeginning(int index, int v) { |
| 425 |
this.beginpos[index] = v; |
| 426 |
} |
| 427 |
|
| 428 |
/** |
| 429 |
* |
| 430 |
*/ |
| 431 |
protected void setEnd(int index, int v) { |
| 432 |
this.endpos[index] = v; |
| 433 |
} |
| 434 |
|
| 435 |
/** |
| 436 |
* Return the number of regular expression groups. |
| 437 |
* This method returns 1 when the regular expression has no capturing-parenthesis. |
| 438 |
*/ |
| 439 |
public int getNumberOfGroups() { |
| 440 |
if (this.nofgroups <= 0) |
| 441 |
throw new IllegalStateException("A result is not set."); |
| 442 |
return this.nofgroups; |
| 443 |
} |
| 444 |
|
| 445 |
/** |
| 446 |
* Return a start position in the target text matched to specified regular expression group. |
| 447 |
* |
| 448 |
* @param index Less than <code>getNumberOfGroups()</code>. |
| 449 |
*/ |
| 450 |
public int getBeginning(int index) { |
| 451 |
if (this.beginpos == null) |
| 452 |
throw new IllegalStateException("A result is not set."); |
| 453 |
if (index < 0 || this.nofgroups <= index) |
| 454 |
throw new IllegalArgumentException("The parameter must be less than " |
| 455 |
+this.nofgroups+": "+index); |
| 456 |
return this.beginpos[index]; |
| 457 |
} |
| 458 |
|
| 459 |
/** |
| 460 |
* Return an end position in the target text matched to specified regular expression group. |
| 461 |
* |
| 462 |
* @param index Less than <code>getNumberOfGroups()</code>. |
| 463 |
*/ |
| 464 |
public int getEnd(int index) { |
| 465 |
if (this.endpos == null) |
| 466 |
throw new IllegalStateException("A result is not set."); |
| 467 |
if (index < 0 || this.nofgroups <= index) |
| 468 |
throw new IllegalArgumentException("The parameter must be less than " |
| 469 |
+this.nofgroups+": "+index); |
| 470 |
return this.endpos[index]; |
| 471 |
} |
| 472 |
|
| 473 |
/** |
| 474 |
* Return an substring of the target text matched to specified regular expression group. |
| 475 |
* |
| 476 |
* @param index Less than <code>getNumberOfGroups()</code>. |
| 477 |
*/ |
| 478 |
public String getCapturedText(int index) { |
| 479 |
if (this.beginpos == null) |
| 480 |
throw new IllegalStateException("match() has never been called."); |
| 481 |
if (index < 0 || this.nofgroups <= index) |
| 482 |
throw new IllegalArgumentException("The parameter must be less than " |
| 483 |
+this.nofgroups+": "+index); |
| 484 |
String ret; |
| 485 |
int begin = this.beginpos[index], end = this.endpos[index]; |
| 486 |
if (begin < 0 || end < 0) return null; |
| 487 |
if (this.ciSource != null) { |
| 488 |
ret = REUtil.substring(this.ciSource, begin, end); |
| 489 |
} else if (this.strSource != null) { |
| 490 |
ret = this.strSource.substring(begin, end); |
| 491 |
} else { |
| 492 |
ret = new String(this.charSource, begin, end-begin); |
| 493 |
} |
| 494 |
return ret; |
| 495 |
} |
| 496 |
} |
| 497 |
|
| 498 |
public final static class REUtil { |
| 499 |
private REUtil() { |
| 500 |
} |
| 501 |
|
| 502 |
static final int composeFromSurrogates(int high, int low) { |
| 503 |
return 0x10000 + ((high-0xd800)<<10) + low-0xdc00; |
| 504 |
} |
| 505 |
|
| 506 |
static final boolean isLowSurrogate(int ch) { |
| 507 |
return (ch & 0xfc00) == 0xdc00; |
| 508 |
} |
| 509 |
|
| 510 |
static final boolean isHighSurrogate(int ch) { |
| 511 |
return (ch & 0xfc00) == 0xd800; |
| 512 |
} |
| 513 |
|
| 514 |
static final String decomposeToSurrogates(int ch) { |
| 515 |
char[] chs = new char[2]; |
| 516 |
ch -= 0x10000; |
| 517 |
chs[0] = (char)((ch>>10)+0xd800); |
| 518 |
chs[1] = (char)((ch&0x3ff)+0xdc00); |
| 519 |
return new String(chs); |
| 520 |
} |
| 521 |
|
| 522 |
static final String substring(CharacterIterator iterator, int begin, int end) { |
| 523 |
char[] src = new char[end-begin]; |
| 524 |
for (int i = 0; i < src.length; i ++) |
| 525 |
src[i] = iterator.setIndex(i+begin); |
| 526 |
return new String(src); |
| 527 |
} |
| 528 |
|
| 529 |
// ================================================================ |
| 530 |
|
| 531 |
static final int getOptionValue(int ch) { |
| 532 |
int ret = 0; |
| 533 |
switch (ch) { |
| 534 |
case 'i': |
| 535 |
ret = RegularExpression.IGNORE_CASE; |
| 536 |
break; |
| 537 |
case 'm': |
| 538 |
ret = RegularExpression.MULTIPLE_LINES; |
| 539 |
break; |
| 540 |
case 's': |
| 541 |
ret = RegularExpression.SINGLE_LINE; |
| 542 |
break; |
| 543 |
case 'x': |
| 544 |
ret = RegularExpression.EXTENDED_COMMENT; |
| 545 |
break; |
| 546 |
case 'u': |
| 547 |
ret = RegularExpression.USE_UNICODE_CATEGORY; |
| 548 |
break; |
| 549 |
case 'w': |
| 550 |
ret = RegularExpression.UNICODE_WORD_BOUNDARY; |
| 551 |
break; |
| 552 |
case 'F': |
| 553 |
ret = RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION; |
| 554 |
break; |
| 555 |
case 'H': |
| 556 |
ret = RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION; |
| 557 |
break; |
| 558 |
case 'X': |
| 559 |
ret = RegularExpression.XMLSCHEMA_MODE; |
| 560 |
break; |
| 561 |
case ',': |
| 562 |
ret = RegularExpression.SPECIAL_COMMA; |
| 563 |
break; |
| 564 |
default: |
| 565 |
} |
| 566 |
return ret; |
| 567 |
} |
| 568 |
|
| 569 |
static final int parseOptions(String opts) throws ParseException { |
| 570 |
if (opts == null) return 0; |
| 571 |
int options = 0; |
| 572 |
for (int i = 0; i < opts.length(); i ++) { |
| 573 |
int v = getOptionValue(opts.charAt(i)); |
| 574 |
if (v == 0) |
| 575 |
throw new ParseException("Unknown Option: "+opts.substring(i), -1); |
| 576 |
options |= v; |
| 577 |
} |
| 578 |
return options; |
| 579 |
} |
| 580 |
|
| 581 |
static final String createOptionString(int options) { |
| 582 |
StringBuffer sb = new StringBuffer(9); |
| 583 |
if ((options & RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION) != 0) |
| 584 |
sb.append('F'); |
| 585 |
if ((options & RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) != 0) |
| 586 |
sb.append('H'); |
| 587 |
if ((options & RegularExpression.XMLSCHEMA_MODE) != 0) |
| 588 |
sb.append('X'); |
| 589 |
if ((options & RegularExpression.IGNORE_CASE) != 0) |
| 590 |
sb.append('i'); |
| 591 |
if ((options & RegularExpression.MULTIPLE_LINES) != 0) |
| 592 |
sb.append('m'); |
| 593 |
if ((options & RegularExpression.SINGLE_LINE) != 0) |
| 594 |
sb.append('s'); |
| 595 |
if ((options & RegularExpression.USE_UNICODE_CATEGORY) != 0) |
| 596 |
sb.append('u'); |
| 597 |
if ((options & RegularExpression.UNICODE_WORD_BOUNDARY) != 0) |
| 598 |
sb.append('w'); |
| 599 |
if ((options & RegularExpression.EXTENDED_COMMENT) != 0) |
| 600 |
sb.append('x'); |
| 601 |
if ((options & RegularExpression.SPECIAL_COMMA) != 0) |
| 602 |
sb.append(','); |
| 603 |
return sb.toString().intern(); |
| 604 |
} |
| 605 |
|
| 606 |
// ================================================================ |
| 607 |
|
| 608 |
static String stripExtendedComment(String regex) { |
| 609 |
int len = regex.length(); |
| 610 |
StringBuffer buffer = new StringBuffer(len); |
| 611 |
int offset = 0; |
| 612 |
while (offset < len) { |
| 613 |
int ch = regex.charAt(offset++); |
| 614 |
// Skips a white space. |
| 615 |
if (ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r' || ch == ' ') |
| 616 |
continue; |
| 617 |
|
| 618 |
if (ch == '#') { // Skips chracters between '#' and a line end. |
| 619 |
while (offset < len) { |
| 620 |
ch = regex.charAt(offset++); |
| 621 |
if (ch == '\r' || ch == '\n') |
| 622 |
break; |
| 623 |
} |
| 624 |
continue; |
| 625 |
} |
| 626 |
|
| 627 |
int next; // Strips an escaped white space. |
| 628 |
if (ch == '\\' && offset < len) { |
| 629 |
if ((next = regex.charAt(offset)) == '#' |
| 630 |
|| next == '\t' || next == '\n' || next == '\f' |
| 631 |
|| next == '\r' || next == ' ') { |
| 632 |
buffer.append((char)next); |
| 633 |
offset ++; |
| 634 |
} else { // Other escaped character. |
| 635 |
buffer.append('\\'); |
| 636 |
buffer.append((char)next); |
| 637 |
offset ++; |
| 638 |
} |
| 639 |
} else // As is. |
| 640 |
buffer.append((char)ch); |
| 641 |
} |
| 642 |
return buffer.toString(); |
| 643 |
} |
| 644 |
|
| 645 |
// ================================================================ |
| 646 |
|
| 647 |
/** |
| 648 |
* Sample entry. |
| 649 |
* <div>Usage: <KBD>org.apache.xerces.utils.regex.REUtil <regex> <string></KBD></div> |
| 650 |
*/ |
| 651 |
public static void main(String[] argv) { |
| 652 |
String pattern = null; |
| 653 |
try { |
| 654 |
String options = ""; |
| 655 |
String target = null; |
| 656 |
if( argv.length == 0 ) { |
| 657 |
System.out.println( "Error:Usage: java REUtil -i|-m|-s|-u|-w|-X regularExpression String" ); |
| 658 |
System.exit( 0 ); |
| 659 |
} |
| 660 |
for (int i = 0; i < argv.length; i ++) { |
| 661 |
if (argv[i].length() == 0 || argv[i].charAt(0) != '-') { |
| 662 |
if (pattern == null) |
| 663 |
pattern = argv[i]; |
| 664 |
else if (target == null) |
| 665 |
target = argv[i]; |
| 666 |
else |
| 667 |
System.err.println("Unnecessary: "+argv[i]); |
| 668 |
} else if (argv[i].equals("-i")) { |
| 669 |
options += "i"; |
| 670 |
} else if (argv[i].equals("-m")) { |
| 671 |
options += "m"; |
| 672 |
} else if (argv[i].equals("-s")) { |
| 673 |
options += "s"; |
| 674 |
} else if (argv[i].equals("-u")) { |
| 675 |
options += "u"; |
| 676 |
} else if (argv[i].equals("-w")) { |
| 677 |
options += "w"; |
| 678 |
} else if (argv[i].equals("-X")) { |
| 679 |
options += "X"; |
| 680 |
} else { |
| 681 |
System.err.println("Unknown option: "+argv[i]); |
| 682 |
} |
| 683 |
} |
| 684 |
RegularExpression reg = new RegularExpression(pattern, options); |
| 685 |
System.out.println("RegularExpression: "+reg); |
| 686 |
Match match = new Match(); |
| 687 |
reg.matches(target, match); |
| 688 |
for (int i = 0; i < match.getNumberOfGroups(); i ++) { |
| 689 |
if (i == 0 ) System.out.print("Matched range for the whole pattern: "); |
| 690 |
else System.out.print("["+i+"]: "); |
| 691 |
if (match.getBeginning(i) < 0) |
| 692 |
System.out.println("-1"); |
| 693 |
else { |
| 694 |
System.out.print(match.getBeginning(i)+", "+match.getEnd(i)+", "); |
| 695 |
System.out.println("\""+match.getCapturedText(i)+"\""); |
| 696 |
} |
| 697 |
} |
| 698 |
} catch (ParseException pe) { |
| 699 |
if (pattern == null) { |
| 700 |
pe.printStackTrace(); |
| 701 |
} else { |
| 702 |
System.err.println("org.apache.xerces.utils.regex.ParseException: "+pe.getMessage()); |
| 703 |
String indent = " "; |
| 704 |
System.err.println(indent+pattern); |
| 705 |
int loc = pe.getLocation(); |
| 706 |
if (loc >= 0) { |
| 707 |
System.err.print(indent); |
| 708 |
for (int i = 0; i < loc; i ++) System.err.print("-"); |
| 709 |
System.err.println("^"); |
| 710 |
} |
| 711 |
} |
| 712 |
} catch (Exception e) { |
| 713 |
e.printStackTrace(); |
| 714 |
} |
| 715 |
} |
| 716 |
|
| 717 |
static final int CACHESIZE = 20; |
| 718 |
static final RegularExpression[] regexCache = new RegularExpression[CACHESIZE]; |
| 719 |
/** |
| 720 |
* Creates a RegularExpression instance. |
| 721 |
* This method caches created instances. |
| 722 |
* |
| 723 |
* @see RegularExpression#RegularExpression(String, String) |
| 724 |
*/ |
| 725 |
public static RegularExpression createRegex(String pattern, String options) |
| 726 |
throws ParseException { |
| 727 |
RegularExpression re = null; |
| 728 |
int intOptions = REUtil.parseOptions(options); |
| 729 |
synchronized (REUtil.regexCache) { |
| 730 |
int i; |
| 731 |
for (i = 0; i < REUtil.CACHESIZE; i ++) { |
| 732 |
RegularExpression cached = REUtil.regexCache[i]; |
| 733 |
if (cached == null) { |
| 734 |
i = -1; |
| 735 |
break; |
| 736 |
} |
| 737 |
if (cached.equals(pattern, intOptions)) { |
| 738 |
re = cached; |
| 739 |
break; |
| 740 |
} |
| 741 |
} |
| 742 |
if (re != null) { |
| 743 |
if (i != 0) { |
| 744 |
System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, i); |
| 745 |
REUtil.regexCache[0] = re; |
| 746 |
} |
| 747 |
} else { |
| 748 |
re = new RegularExpression(pattern, options); |
| 749 |
System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, REUtil.CACHESIZE-1); |
| 750 |
REUtil.regexCache[0] = re; |
| 751 |
} |
| 752 |
} |
| 753 |
return re; |
| 754 |
} |
| 755 |
|
| 756 |
/** |
| 757 |
* |
| 758 |
* @see RegularExpression#matches(String) |
| 759 |
*/ |
| 760 |
public static boolean matches(String regex, String target) throws ParseException { |
| 761 |
return REUtil.createRegex(regex, null).matches(target); |
| 762 |
} |
| 763 |
|
| 764 |
/** |
| 765 |
* |
| 766 |
* @see RegularExpression#matches(String) |
| 767 |
*/ |
| 768 |
public static boolean matches(String regex, String options, String target) throws ParseException { |
| 769 |
return REUtil.createRegex(regex, options).matches(target); |
| 770 |
} |
| 771 |
|
| 772 |
// ================================================================ |
| 773 |
|
| 774 |
/** |
| 775 |
* |
| 776 |
*/ |
| 777 |
public static String quoteMeta(String literal) { |
| 778 |
int len = literal.length(); |
| 779 |
StringBuffer buffer = null; |
| 780 |
for (int i = 0; i < len; i ++) { |
| 781 |
int ch = literal.charAt(i); |
| 782 |
if (".*+?{[()|\\^$".indexOf(ch) >= 0) { |
| 783 |
if (buffer == null) { |
| 784 |
buffer = new StringBuffer(i+(len-i)*2); |
| 785 |
if (i > 0) buffer.append(literal.substring(0, i)); |
| 786 |
} |
| 787 |
buffer.append('\\'); |
| 788 |
buffer.append((char)ch); |
| 789 |
} else if (buffer != null) |
| 790 |
buffer.append((char)ch); |
| 791 |
} |
| 792 |
return buffer != null ? buffer.toString() : literal; |
| 793 |
} |
| 794 |
|
| 795 |
// ================================================================ |
| 796 |
|
| 797 |
static void dumpString(String v) { |
| 798 |
for (int i = 0; i < v.length(); i ++) { |
| 799 |
System.out.print(Integer.toHexString(v.charAt(i))); |
| 800 |
System.out.print(" "); |
| 801 |
} |
| 802 |
System.out.println(); |
| 803 |
} |
| 804 |
} |
| 805 |
|
| 806 |
|
| 807 |
/** |
| 808 |
* A regular expression matching engine using Non-deterministic Finite Automaton (NFA). |
| 809 |
* This engine does not conform to the POSIX regular expression. |
| 810 |
* |
| 811 |
* <hr width="50%"> |
| 812 |
* <h3>How to use</h3> |
| 813 |
* |
| 814 |
* <dl> |
| 815 |
* <dt>A. Standard way |
| 816 |
* <dd> |
| 817 |
* <pre> |
| 818 |
* RegularExpression re = new RegularExpression(<var>regex</var>); |
| 819 |
* if (re.matches(text)) { ... } |
| 820 |
* </pre> |
| 821 |
* |
| 822 |
* <dt>B. Capturing groups |
| 823 |
* <dd> |
| 824 |
* <pre> |
| 825 |
* RegularExpression re = new RegularExpression(<var>regex</var>); |
| 826 |
* Match match = new Match(); |
| 827 |
* if (re.matches(text, match)) { |
| 828 |
* ... // You can refer captured texts with methods of the <code>Match</code> class. |
| 829 |
* } |
| 830 |
* </pre> |
| 831 |
* |
| 832 |
* </dl> |
| 833 |
* |
| 834 |
* <h4>Case-insensitive matching</h4> |
| 835 |
* <pre> |
| 836 |
* RegularExpression re = new RegularExpression(<var>regex</var>, "i"); |
| 837 |
* if (re.matches(text)) { ...}
| 838 |
* </pre> |
| 839 |
* |
| 840 |
* <h4>Options</h4> |
| 841 |
* <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a> |
| 842 |
* or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>. |
| 843 |
* This <var>options</var> parameter consists of the following characters. |
| 844 |
* </p> |
| 845 |
* <dl> |
| 846 |
* <dt><a name="I_OPTION"><code>"i"</code></a> |
| 847 |
* <dd>This option indicates case-insensitive matching. |
| 848 |
* <dt><a name="M_OPTION"><code>"m"</code></a> |
| 849 |
* <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text. |
| 850 |
* <dt><a name="S_OPTION"><code>"s"</code></a> |
| 851 |
* <dd class="REGEX"><kbd>.</kbd> matches any one character. |
| 852 |
* <dt><a name="U_OPTION"><code>"u"</code></a> |
| 853 |
* <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> as becoming to Unicode. |
| 854 |
* <dt><a name="W_OPTION"><code>"w"</code></a> |
| 855 |
* <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of |
| 856 |
* 'Unicode Regular Expression Guidelines' Revision 4. |
| 857 |
* When "w" and "u" are specified at the same time, |
| 858 |
* <kbd>\b \B \< \></kbd> are processed for the "w" option. |
| 859 |
* <dt><a name="COMMA_OPTION"><code>","</code></a> |
| 860 |
* <dd>The parser treats a comma in a character class as a range separator. |
| 861 |
* <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option. |
| 862 |
* <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option. |
| 863 |
* |
| 864 |
* <dt><a name="X_OPTION"><code>"X"</code></a> |
| 865 |
* <dd class="REGEX"> |
| 866 |
* By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
| 867 |
* The <code>match()</code> method does not do substring matching
| 868 |
* but entire string matching. |
| 869 |
* |
| 870 |
* </dl> |
| 871 |
* |
| 872 |
* <hr width="50%"> |
| 873 |
* <h3>Syntax</h3> |
| 874 |
* <table border="1" bgcolor="#ddeeff"> |
| 875 |
* <tr> |
| 876 |
* <td> |
| 877 |
* <h4>Differences from the Perl 5 regular expression</h4> |
| 878 |
* <ul> |
| 879 |
* <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.) |
| 880 |
* <li>Supports subtraction, union, and intersection operations for character classes. |
| 881 |
* <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations), |
| 882 |
* <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>, |
| 883 |
* <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>, |
| 884 |
* <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>, |
| 885 |
* <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd> |
| 886 |
* </ul> |
| 887 |
* </td> |
| 888 |
* </tr> |
| 889 |
* </table> |
| 890 |
* |
| 891 |
* <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P> |
| 892 |
* <ul> |
| 893 |
* <li>Character |
| 894 |
* <dl> |
| 895 |
* <dt class="REGEX"><kbd>.</kbd> (A period) |
| 896 |
* <dd>Matches any one character except the following characters. |
| 897 |
* <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D), |
| 898 |
* PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028) |
| 899 |
* <dd>This expression matches one code point in Unicode. It can match a pair of surrogates. |
| 900 |
* <dd>When <a href="#S_OPTION">the "s" option</a> is specified, |
| 901 |
* it matches any character including the above four characters. |
| 902 |
* |
| 903 |
* <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd> |
| 904 |
* <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A), |
| 905 |
* CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009) |
| 906 |
* |
| 907 |
* <dt class="REGEX"><kbd>\c</kbd><var>C</var> |
| 908 |
* <dd>Matches a control character. |
| 909 |
* The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>', |
| 910 |
* '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'. |
| 911 |
* It matches a control character of which the character code is less than |
| 912 |
* the character code of the <var>C</var> by 0x0040. |
| 913 |
* <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A), |
| 914 |
* and a <kbd>\c[</kbd> matches an ESCAPE (U+001B). |
| 915 |
* |
| 916 |
* <dt class="REGEX">a non-meta character |
| 917 |
* <dd>Matches the character. |
| 918 |
* |
| 919 |
* <dt class="REGEX"><KBD>\</KBD> + a meta character |
| 920 |
* <dd>Matches the meta character. |
| 921 |
* |
| 922 |
* <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> |
| 923 |
* <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode. |
| 924 |
* You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and |
| 925 |
* variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>. |
| 926 |
* |
| 927 |
* <!-- |
| 928 |
* <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var> |
| 929 |
* <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode. |
| 930 |
* --> |
| 931 |
* |
| 932 |
* <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var> |
| 933 |
* <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode. |
| 934 |
* |
| 935 |
* <dt class="REGEX"><kbd>\g</kbd> |
| 936 |
* <dd>Matches a grapheme. |
| 937 |
* <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd> |
| 938 |
* |
| 939 |
* <dt class="REGEX"><kbd>\X</kbd> |
| 940 |
* <dd class="REGEX">Matches a combining character sequence. |
| 941 |
* It is equivalent to <kbd>(?:\PM\pM*)</kbd> |
| 942 |
* </dl> |
| 943 |
* </li> |
| 944 |
* |
| 945 |
* <li>Character class |
| 946 |
* <dl> |
| 947 |
+ * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>) |
| 948 |
+ * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>) |
| 949 |
* <dd>Positive character class. It matches a character in ranges. |
| 950 |
* <dd><var>R<sub>n</sub></var>: |
| 951 |
* <ul> |
| 952 |
* <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>) |
| 953 |
* <p>This range matches the character. |
| 954 |
* <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var> |
| 955 |
* <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point. |
| 956 |
+ * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>, |
| 957 |
+ * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd> |
| 958 |
* <p>... |
| 959 |
* <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd> |
| 960 |
* <p>These expressions specify the same ranges as the following expressions.
| 961 |
* </ul> |
| 962 |
* <p class="REGEX">Enumerated ranges are merged (union operation). |
| 963 |
* <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd> |
| 964 |
* |
| 965 |
* <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>) |
| 966 |
* <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>) |
| 967 |
* <dd>Negative character class. It matches a character not in ranges. |
| 968 |
* |
| 969 |
* <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd> |
| 970 |
* (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.) |
| 971 |
* <dd>Subtraction or union or intersection for character classes. |
| 972 |
* <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
| 973 |
* <dd>The result of these operations is a <u>positive character class</u>
| 974 |
* even if an expression includes any negative character classes. |
| 975 |
* You have to take care on this in case-insensitive matching. |
| 976 |
* For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>, |
| 977 |
* which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching. |
| 978 |
* But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because |
| 979 |
* it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>' |
| 980 |
* though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>. |
| 981 |
* |
| 982 |
* <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt> |
| 983 |
* <dd>Character class subtraction for the XML Schema. |
| 984 |
* You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>. |
| 985 |
* |
| 986 |
* <dt class="REGEX"><kbd>\d</kbd> |
| 987 |
* <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>. |
| 988 |
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| 989 |
* <span class="REGEX"><kbd>\p{Nd}</kbd></span>. |
| 990 |
* |
| 991 |
* <dt class="REGEX"><kbd>\D</kbd> |
| 992 |
* <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd> |
| 993 |
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| 994 |
* <span class="REGEX"><kbd>\P{Nd}</kbd></span>. |
| 995 |
* |
| 996 |
* <dt class="REGEX"><kbd>\s</kbd> |
| 997 |
* <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd> |
| 998 |
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| 999 |
* <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>. |
| 1000 |
* |
| 1001 |
* <dt class="REGEX"><kbd>\S</kbd> |
| 1002 |
* <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd> |
| 1003 |
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| 1004 |
* <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>. |
| 1005 |
* |
| 1006 |
* <dt class="REGEX"><kbd>\w</kbd> |
| 1007 |
* <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd> |
| 1008 |
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| 1009 |
* <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. |
| 1010 |
* |
| 1011 |
* <dt class="REGEX"><kbd>\W</kbd> |
| 1012 |
* <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd> |
| 1013 |
* <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to |
| 1014 |
* <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. |
| 1015 |
* |
| 1016 |
* <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd> |
| 1017 |
* <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>. |
| 1018 |
* The following names are available: |
| 1019 |
* <dl> |
| 1020 |
* <dt>Unicode General Categories: |
| 1021 |
* <dd><kbd> |
| 1022 |
* L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp, |
| 1023 |
* Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So, |
| 1024 |
* </kbd> |
| 1025 |
* <dd>(Currently the Cn category includes U+10000-U+10FFFF characters) |
| 1026 |
* <dt>Unicode Blocks: |
| 1027 |
* <dd><kbd> |
| 1028 |
* Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B, |
| 1029 |
* IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek, |
| 1030 |
* Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati, |
| 1031 |
* Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian, |
| 1032 |
* Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation, |
| 1033 |
* Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols, |
| 1034 |
* Letterlike Symbols, Number Forms, Arrows, Mathematical Operators, |
| 1035 |
* Miscellaneous Technical, Control Pictures, Optical Character Recognition, |
| 1036 |
* Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes, |
| 1037 |
* Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana, |
| 1038 |
* Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, |
| 1039 |
* Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs, |
| 1040 |
* Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates, |
| 1041 |
* Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms, |
| 1042 |
* Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms, |
| 1043 |
* Small Form Variants, Arabic Presentation Forms-B, Specials, |
| 1044 |
* Halfwidth and Fullwidth Forms |
| 1045 |
* </kbd> |
| 1046 |
* <dt>Others: |
| 1047 |
* <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>) |
| 1048 |
* <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
| 1049 |
* <dd><kbd>UNASSIGNED</kbd>
| 1050 |
* (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>) |
| 1051 |
* </dl> |
| 1052 |
* |
| 1053 |
* <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd> |
| 1054 |
* <dd>Matches one character not in the specified General Category or the specified Block. |
| 1055 |
* </dl> |
| 1056 |
* </li> |
| 1057 |
* |
| 1058 |
* <li>Selection and Quantifier |
| 1059 |
* <dl> |
| 1060 |
* <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR> |
| 1061 |
* <dd>... |
| 1062 |
* |
| 1063 |
* <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD> |
| 1064 |
* <dd>Matches 0 or more <var>X</var>. |
| 1065 |
* |
| 1066 |
* <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD> |
| 1067 |
* <dd>Matches 1 or more <var>X</var>. |
| 1068 |
* |
| 1069 |
* <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD> |
| 1070 |
* <dd>Matches 0 or 1 <var>X</var>. |
| 1071 |
* |
| 1072 |
* <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd> |
| 1073 |
* <dd>Matches <var>number</var> times. |
| 1074 |
* |
| 1075 |
* <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd> |
| 1076 |
* <dd>... |
| 1077 |
* |
| 1078 |
* <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd> |
| 1079 |
* <dd>... |
| 1080 |
* |
| 1081 |
* <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd> |
| 1082 |
* <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd> |
| 1083 |
* <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd> |
| 1084 |
* <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd> |
| 1085 |
* <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd> |
| 1086 |
* <dd>Non-greedy matching. |
| 1087 |
* </dl> |
| 1088 |
* </li> |
| 1089 |
* |
| 1090 |
* <li>Grouping, Capturing, and Back-reference |
| 1091 |
* <dl> |
| 1092 |
* <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD> |
| 1093 |
* <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>". |
| 1094 |
* If you want it to match "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
| 1095 |
* you have to write "<KBD>(?:foo)+</KBD>". |
| 1096 |
* |
| 1097 |
* <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD> |
| 1098 |
* <dd>Grouping with capturing. |
| 1099 |
* It makes a group and applications can know
| 1100 |
* where in target text a group matched with methods of a <code>Match</code> instance |
| 1101 |
* after <code><a href="#matches(java.lang.String, org.apache.xerces.utils.regex.Match)">matches(String,Match)</a></code>. |
| 1102 |
* The 0th group means whole of this regular expression. |
| 1103 |
* The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.
| 1104 |
* |
| 1105 |
* <p>For instance, a regular expression is |
| 1106 |
* "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>" |
| 1107 |
* and target text is |
| 1108 |
* "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>": |
| 1109 |
* <ul> |
| 1110 |
* <li><code>Match.getCapturedText(0)</code>: |
| 1111 |
* "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>" |
| 1112 |
* <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>" |
| 1113 |
* <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>" |
| 1114 |
* </ul> |
| 1115 |
* |
| 1116 |
* <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd> |
| 1117 |
* <dd> |
| 1118 |
* |
| 1119 |
* <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd> |
| 1120 |
* <dd>Independent expression group. ................ |
| 1121 |
* |
| 1122 |
* <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd> |
| 1123 |
* <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd> |
| 1124 |
* <dd>............................ |
| 1125 |
* <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'. |
| 1126 |
* Note that it can not contain 'u'. |
| 1127 |
* |
| 1128 |
* <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd> |
| 1129 |
* <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd> |
| 1130 |
* <dd>...... |
| 1131 |
* <dd>These expressions must be at the beginning of a group. |
| 1132 |
* </dl> |
| 1133 |
* </li> |
| 1134 |
* |
| 1135 |
* <li>Anchor |
| 1136 |
* <dl> |
| 1137 |
* <dt class="REGEX"><kbd>\A</kbd> |
| 1138 |
* <dd>Matches the beginning of the text.
| 1139 |
* |
| 1140 |
* <dt class="REGEX"><kbd>\Z</kbd> |
| 1141 |
* <dd>Matches the end of the text, or before an EOL character at the end of the text, |
| 1142 |
* or CARRIAGE RETURN + LINE FEED at the end of the text. |
| 1143 |
* |
| 1144 |
* <dt class="REGEX"><kbd>\z</kbd> |
| 1145 |
* <dd>Matches the end of the text. |
| 1146 |
* |
| 1147 |
* <dt class="REGEX"><kbd>^</kbd> |
| 1148 |
* <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>. |
| 1149 |
* <dd>When <a href="#M_OPTION">a "m" option</a> is set, |
| 1150 |
* it matches the beginning of the text, or after one of EOL characters ( |
| 1151 |
* LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028), |
| 1152 |
* PARAGRAPH SEPARATOR (U+2029).) |
| 1153 |
* |
| 1154 |
* <dt class="REGEX"><kbd>$</kbd> |
| 1155 |
* <dd>Matches the end of the text, or before an EOL character at the end of the text, |
| 1156 |
* or CARRIAGE RETURN + LINE FEED at the end of the text. |
| 1157 |
* <dd>When <a href="#M_OPTION">a "m" option</a> is set, |
| 1158 |
* it matches the end of the text, or before an EOL character. |
| 1159 |
* |
| 1160 |
* <dt class="REGEX"><kbd>\b</kbd> |
| 1161 |
* <dd>Matches word boundary. |
| 1162 |
* (See <a href="#W_OPTION">a "w" option</a>) |
| 1163 |
* |
| 1164 |
* <dt class="REGEX"><kbd>\B</kbd> |
| 1165 |
* <dd>Matches non word boundary. |
| 1166 |
* (See <a href="#W_OPTION">a "w" option</a>) |
| 1167 |
* |
| 1168 |
* <dt class="REGEX"><kbd>\<</kbd> |
| 1169 |
* <dd>Matches the beginning of a word. |
| 1170 |
* (See <a href="#W_OPTION">a "w" option</a>) |
| 1171 |
* |
| 1172 |
* <dt class="REGEX"><kbd>\></kbd> |
| 1173 |
* <dd>Matches the end of a word. |
| 1174 |
* (See <a href="#W_OPTION">a "w" option</a>) |
| 1175 |
* </dl> |
| 1176 |
* </li> |
| 1177 |
* <li>Lookahead and lookbehind |
| 1178 |
* <dl> |
| 1179 |
* <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd> |
| 1180 |
* <dd>Lookahead. |
| 1181 |
* |
| 1182 |
* <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd> |
| 1183 |
* <dd>Negative lookahead. |
| 1184 |
* |
| 1185 |
* <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd> |
| 1186 |
* <dd>Lookbehind. |
| 1187 |
* <dd>(Note for text capturing......) |
| 1188 |
* |
| 1189 |
* <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd> |
| 1190 |
* <dd>Negative lookbehind. |
| 1191 |
* </dl> |
| 1192 |
* </li> |
| 1193 |
* |
| 1194 |
* <li>Misc. |
| 1195 |
* <dl> |
| 1196 |
* <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>, |
| 1197 |
* <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd> |
| 1198 |
* <dd>...... |
| 1199 |
* <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd> |
| 1200 |
* <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'. |
| 1201 |
* You can not write comments in character classes and before quantifiers. |
| 1202 |
* </dl> |
| 1203 |
* </li> |
| 1204 |
* </ul> |
| 1205 |
* |
| 1206 |
* |
| 1207 |
* <hr width="50%"> |
| 1208 |
* <h3>BNF for the regular expression</h3> |
| 1209 |
* <pre> |
| 1210 |
* regex ::= ('(?' options ')')? term ('|' term)* |
| 1211 |
* term ::= factor+ |
| 1212 |
* factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )? |
| 1213 |
* | '(?#' [^)]* ')' |
| 1214 |
* minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}' |
| 1215 |
* atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] |
| 1216 |
* | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X' |
| 1217 |
* | '(?>' regex ')' | '(?' options ':' regex ')' |
| 1218 |
* | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')' |
| 1219 |
* options ::= [imsw]* ('-' [imsw]+)? |
| 1220 |
* anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' |
| 1221 |
* looks ::= '(?=' regex ')' | '(?!' regex ')' |
| 1222 |
* | '(?<=' regex ')' | '(?<!' regex ')' |
| 1223 |
* char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1 |
| 1224 |
* category-block ::= '\' [pP] category-symbol-1 |
| 1225 |
* | ('\p{' | '\P{') (category-symbol | block-name |
| 1226 |
* | other-properties) '}' |
| 1227 |
* category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S' |
| 1228 |
* category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo'
| 1229 |
* | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No' |
| 1230 |
* | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs' |
| 1231 |
* | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po' |
| 1232 |
* | 'Sm' | 'Sc' | 'Sk' | 'So' |
| 1233 |
* block-name ::= (See above) |
| 1234 |
* other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED' |
| 1235 |
* character-1 ::= (any character except meta-characters) |
| 1236 |
* |
| 1237 |
* char-class ::= '[' ranges ']' |
| 1238 |
* | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')' |
| 1239 |
* ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+ |
| 1240 |
* range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block |
| 1241 |
* | range-char | range-char '-' range-char |
| 1242 |
* range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2 |
| 1243 |
* code-point ::= '\x' hex-char hex-char |
| 1244 |
* | '\x{' hex-char+ '}' |
| 1245 |
* <!-- | '\u005c u' hex-char hex-char hex-char hex-char |
| 1246 |
* --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char |
| 1247 |
* hex-char ::= [0-9a-fA-F] |
| 1248 |
* character-2 ::= (any character except \[]-,) |
| 1249 |
* </pre> |
| 1250 |
* |
| 1251 |
* <hr width="50%"> |
| 1252 |
* <h3>to do</h3> |
| 1253 |
* <ul> |
| 1254 |
* <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a> |
| 1255 |
* <ul> |
| 1256 |
* <li>2.4 Canonical Equivalents |
| 1257 |
* <li>Level 3 |
| 1258 |
* </ul> |
| 1259 |
* <li>Parsing performance |
| 1260 |
* </ul> |
| 1261 |
* |
| 1262 |
* <hr width="50%"> |
| 1263 |
* |
| 1264 |
* @author TAMURA Kent <kent@trl.ibm.co.jp> |
| 1265 |
* @version $Id: RegEx.java,v 1.8.4.1 2007/08/14 18:18:44 emerks Exp $ |
| 1266 |
*/ |
| 1267 |
public static class RegularExpression implements java.io.Serializable { |
| 1268 |
static final boolean DEBUG = false; |
| 1269 |
|
| 1270 |
/** |
| 1271 |
* Compiles a token tree into an operation flow. |
| 1272 |
*/ |
| 1273 |
private synchronized void compile(Token tok) { |
| 1274 |
if (this.operations != null) |
| 1275 |
return; |
| 1276 |
this.numberOfClosures = 0; |
| 1277 |
this.operations = this.compile(tok, null, false); |
| 1278 |
} |
| 1279 |
|
| 1280 |
/** |
| 1281 |
* Converts a token to an operation. |
| 1282 |
*/ |
| 1283 |
private Op compile(Token tok, Op next, boolean reverse) { |
| 1284 |
Op ret; |
| 1285 |
switch (tok.type) { |
| 1286 |
case Token.DOT: |
| 1287 |
ret = Op.createDot(); |
| 1288 |
ret.next = next; |
| 1289 |
break; |
| 1290 |
|
| 1291 |
case Token.CHAR: |
| 1292 |
ret = Op.createChar(tok.getChar()); |
| 1293 |
ret.next = next; |
| 1294 |
break; |
| 1295 |
|
| 1296 |
case Token.ANCHOR: |
| 1297 |
ret = Op.createAnchor(tok.getChar()); |
| 1298 |
ret.next = next; |
| 1299 |
break; |
| 1300 |
|
| 1301 |
case Token.RANGE: |
| 1302 |
case Token.NRANGE: |
| 1303 |
ret = Op.createRange(tok); |
| 1304 |
ret.next = next; |
| 1305 |
break; |
| 1306 |
|
| 1307 |
case Token.CONCAT: |
| 1308 |
ret = next; |
| 1309 |
if (!reverse) { |
| 1310 |
for (int i = tok.size()-1; i >= 0; i --) { |
| 1311 |
ret = compile(tok.getChild(i), ret, false); |
| 1312 |
} |
| 1313 |
} else { |
| 1314 |
for (int i = 0; i < tok.size(); i ++) { |
| 1315 |
ret = compile(tok.getChild(i), ret, true); |
| 1316 |
} |
| 1317 |
} |
| 1318 |
break; |
| 1319 |
|
| 1320 |
case Token.UNION: |
| 1321 |
Op.UnionOp uni = Op.createUnion(tok.size()); |
| 1322 |
for (int i = 0; i < tok.size(); i ++) { |
| 1323 |
uni.addElement(compile(tok.getChild(i), next, reverse)); |
| 1324 |
} |
| 1325 |
ret = uni; // ret.next is null. |
| 1326 |
break; |
| 1327 |
|
| 1328 |
case Token.CLOSURE: |
| 1329 |
case Token.NONGREEDYCLOSURE: |
| 1330 |
Token child = tok.getChild(0); |
| 1331 |
int min = tok.getMin(); |
| 1332 |
int max = tok.getMax(); |
| 1333 |
if (min >= 0 && min == max) { // {n} |
| 1334 |
ret = next; |
| 1335 |
for (int i = 0; i < min; i ++) { |
| 1336 |
ret = compile(child, ret, reverse); |
| 1337 |
} |
| 1338 |
break; |
| 1339 |
} |
| 1340 |
if (min > 0 && max > 0) |
| 1341 |
max -= min; |
| 1342 |
if (max > 0) { |
| 1343 |
// X{2,6} -> XX(X(X(XX?)?)?)? |
| 1344 |
ret = next; |
| 1345 |
for (int i = 0; i < max; i ++) { |
| 1346 |
Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE); |
| 1347 |
q.next = next; |
| 1348 |
q.setChild(compile(child, ret, reverse)); |
| 1349 |
ret = q; |
| 1350 |
} |
| 1351 |
} else { |
| 1352 |
Op.ChildOp op; |
| 1353 |
if (tok.type == Token.NONGREEDYCLOSURE) { |
| 1354 |
op = Op.createNonGreedyClosure(); |
| 1355 |
} else { // Token.CLOSURE |
| 1356 |
if (child.getMinLength() == 0) |
| 1357 |
op = Op.createClosure(this.numberOfClosures++); |
| 1358 |
else |
| 1359 |
op = Op.createClosure(-1); |
| 1360 |
} |
| 1361 |
op.next = next; |
| 1362 |
op.setChild(compile(child, op, reverse)); |
| 1363 |
ret = op; |
| 1364 |
} |
| 1365 |
if (min > 0) { |
| 1366 |
for (int i = 0; i < min; i ++) { |
| 1367 |
ret = compile(child, ret, reverse); |
| 1368 |
} |
| 1369 |
} |
| 1370 |
break; |
| 1371 |
|
| 1372 |
case Token.EMPTY: |
| 1373 |
ret = next; |
| 1374 |
break; |
| 1375 |
|
| 1376 |
case Token.STRING: |
| 1377 |
ret = Op.createString(tok.getString()); |
| 1378 |
ret.next = next; |
| 1379 |
break; |
| 1380 |
|
| 1381 |
case Token.BACKREFERENCE: |
| 1382 |
ret = Op.createBackReference(tok.getReferenceNumber()); |
| 1383 |
ret.next = next; |
| 1384 |
break; |
| 1385 |
|
| 1386 |
case Token.PAREN: |
| 1387 |
if (tok.getParenNumber() == 0) { |
| 1388 |
ret = compile(tok.getChild(0), next, reverse); |
| 1389 |
} else if (reverse) { |
| 1390 |
next = Op.createCapture(tok.getParenNumber(), next); |
| 1391 |
next = compile(tok.getChild(0), next, reverse); |
| 1392 |
ret = Op.createCapture(-tok.getParenNumber(), next); |
| 1393 |
} else { |
| 1394 |
next = Op.createCapture(-tok.getParenNumber(), next); |
| 1395 |
next = compile(tok.getChild(0), next, reverse); |
| 1396 |
ret = Op.createCapture(tok.getParenNumber(), next); |
| 1397 |
} |
| 1398 |
break; |
| 1399 |
|
| 1400 |
case Token.LOOKAHEAD: |
| 1401 |
ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false)); |
| 1402 |
break; |
| 1403 |
case Token.NEGATIVELOOKAHEAD: |
| 1404 |
ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false)); |
| 1405 |
break; |
| 1406 |
case Token.LOOKBEHIND: |
| 1407 |
ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true)); |
| 1408 |
break; |
| 1409 |
case Token.NEGATIVELOOKBEHIND: |
| 1410 |
ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true)); |
| 1411 |
break; |
| 1412 |
|
| 1413 |
case Token.INDEPENDENT: |
| 1414 |
ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse)); |
| 1415 |
break; |
| 1416 |
|
| 1417 |
case Token.MODIFIERGROUP: |
| 1418 |
ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse), |
| 1419 |
((Token.ModifierToken)tok).getOptions(), |
| 1420 |
((Token.ModifierToken)tok).getOptionsMask()); |
| 1421 |
break; |
| 1422 |
|
| 1423 |
case Token.CONDITION: |
| 1424 |
Token.ConditionToken ctok = (Token.ConditionToken)tok; |
| 1425 |
int ref = ctok.refNumber; |
| 1426 |
Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse); |
| 1427 |
Op yes = compile(ctok.yes, next, reverse); |
| 1428 |
Op no = ctok.no == null ? null : compile(ctok.no, next, reverse); |
| 1429 |
ret = Op.createCondition(next, ref, condition, yes, no); |
| 1430 |
break; |
| 1431 |
|
| 1432 |
default: |
| 1433 |
throw new RuntimeException("Unknown token type: "+tok.type); |
| 1434 |
} // switch (tok.type) |
| 1435 |
return ret; |
| 1436 |
} |
| 1437 |
|
| 1438 |
|
| 1439 |
// Public |
| 1440 |
|
| 1441 |
/** |
| 1442 |
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| 1443 |
* |
| 1444 |
* @return true if the target is matched to this regular expression. |
| 1445 |
*/ |
| 1446 |
public boolean matches(char[] target) { |
| 1447 |
return this.matches(target, 0, target .length , (Match)null); |
| 1448 |
} |
| 1449 |
|
| 1450 |
/** |
| 1451 |
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern |
| 1452 |
* in specified range or not. |
| 1453 |
* |
| 1454 |
* @param start Start offset of the range. |
| 1455 |
* @param end End offset +1 of the range. |
| 1456 |
* @return true if the target is matched to this regular expression. |
| 1457 |
*/ |
| 1458 |
public boolean matches(char[] target, int start, int end) { |
| 1459 |
return this.matches(target, start, end, (Match)null); |
| 1460 |
} |
| 1461 |
|
| 1462 |
/** |
| 1463 |
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| 1464 |
* |
| 1465 |
* @param match A Match instance for storing matching result. |
| 1466 |
* @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. |
| 1467 |
*/ |
| 1468 |
public boolean matches(char[] target, Match match) { |
| 1469 |
return this.matches(target, 0, target .length , match); |
| 1470 |
} |
| 1471 |
|
| 1472 |
|
| 1473 |
/** |
| 1474 |
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern |
| 1475 |
* in specified range or not. |
| 1476 |
* |
| 1477 |
* @param start Start offset of the range. |
| 1478 |
* @param end End offset +1 of the range. |
| 1479 |
* @param match A Match instance for storing matching result. |
| 1480 |
* @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. |
| 1481 |
*/ |
| 1482 |
public boolean matches(char[] target, int start, int end, Match match) { |
| 1483 |
|
| 1484 |
synchronized (this) { |
| 1485 |
if (this.operations == null) |
| 1486 |
this.prepare(); |
| 1487 |
if (this.context == null) |
| 1488 |
this.context = new Context(); |
| 1489 |
} |
| 1490 |
Context con = null; |
| 1491 |
synchronized (this.context) { |
| 1492 |
con = this.context.inuse ? new Context() : this.context; |
| 1493 |
con.reset(target, start, end, this.numberOfClosures); |
| 1494 |
} |
| 1495 |
if (match != null) { |
| 1496 |
match.setNumberOfGroups(this.nofparen); |
| 1497 |
match.setSource(target); |
| 1498 |
} else if (this.hasBackReferences) { |
| 1499 |
match = new Match(); |
| 1500 |
match.setNumberOfGroups(this.nofparen); |
| 1501 |
// Need not to call setSource() because |
| 1502 |
// a caller can not access this match instance. |
| 1503 |
} |
| 1504 |
con.match = match; |
| 1505 |
|
| 1506 |
if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { |
| 1507 |
int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options); |
| 1508 |
//System.err.println("DEBUG: matchEnd="+matchEnd); |
| 1509 |
if (matchEnd == con.limit) { |
| 1510 |
if (con.match != null) { |
| 1511 |
con.match.setBeginning(0, con.start); |
| 1512 |
con.match.setEnd(0, matchEnd); |
| 1513 |
} |
| 1514 |
con.inuse = false; |
| 1515 |
return true; |
| 1516 |
} |
| 1517 |
return false; |
| 1518 |
} |
| 1519 |
|
| 1520 |
/* |
| 1521 |
* The pattern has only fixed string. |
| 1522 |
* The engine uses Boyer-Moore. |
| 1523 |
*/ |
| 1524 |
if (this.fixedStringOnly) { |
| 1525 |
//System.err.println("DEBUG: fixed-only: "+this.fixedString); |
| 1526 |
int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| 1527 |
if (o >= 0) { |
| 1528 |
if (con.match != null) { |
| 1529 |
con.match.setBeginning(0, o); |
| 1530 |
con.match.setEnd(0, o+this.fixedString.length()); |
| 1531 |
} |
| 1532 |
con.inuse = false; |
| 1533 |
return true; |
| 1534 |
} |
| 1535 |
con.inuse = false; |
| 1536 |
return false; |
| 1537 |
} |
| 1538 |
|
| 1539 |
/* |
| 1540 |
* The pattern contains a fixed string. |
| 1541 |
* The engine checks with Boyer-Moore whether the text contains the fixed string or not. |
| 1542 |
* If not, it return with false. |
| 1543 |
*/ |
| 1544 |
if (this.fixedString != null) { |
| 1545 |
int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| 1546 |
if (o < 0) { |
| 1547 |
//System.err.println("Non-match in fixed-string search."); |
| 1548 |
con.inuse = false; |
| 1549 |
return false; |
| 1550 |
} |
| 1551 |
} |
| 1552 |
|
| 1553 |
int limit = con.limit-this.minlength; |
| 1554 |
int matchStart; |
| 1555 |
int matchEnd = -1; |
| 1556 |
|
| 1557 |
/* |
| 1558 |
* Checks whether the expression starts with ".*". |
| 1559 |
*/ |
| 1560 |
if (this.operations != null |
| 1561 |
&& this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { |
| 1562 |
if (isSet(this.options, SINGLE_LINE)) { |
| 1563 |
matchStart = con.start; |
| 1564 |
matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options); |
| 1565 |
} else { |
| 1566 |
boolean previousIsEOL = true; |
| 1567 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 1568 |
int ch = target [ matchStart ] ; |
| 1569 |
if (isEOLChar(ch)) { |
| 1570 |
previousIsEOL = true; |
| 1571 |
} else { |
| 1572 |
if (previousIsEOL) { |
| 1573 |
if (0 <= (matchEnd = this. matchCharArray (con, this.operations, |
| 1574 |
matchStart, 1, this.options))) |
| 1575 |
break; |
| 1576 |
} |
| 1577 |
previousIsEOL = false; |
| 1578 |
} |
| 1579 |
} |
| 1580 |
} |
| 1581 |
} |
| 1582 |
|
| 1583 |
/* |
| 1584 |
* Optimization against the first character. |
| 1585 |
*/ |
| 1586 |
else if (this.firstChar != null) { |
| 1587 |
//System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); |
| 1588 |
RangeToken range = this.firstChar; |
| 1589 |
if (RegularExpression.isSet(this.options, IGNORE_CASE)) { |
| 1590 |
range = this.firstChar.getCaseInsensitiveToken(); |
| 1591 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 1592 |
int ch = target [ matchStart ] ; |
| 1593 |
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { |
| 1594 |
ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] ); |
| 1595 |
if (!range.match(ch)) continue; |
| 1596 |
} else { |
| 1597 |
if (!range.match(ch)) { |
| 1598 |
char ch1 = Character.toUpperCase((char)ch); |
| 1599 |
if (!range.match(ch1)) |
| 1600 |
if (!range.match(Character.toLowerCase(ch1))) |
| 1601 |
continue; |
| 1602 |
} |
| 1603 |
} |
| 1604 |
if (0 <= (matchEnd = this. matchCharArray (con, this.operations, |
| 1605 |
matchStart, 1, this.options))) |
| 1606 |
break; |
| 1607 |
} |
| 1608 |
} else { |
| 1609 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 1610 |
int ch = target [ matchStart ] ; |
| 1611 |
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) |
| 1612 |
ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] ); |
| 1613 |
if (!range.match(ch)) continue; |
| 1614 |
if (0 <= (matchEnd = this. matchCharArray (con, this.operations, |
| 1615 |
matchStart, 1, this.options))) |
| 1616 |
break; |
| 1617 |
} |
| 1618 |
} |
| 1619 |
} |
| 1620 |
|
| 1621 |
/* |
| 1622 |
* Straightforward matching. |
| 1623 |
*/ |
| 1624 |
else { |
| 1625 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 1626 |
if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options))) |
| 1627 |
break; |
| 1628 |
} |
| 1629 |
} |
| 1630 |
|
| 1631 |
if (matchEnd >= 0) { |
| 1632 |
if (con.match != null) { |
| 1633 |
con.match.setBeginning(0, matchStart); |
| 1634 |
con.match.setEnd(0, matchEnd); |
| 1635 |
} |
| 1636 |
con.inuse = false; |
| 1637 |
return true; |
| 1638 |
} else { |
| 1639 |
con.inuse = false; |
| 1640 |
return false; |
| 1641 |
} |
| 1642 |
} |
| 1643 |
|
| 1644 |
/** |
| 1645 |
* @return -1 when not match; offset of the end of matched string when match. |
| 1646 |
*/ |
| 1647 |
private int matchCharArray (Context con, Op op, int offset, int dx, int opts) { |
| 1648 |
|
| 1649 |
char[] target = con.charTarget; |
| 1650 |
|
| 1651 |
|
| 1652 |
while (true) { |
| 1653 |
if (op == null) |
| 1654 |
return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; |
| 1655 |
if (offset > con.limit || offset < con.start) |
| 1656 |
return -1; |
| 1657 |
switch (op.type) { |
| 1658 |
case Op.CHAR: |
| 1659 |
if (isSet(opts, IGNORE_CASE)) { |
| 1660 |
int ch = op.getData(); |
| 1661 |
if (dx > 0) { |
| 1662 |
if (offset >= con.limit || !matchIgnoreCase(ch, target [ offset ] )) |
| 1663 |
return -1; |
| 1664 |
offset ++; |
| 1665 |
} else { |
| 1666 |
int o1 = offset-1; |
| 1667 |
if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target [ o1 ] )) |
| 1668 |
return -1; |
| 1669 |
offset = o1; |
| 1670 |
} |
| 1671 |
} else { |
| 1672 |
int ch = op.getData(); |
| 1673 |
if (dx > 0) { |
| 1674 |
if (offset >= con.limit || ch != target [ offset ] ) |
| 1675 |
return -1; |
| 1676 |
offset ++; |
| 1677 |
} else { |
| 1678 |
int o1 = offset-1; |
| 1679 |
if (o1 >= con.limit || o1 < 0 || ch != target [ o1 ] ) |
| 1680 |
return -1; |
| 1681 |
offset = o1; |
| 1682 |
} |
| 1683 |
} |
| 1684 |
op = op.next; |
| 1685 |
break; |
| 1686 |
|
| 1687 |
case Op.DOT: |
| 1688 |
if (dx > 0) { |
| 1689 |
if (offset >= con.limit) |
| 1690 |
return -1; |
| 1691 |
int ch = target [ offset ] ; |
| 1692 |
if (isSet(opts, SINGLE_LINE)) { |
| 1693 |
if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) |
| 1694 |
offset ++; |
| 1695 |
} else { |
| 1696 |
if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) |
| 1697 |
ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] ); |
| 1698 |
if (isEOLChar(ch)) |
| 1699 |
return -1; |
| 1700 |
} |
| 1701 |
offset ++; |
| 1702 |
} else { |
| 1703 |
int o1 = offset-1; |
| 1704 |
if (o1 >= con.limit || o1 < 0) |
| 1705 |
return -1; |
| 1706 |
int ch = target [ o1 ] ; |
| 1707 |
if (isSet(opts, SINGLE_LINE)) { |
| 1708 |
if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) |
| 1709 |
o1 --; |
| 1710 |
} else { |
| 1711 |
if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) |
| 1712 |
ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch); |
| 1713 |
if (!isEOLChar(ch)) |
| 1714 |
return -1; |
| 1715 |
} |
| 1716 |
offset = o1; |
| 1717 |
} |
| 1718 |
op = op.next; |
| 1719 |
break; |
| 1720 |
|
| 1721 |
case Op.RANGE: |
| 1722 |
case Op.NRANGE: |
| 1723 |
if (dx > 0) { |
| 1724 |
if (offset >= con.limit) |
| 1725 |
return -1; |
| 1726 |
int ch = target [ offset ] ; |
| 1727 |
if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) |
| 1728 |
ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] ); |
| 1729 |
RangeToken tok = op.getToken(); |
| 1730 |
if (isSet(opts, IGNORE_CASE)) { |
| 1731 |
tok = tok.getCaseInsensitiveToken(); |
| 1732 |
if (!tok.match(ch)) { |
| 1733 |
if (ch >= 0x10000) return -1; |
| 1734 |
char uch; |
| 1735 |
if (!tok.match(uch = Character.toUpperCase((char)ch)) |
| 1736 |
&& !tok.match(Character.toLowerCase(uch))) |
| 1737 |
return -1; |
| 1738 |
} |
| 1739 |
} else { |
| 1740 |
if (!tok.match(ch)) return -1; |
| 1741 |
} |
| 1742 |
offset ++; |
| 1743 |
} else { |
| 1744 |
int o1 = offset-1; |
| 1745 |
if (o1 >= con.limit || o1 < 0) |
| 1746 |
return -1; |
| 1747 |
int ch = target [ o1 ] ; |
| 1748 |
if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) |
| 1749 |
ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch); |
| 1750 |
RangeToken tok = op.getToken(); |
| 1751 |
if (isSet(opts, IGNORE_CASE)) { |
| 1752 |
tok = tok.getCaseInsensitiveToken(); |
| 1753 |
if (!tok.match(ch)) { |
| 1754 |
if (ch >= 0x10000) return -1; |
| 1755 |
char uch; |
| 1756 |
if (!tok.match(uch = Character.toUpperCase((char)ch)) |
| 1757 |
&& !tok.match(Character.toLowerCase(uch))) |
| 1758 |
return -1; |
| 1759 |
} |
| 1760 |
} else { |
| 1761 |
if (!tok.match(ch)) return -1; |
| 1762 |
} |
| 1763 |
offset = o1; |
| 1764 |
} |
| 1765 |
op = op.next; |
| 1766 |
break; |
| 1767 |
|
| 1768 |
case Op.ANCHOR: |
| 1769 |
boolean go = false; |
| 1770 |
switch (op.getData()) { |
| 1771 |
case '^': |
| 1772 |
if (isSet(opts, MULTIPLE_LINES)) { |
| 1773 |
if (!(offset == con.start |
| 1774 |
|| offset > con.start && isEOLChar( target [ offset-1 ] ))) |
| 1775 |
return -1; |
| 1776 |
} else { |
| 1777 |
if (offset != con.start) |
| 1778 |
return -1; |
| 1779 |
} |
| 1780 |
break; |
| 1781 |
|
| 1782 |
case '@': // Internal use only. |
| 1783 |
// The @ always matches line beginnings. |
| 1784 |
if (!(offset == con.start |
| 1785 |
|| offset > con.start && isEOLChar( target [ offset-1 ] ))) |
| 1786 |
return -1; |
| 1787 |
break; |
| 1788 |
|
| 1789 |
case '$': |
| 1790 |
if (isSet(opts, MULTIPLE_LINES)) { |
| 1791 |
if (!(offset == con.limit |
| 1792 |
|| offset < con.limit && isEOLChar( target [ offset ] ))) |
| 1793 |
return -1; |
| 1794 |
} else { |
| 1795 |
if (!(offset == con.limit |
| 1796 |
|| offset+1 == con.limit && isEOLChar( target [ offset ] ) |
| 1797 |
|| offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN |
| 1798 |
&& target [ offset+1 ] == LINE_FEED)) |
| 1799 |
return -1; |
| 1800 |
} |
| 1801 |
break; |
| 1802 |
|
| 1803 |
case 'A': |
| 1804 |
if (offset != con.start) return -1; |
| 1805 |
break; |
| 1806 |
|
| 1807 |
case 'Z': |
| 1808 |
if (!(offset == con.limit |
| 1809 |
|| offset+1 == con.limit && isEOLChar( target [ offset ] ) |
| 1810 |
|| offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN |
| 1811 |
&& target [ offset+1 ] == LINE_FEED)) |
| 1812 |
return -1; |
| 1813 |
break; |
| 1814 |
|
| 1815 |
case 'z': |
| 1816 |
if (offset != con.limit) return -1; |
| 1817 |
break; |
| 1818 |
|
| 1819 |
case 'b': |
| 1820 |
if (con.length == 0) return -1; |
| 1821 |
{ |
| 1822 |
int after = getWordType(target, con.start, con.limit, offset, opts); |
| 1823 |
if (after == WT_IGNORE) return -1; |
| 1824 |
int before = getPreviousWordType(target, con.start, con.limit, offset, opts); |
| 1825 |
if (after == before) return -1; |
| 1826 |
} |
| 1827 |
break; |
| 1828 |
|
| 1829 |
case 'B': |
| 1830 |
if (con.length == 0) |
| 1831 |
go = true; |
| 1832 |
else { |
| 1833 |
int after = getWordType(target, con.start, con.limit, offset, opts); |
| 1834 |
go = after == WT_IGNORE |
| 1835 |
|| after == getPreviousWordType(target, con.start, con.limit, offset, opts); |
| 1836 |
} |
| 1837 |
if (!go) return -1; |
| 1838 |
break; |
| 1839 |
|
| 1840 |
case '<': |
| 1841 |
if (con.length == 0 || offset == con.limit) return -1; |
| 1842 |
if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER |
| 1843 |
|| getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) |
| 1844 |
return -1; |
| 1845 |
break; |
| 1846 |
|
| 1847 |
case '>': |
| 1848 |
if (con.length == 0 || offset == con.start) return -1; |
| 1849 |
if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER |
| 1850 |
|| getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) |
| 1851 |
return -1; |
| 1852 |
break; |
| 1853 |
} // switch anchor type |
| 1854 |
op = op.next; |
| 1855 |
break; |
| 1856 |
|
| 1857 |
case Op.BACKREFERENCE: |
| 1858 |
{ |
| 1859 |
int refno = op.getData(); |
| 1860 |
if (refno <= 0 || refno >= this.nofparen) |
| 1861 |
throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); |
| 1862 |
if (con.match.getBeginning(refno) < 0 |
| 1863 |
|| con.match.getEnd(refno) < 0) |
| 1864 |
return -1; // ******** |
| 1865 |
int o2 = con.match.getBeginning(refno); |
| 1866 |
int literallen = con.match.getEnd(refno)-o2; |
| 1867 |
if (!isSet(opts, IGNORE_CASE)) { |
| 1868 |
if (dx > 0) { |
| 1869 |
if (!regionMatches(target, offset, con.limit, o2, literallen)) |
| 1870 |
return -1; |
| 1871 |
offset += literallen; |
| 1872 |
} else { |
| 1873 |
if (!regionMatches(target, offset-literallen, con.limit, o2, literallen)) |
| 1874 |
return -1; |
| 1875 |
offset -= literallen; |
| 1876 |
} |
| 1877 |
} else { |
| 1878 |
if (dx > 0) { |
| 1879 |
if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen)) |
| 1880 |
return -1; |
| 1881 |
offset += literallen; |
| 1882 |
} else { |
| 1883 |
if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, |
| 1884 |
o2, literallen)) |
| 1885 |
return -1; |
| 1886 |
offset -= literallen; |
| 1887 |
} |
| 1888 |
} |
| 1889 |
} |
| 1890 |
op = op.next; |
| 1891 |
break; |
| 1892 |
case Op.STRING: |
| 1893 |
{ |
| 1894 |
String literal = op.getString(); |
| 1895 |
int literallen = literal.length(); |
| 1896 |
if (!isSet(opts, IGNORE_CASE)) { |
| 1897 |
if (dx > 0) { |
| 1898 |
if (!regionMatches(target, offset, con.limit, literal, literallen)) |
| 1899 |
return -1; |
| 1900 |
offset += literallen; |
| 1901 |
} else { |
| 1902 |
if (!regionMatches(target, offset-literallen, con.limit, literal, literallen)) |
| 1903 |
return -1; |
| 1904 |
offset -= literallen; |
| 1905 |
} |
| 1906 |
} else { |
| 1907 |
if (dx > 0) { |
| 1908 |
if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen)) |
| 1909 |
return -1; |
| 1910 |
offset += literallen; |
| 1911 |
} else { |
| 1912 |
if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, |
| 1913 |
literal, literallen)) |
| 1914 |
return -1; |
| 1915 |
offset -= literallen; |
| 1916 |
} |
| 1917 |
} |
| 1918 |
} |
| 1919 |
op = op.next; |
| 1920 |
break; |
| 1921 |
|
| 1922 |
case Op.CLOSURE: |
| 1923 |
{ |
| 1924 |
/* |
| 1925 |
* Saves current position to avoid |
| 1926 |
* zero-width repeats. |
| 1927 |
*/ |
| 1928 |
int id = op.getData(); |
| 1929 |
if (id >= 0) { |
| 1930 |
int previousOffset = con.offsets[id]; |
| 1931 |
if (previousOffset < 0 || previousOffset != offset) { |
| 1932 |
con.offsets[id] = offset; |
| 1933 |
} else { |
| 1934 |
con.offsets[id] = -1; |
| 1935 |
op = op.next; |
| 1936 |
break; |
| 1937 |
} |
| 1938 |
} |
| 1939 |
|
| 1940 |
int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); |
| 1941 |
if (id >= 0) con.offsets[id] = -1; |
| 1942 |
if (ret >= 0) return ret; |
| 1943 |
op = op.next; |
| 1944 |
} |
| 1945 |
break; |
| 1946 |
|
| 1947 |
case Op.QUESTION: |
| 1948 |
{ |
| 1949 |
int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); |
| 1950 |
if (ret >= 0) return ret; |
| 1951 |
op = op.next; |
| 1952 |
} |
| 1953 |
break; |
| 1954 |
|
| 1955 |
case Op.NONGREEDYCLOSURE: |
| 1956 |
case Op.NONGREEDYQUESTION: |
| 1957 |
{ |
| 1958 |
int ret = this. matchCharArray (con, op.next, offset, dx, opts); |
| 1959 |
if (ret >= 0) return ret; |
| 1960 |
op = op.getChild(); |
| 1961 |
} |
| 1962 |
break; |
| 1963 |
|
| 1964 |
case Op.UNION: |
| 1965 |
for (int i = 0; i < op.size(); i ++) { |
| 1966 |
int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts); |
| 1967 |
if (DEBUG) { |
| 1968 |
System.err.println("UNION: "+i+", ret="+ret); |
| 1969 |
} |
| 1970 |
if (ret >= 0) return ret; |
| 1971 |
} |
| 1972 |
return -1; |
| 1973 |
|
| 1974 |
case Op.CAPTURE: |
| 1975 |
int refno = op.getData(); |
| 1976 |
if (con.match != null && refno > 0) { |
| 1977 |
int save = con.match.getBeginning(refno); |
| 1978 |
con.match.setBeginning(refno, offset); |
| 1979 |
int ret = this. matchCharArray (con, op.next, offset, dx, opts); |
| 1980 |
if (ret < 0) con.match.setBeginning(refno, save); |
| 1981 |
return ret; |
| 1982 |
} else if (con.match != null && refno < 0) { |
| 1983 |
int index = -refno; |
| 1984 |
int save = con.match.getEnd(index); |
| 1985 |
con.match.setEnd(index, offset); |
| 1986 |
int ret = this. matchCharArray (con, op.next, offset, dx, opts); |
| 1987 |
if (ret < 0) con.match.setEnd(index, save); |
| 1988 |
return ret; |
| 1989 |
} |
| 1990 |
op = op.next; |
| 1991 |
break; |
| 1992 |
|
| 1993 |
case Op.LOOKAHEAD: |
| 1994 |
if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1; |
| 1995 |
op = op.next; |
| 1996 |
break; |
| 1997 |
case Op.NEGATIVELOOKAHEAD: |
| 1998 |
if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1; |
| 1999 |
op = op.next; |
| 2000 |
break; |
| 2001 |
case Op.LOOKBEHIND: |
| 2002 |
if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1; |
| 2003 |
op = op.next; |
| 2004 |
break; |
| 2005 |
case Op.NEGATIVELOOKBEHIND: |
| 2006 |
if (0 <= this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1; |
| 2007 |
op = op.next; |
| 2008 |
break; |
| 2009 |
|
| 2010 |
case Op.INDEPENDENT: |
| 2011 |
{ |
| 2012 |
int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); |
| 2013 |
if (ret < 0) return ret; |
| 2014 |
offset = ret; |
| 2015 |
op = op.next; |
| 2016 |
} |
| 2017 |
break; |
| 2018 |
|
| 2019 |
case Op.MODIFIER: |
| 2020 |
{ |
| 2021 |
int localopts = opts; |
| 2022 |
localopts |= op.getData(); |
| 2023 |
localopts &= ~op.getData2(); |
| 2024 |
//System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16)); |
| 2025 |
int ret = this. matchCharArray (con, op.getChild(), offset, dx, localopts); |
| 2026 |
if (ret < 0) return ret; |
| 2027 |
offset = ret; |
| 2028 |
op = op.next; |
| 2029 |
} |
| 2030 |
break; |
| 2031 |
|
| 2032 |
case Op.CONDITION: |
| 2033 |
{ |
| 2034 |
Op.ConditionOp cop = (Op.ConditionOp)op; |
| 2035 |
boolean matchp = false; |
| 2036 |
if (cop.refNumber > 0) { |
| 2037 |
if (cop.refNumber >= this.nofparen) |
| 2038 |
throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); |
| 2039 |
matchp = con.match.getBeginning(cop.refNumber) >= 0 |
| 2040 |
&& con.match.getEnd(cop.refNumber) >= 0; |
| 2041 |
} else { |
| 2042 |
matchp = 0 <= this. matchCharArray (con, cop.condition, offset, dx, opts); |
| 2043 |
} |
| 2044 |
|
| 2045 |
if (matchp) { |
| 2046 |
op = cop.yes; |
| 2047 |
} else if (cop.no != null) { |
| 2048 |
op = cop.no; |
| 2049 |
} else { |
| 2050 |
op = cop.next; |
| 2051 |
} |
| 2052 |
} |
| 2053 |
break; |
| 2054 |
|
| 2055 |
default: |
| 2056 |
throw new RuntimeException("Unknown operation type: "+op.type); |
| 2057 |
} // switch (op.type) |
| 2058 |
} // while |
| 2059 |
} |
| 2060 |
|
| 2061 |
private static final int getPreviousWordType(char[] target, int begin, int end, |
| 2062 |
int offset, int opts) { |
| 2063 |
int ret = getWordType(target, begin, end, --offset, opts); |
| 2064 |
while (ret == WT_IGNORE) |
| 2065 |
ret = getWordType(target, begin, end, --offset, opts); |
| 2066 |
return ret; |
| 2067 |
} |
| 2068 |
|
| 2069 |
private static final int getWordType(char[] target, int begin, int end, |
| 2070 |
int offset, int opts) { |
| 2071 |
if (offset < begin || offset >= end) return WT_OTHER; |
| 2072 |
return getWordType0( target [ offset ] , opts); |
| 2073 |
} |
| 2074 |
|
| 2075 |
|
| 2076 |
|
| 2077 |
private static final boolean regionMatches(char[] target, int offset, int limit, |
| 2078 |
String part, int partlen) { |
| 2079 |
if (offset < 0) return false; |
| 2080 |
if (limit-offset < partlen) |
| 2081 |
return false; |
| 2082 |
int i = 0; |
| 2083 |
while (partlen-- > 0) { |
| 2084 |
if ( target [ offset++ ] != part.charAt(i++)) |
| 2085 |
return false; |
| 2086 |
} |
| 2087 |
return true; |
| 2088 |
} |
| 2089 |
|
| 2090 |
private static final boolean regionMatches(char[] target, int offset, int limit, |
| 2091 |
int offset2, int partlen) { |
| 2092 |
if (offset < 0) return false; |
| 2093 |
if (limit-offset < partlen) |
| 2094 |
return false; |
| 2095 |
int i = offset2; |
| 2096 |
while (partlen-- > 0) { |
| 2097 |
if ( target [ offset++ ] != target [ i++ ] ) |
| 2098 |
return false; |
| 2099 |
} |
| 2100 |
return true; |
| 2101 |
} |
| 2102 |
|
| 2103 |
/** |
| 2104 |
* @see java.lang.String#regionMatches |
| 2105 |
*/ |
| 2106 |
private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit, |
| 2107 |
String part, int partlen) { |
| 2108 |
if (offset < 0) return false; |
| 2109 |
if (limit-offset < partlen) |
| 2110 |
return false; |
| 2111 |
int i = 0; |
| 2112 |
while (partlen-- > 0) { |
| 2113 |
char ch1 = target [ offset++ ] ; |
| 2114 |
char ch2 = part.charAt(i++); |
| 2115 |
if (ch1 == ch2) |
| 2116 |
continue; |
| 2117 |
char uch1 = Character.toUpperCase(ch1); |
| 2118 |
char uch2 = Character.toUpperCase(ch2); |
| 2119 |
if (uch1 == uch2) |
| 2120 |
continue; |
| 2121 |
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) |
| 2122 |
return false; |
| 2123 |
} |
| 2124 |
return true; |
| 2125 |
} |
| 2126 |
|
| 2127 |
private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit, |
| 2128 |
int offset2, int partlen) { |
| 2129 |
if (offset < 0) return false; |
| 2130 |
if (limit-offset < partlen) |
| 2131 |
return false; |
| 2132 |
int i = offset2; |
| 2133 |
while (partlen-- > 0) { |
| 2134 |
char ch1 = target [ offset++ ] ; |
| 2135 |
char ch2 = target [ i++ ] ; |
| 2136 |
if (ch1 == ch2) |
| 2137 |
continue; |
| 2138 |
char uch1 = Character.toUpperCase(ch1); |
| 2139 |
char uch2 = Character.toUpperCase(ch2); |
| 2140 |
if (uch1 == uch2) |
| 2141 |
continue; |
| 2142 |
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) |
| 2143 |
return false; |
| 2144 |
} |
| 2145 |
return true; |
| 2146 |
} |
| 2147 |
|
| 2148 |
|
| 2149 |
|
| 2150 |
|
| 2151 |
/** |
| 2152 |
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| 2153 |
* |
| 2154 |
* @return true if the target is matched to this regular expression. |
| 2155 |
*/ |
| 2156 |
public boolean matches(String target) { |
| 2157 |
return this.matches(target, 0, target .length() , (Match)null); |
| 2158 |
} |
| 2159 |
|
| 2160 |
/** |
| 2161 |
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern |
| 2162 |
* in specified range or not. |
| 2163 |
* |
| 2164 |
* @param start Start offset of the range. |
| 2165 |
* @param end End offset +1 of the range. |
| 2166 |
* @return true if the target is matched to this regular expression. |
| 2167 |
*/ |
| 2168 |
public boolean matches(String target, int start, int end) { |
| 2169 |
return this.matches(target, start, end, (Match)null); |
| 2170 |
} |
| 2171 |
|
| 2172 |
/** |
| 2173 |
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| 2174 |
* |
| 2175 |
* @param match A Match instance for storing matching result. |
| 2176 |
* @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. |
| 2177 |
*/ |
| 2178 |
public boolean matches(String target, Match match) { |
| 2179 |
return this.matches(target, 0, target .length() , match); |
| 2180 |
} |
| 2181 |
|
| 2182 |
/** |
| 2183 |
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern |
| 2184 |
* in specified range or not. |
| 2185 |
* |
| 2186 |
* @param start Start offset of the range. |
| 2187 |
* @param end End offset +1 of the range. |
| 2188 |
* @param match A Match instance for storing matching result. |
| 2189 |
* @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. |
| 2190 |
*/ |
| 2191 |
public boolean matches(String target, int start, int end, Match match) { |
| 2192 |
|
| 2193 |
synchronized (this) { |
| 2194 |
if (this.operations == null) |
| 2195 |
this.prepare(); |
| 2196 |
if (this.context == null) |
| 2197 |
this.context = new Context(); |
| 2198 |
} |
| 2199 |
Context con = null; |
| 2200 |
synchronized (this.context) { |
| 2201 |
con = this.context.inuse ? new Context() : this.context; |
| 2202 |
con.reset(target, start, end, this.numberOfClosures); |
| 2203 |
} |
| 2204 |
if (match != null) { |
| 2205 |
match.setNumberOfGroups(this.nofparen); |
| 2206 |
match.setSource(target); |
| 2207 |
} else if (this.hasBackReferences) { |
| 2208 |
match = new Match(); |
| 2209 |
match.setNumberOfGroups(this.nofparen); |
| 2210 |
// Need not to call setSource() because |
| 2211 |
// a caller can not access this match instance. |
| 2212 |
} |
| 2213 |
con.match = match; |
| 2214 |
|
| 2215 |
if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { |
| 2216 |
if (DEBUG) { |
| 2217 |
System.err.println("target string="+target); |
| 2218 |
} |
| 2219 |
int matchEnd = this. matchString (con, this.operations, con.start, 1, this.options); |
| 2220 |
if (DEBUG) { |
| 2221 |
System.err.println("matchEnd="+matchEnd); |
| 2222 |
System.err.println("con.limit="+con.limit); |
| 2223 |
} |
| 2224 |
if (matchEnd == con.limit) { |
| 2225 |
if (con.match != null) { |
| 2226 |
con.match.setBeginning(0, con.start); |
| 2227 |
con.match.setEnd(0, matchEnd); |
| 2228 |
} |
| 2229 |
con.inuse = false; |
| 2230 |
return true; |
| 2231 |
} |
| 2232 |
return false; |
| 2233 |
} |
| 2234 |
|
| 2235 |
/* |
| 2236 |
* The pattern has only fixed string. |
| 2237 |
* The engine uses Boyer-Moore. |
| 2238 |
*/ |
| 2239 |
if (this.fixedStringOnly) { |
| 2240 |
//System.err.println("DEBUG: fixed-only: "+this.fixedString); |
| 2241 |
int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| 2242 |
if (o >= 0) { |
| 2243 |
if (con.match != null) { |
| 2244 |
con.match.setBeginning(0, o); |
| 2245 |
con.match.setEnd(0, o+this.fixedString.length()); |
| 2246 |
} |
| 2247 |
con.inuse = false; |
| 2248 |
return true; |
| 2249 |
} |
| 2250 |
con.inuse = false; |
| 2251 |
return false; |
| 2252 |
} |
| 2253 |
|
| 2254 |
/* |
| 2255 |
* The pattern contains a fixed string. |
| 2256 |
* The engine checks with Boyer-Moore whether the text contains the fixed string or not. |
| 2257 |
* If not, it return with false. |
| 2258 |
*/ |
| 2259 |
if (this.fixedString != null) { |
| 2260 |
int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| 2261 |
if (o < 0) { |
| 2262 |
//System.err.println("Non-match in fixed-string search."); |
| 2263 |
con.inuse = false; |
| 2264 |
return false; |
| 2265 |
} |
| 2266 |
} |
| 2267 |
|
| 2268 |
int limit = con.limit-this.minlength; |
| 2269 |
int matchStart; |
| 2270 |
int matchEnd = -1; |
| 2271 |
|
| 2272 |
/* |
| 2273 |
* Checks whether the expression starts with ".*". |
| 2274 |
*/ |
| 2275 |
if (this.operations != null |
| 2276 |
&& this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { |
| 2277 |
if (isSet(this.options, SINGLE_LINE)) { |
| 2278 |
matchStart = con.start; |
| 2279 |
matchEnd = this. matchString (con, this.operations, con.start, 1, this.options); |
| 2280 |
} else { |
| 2281 |
boolean previousIsEOL = true; |
| 2282 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 2283 |
int ch = target .charAt( matchStart ) ; |
| 2284 |
if (isEOLChar(ch)) { |
| 2285 |
previousIsEOL = true; |
| 2286 |
} else { |
| 2287 |
if (previousIsEOL) { |
| 2288 |
if (0 <= (matchEnd = this. matchString (con, this.operations, |
| 2289 |
matchStart, 1, this.options))) |
| 2290 |
break; |
| 2291 |
} |
| 2292 |
previousIsEOL = false; |
| 2293 |
} |
| 2294 |
} |
| 2295 |
} |
| 2296 |
} |
| 2297 |
|
| 2298 |
/* |
| 2299 |
* Optimization against the first character. |
| 2300 |
*/ |
| 2301 |
else if (this.firstChar != null) { |
| 2302 |
//System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); |
| 2303 |
RangeToken range = this.firstChar; |
| 2304 |
if (RegularExpression.isSet(this.options, IGNORE_CASE)) { |
| 2305 |
range = this.firstChar.getCaseInsensitiveToken(); |
| 2306 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 2307 |
int ch = target .charAt( matchStart ) ; |
| 2308 |
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { |
| 2309 |
ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) ); |
| 2310 |
if (!range.match(ch)) continue; |
| 2311 |
} else { |
| 2312 |
if (!range.match(ch)) { |
| 2313 |
char ch1 = Character.toUpperCase((char)ch); |
| 2314 |
if (!range.match(ch1)) |
| 2315 |
if (!range.match(Character.toLowerCase(ch1))) |
| 2316 |
continue; |
| 2317 |
} |
| 2318 |
} |
| 2319 |
if (0 <= (matchEnd = this. matchString (con, this.operations, |
| 2320 |
matchStart, 1, this.options))) |
| 2321 |
break; |
| 2322 |
} |
| 2323 |
} else { |
| 2324 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 2325 |
int ch = target .charAt( matchStart ) ; |
| 2326 |
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) |
| 2327 |
ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) ); |
| 2328 |
if (!range.match(ch)) continue; |
| 2329 |
if (0 <= (matchEnd = this. matchString (con, this.operations, |
| 2330 |
matchStart, 1, this.options))) |
| 2331 |
break; |
| 2332 |
} |
| 2333 |
} |
| 2334 |
} |
| 2335 |
|
| 2336 |
/* |
| 2337 |
* Straightforward matching. |
| 2338 |
*/ |
| 2339 |
else { |
| 2340 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 2341 |
if (0 <= (matchEnd = this. matchString (con, this.operations, matchStart, 1, this.options))) |
| 2342 |
break; |
| 2343 |
} |
| 2344 |
} |
| 2345 |
|
| 2346 |
if (matchEnd >= 0) { |
| 2347 |
if (con.match != null) { |
| 2348 |
con.match.setBeginning(0, matchStart); |
| 2349 |
con.match.setEnd(0, matchEnd); |
| 2350 |
} |
| 2351 |
con.inuse = false; |
| 2352 |
return true; |
| 2353 |
} else { |
| 2354 |
con.inuse = false; |
| 2355 |
return false; |
| 2356 |
} |
| 2357 |
} |
| 2358 |
|
| 2359 |
/** |
| 2360 |
* @return -1 when not match; offset of the end of matched string when match. |
| 2361 |
*/ |
| 2362 |
private int matchString (Context con, Op op, int offset, int dx, int opts) { |
| 2363 |
|
| 2364 |
|
| 2365 |
|
| 2366 |
|
| 2367 |
String target = con.strTarget; |
| 2368 |
|
| 2369 |
|
| 2370 |
|
| 2371 |
|
| 2372 |
while (true) { |
| 2373 |
if (op == null) |
| 2374 |
return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; |
| 2375 |
if (offset > con.limit || offset < con.start) |
| 2376 |
return -1; |
| 2377 |
switch (op.type) { |
| 2378 |
case Op.CHAR: |
| 2379 |
if (isSet(opts, IGNORE_CASE)) { |
| 2380 |
int ch = op.getData(); |
| 2381 |
if (dx > 0) { |
| 2382 |
if (offset >= con.limit || !matchIgnoreCase(ch, target .charAt( offset ) )) |
| 2383 |
return -1; |
| 2384 |
offset ++; |
| 2385 |
} else { |
| 2386 |
int o1 = offset-1; |
| 2387 |
if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .charAt( o1 ) )) |
| 2388 |
return -1; |
| 2389 |
offset = o1; |
| 2390 |
} |
| 2391 |
} else { |
| 2392 |
int ch = op.getData(); |
| 2393 |
if (dx > 0) { |
| 2394 |
if (offset >= con.limit || ch != target .charAt( offset ) ) |
| 2395 |
return -1; |
| 2396 |
offset ++; |
| 2397 |
} else { |
| 2398 |
int o1 = offset-1; |
| 2399 |
if (o1 >= con.limit || o1 < 0 || ch != target .charAt( o1 ) ) |
| 2400 |
return -1; |
| 2401 |
offset = o1; |
| 2402 |
} |
| 2403 |
} |
| 2404 |
op = op.next; |
| 2405 |
break; |
| 2406 |
|
| 2407 |
case Op.DOT: |
| 2408 |
if (dx > 0) { |
| 2409 |
if (offset >= con.limit) |
| 2410 |
return -1; |
| 2411 |
int ch = target .charAt( offset ) ; |
| 2412 |
if (isSet(opts, SINGLE_LINE)) { |
| 2413 |
if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) |
| 2414 |
offset ++; |
| 2415 |
} else { |
| 2416 |
if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) |
| 2417 |
ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) ); |
| 2418 |
if (isEOLChar(ch)) |
| 2419 |
return -1; |
| 2420 |
} |
| 2421 |
offset ++; |
| 2422 |
} else { |
| 2423 |
int o1 = offset-1; |
| 2424 |
if (o1 >= con.limit || o1 < 0) |
| 2425 |
return -1; |
| 2426 |
int ch = target .charAt( o1 ) ; |
| 2427 |
if (isSet(opts, SINGLE_LINE)) { |
| 2428 |
if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) |
| 2429 |
o1 --; |
| 2430 |
} else { |
| 2431 |
if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) |
| 2432 |
ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch); |
| 2433 |
if (!isEOLChar(ch)) |
| 2434 |
return -1; |
| 2435 |
} |
| 2436 |
offset = o1; |
| 2437 |
} |
| 2438 |
op = op.next; |
| 2439 |
break; |
| 2440 |
|
| 2441 |
case Op.RANGE: |
| 2442 |
case Op.NRANGE: |
| 2443 |
if (dx > 0) { |
| 2444 |
if (offset >= con.limit) |
| 2445 |
return -1; |
| 2446 |
int ch = target .charAt( offset ) ; |
| 2447 |
if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) |
| 2448 |
ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) ); |
| 2449 |
RangeToken tok = op.getToken(); |
| 2450 |
if (isSet(opts, IGNORE_CASE)) { |
| 2451 |
tok = tok.getCaseInsensitiveToken(); |
| 2452 |
if (!tok.match(ch)) { |
| 2453 |
if (ch >= 0x10000) return -1; |
| 2454 |
char uch; |
| 2455 |
if (!tok.match(uch = Character.toUpperCase((char)ch)) |
| 2456 |
&& !tok.match(Character.toLowerCase(uch))) |
| 2457 |
return -1; |
| 2458 |
} |
| 2459 |
} else { |
| 2460 |
if (!tok.match(ch)) return -1; |
| 2461 |
} |
| 2462 |
offset ++; |
| 2463 |
} else { |
| 2464 |
int o1 = offset-1; |
| 2465 |
if (o1 >= con.limit || o1 < 0) |
| 2466 |
return -1; |
| 2467 |
int ch = target .charAt( o1 ) ; |
| 2468 |
if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) |
| 2469 |
ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch); |
| 2470 |
RangeToken tok = op.getToken(); |
| 2471 |
if (isSet(opts, IGNORE_CASE)) { |
| 2472 |
tok = tok.getCaseInsensitiveToken(); |
| 2473 |
if (!tok.match(ch)) { |
| 2474 |
if (ch >= 0x10000) return -1; |
| 2475 |
char uch; |
| 2476 |
if (!tok.match(uch = Character.toUpperCase((char)ch)) |
| 2477 |
&& !tok.match(Character.toLowerCase(uch))) |
| 2478 |
return -1; |
| 2479 |
} |
| 2480 |
} else { |
| 2481 |
if (!tok.match(ch)) return -1; |
| 2482 |
} |
| 2483 |
offset = o1; |
| 2484 |
} |
| 2485 |
op = op.next; |
| 2486 |
break; |
| 2487 |
|
| 2488 |
case Op.ANCHOR: |
| 2489 |
boolean go = false; |
| 2490 |
switch (op.getData()) { |
| 2491 |
case '^': |
| 2492 |
if (isSet(opts, MULTIPLE_LINES)) { |
| 2493 |
if (!(offset == con.start |
| 2494 |
|| offset > con.start && isEOLChar( target .charAt( offset-1 ) ))) |
| 2495 |
return -1; |
| 2496 |
} else { |
| 2497 |
if (offset != con.start) |
| 2498 |
return -1; |
| 2499 |
} |
| 2500 |
break; |
| 2501 |
|
| 2502 |
case '@': // Internal use only. |
| 2503 |
// The @ always matches line beginnings. |
| 2504 |
if (!(offset == con.start |
| 2505 |
|| offset > con.start && isEOLChar( target .charAt( offset-1 ) ))) |
| 2506 |
return -1; |
| 2507 |
break; |
| 2508 |
|
| 2509 |
case '$': |
| 2510 |
if (isSet(opts, MULTIPLE_LINES)) { |
| 2511 |
if (!(offset == con.limit |
| 2512 |
|| offset < con.limit && isEOLChar( target .charAt( offset ) ))) |
| 2513 |
return -1; |
| 2514 |
} else { |
| 2515 |
if (!(offset == con.limit |
| 2516 |
|| offset+1 == con.limit && isEOLChar( target .charAt( offset ) ) |
| 2517 |
|| offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN |
| 2518 |
&& target .charAt( offset+1 ) == LINE_FEED)) |
| 2519 |
return -1; |
| 2520 |
} |
| 2521 |
break; |
| 2522 |
|
| 2523 |
case 'A': |
| 2524 |
if (offset != con.start) return -1; |
| 2525 |
break; |
| 2526 |
|
| 2527 |
case 'Z': |
| 2528 |
if (!(offset == con.limit |
| 2529 |
|| offset+1 == con.limit && isEOLChar( target .charAt( offset ) ) |
| 2530 |
|| offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN |
| 2531 |
&& target .charAt( offset+1 ) == LINE_FEED)) |
| 2532 |
return -1; |
| 2533 |
break; |
| 2534 |
|
| 2535 |
case 'z': |
| 2536 |
if (offset != con.limit) return -1; |
| 2537 |
break; |
| 2538 |
|
| 2539 |
case 'b': |
| 2540 |
if (con.length == 0) return -1; |
| 2541 |
{ |
| 2542 |
int after = getWordType(target, con.start, con.limit, offset, opts); |
| 2543 |
if (after == WT_IGNORE) return -1; |
| 2544 |
int before = getPreviousWordType(target, con.start, con.limit, offset, opts); |
| 2545 |
if (after == before) return -1; |
| 2546 |
} |
| 2547 |
break; |
| 2548 |
|
| 2549 |
case 'B': |
| 2550 |
if (con.length == 0) |
| 2551 |
go = true; |
| 2552 |
else { |
| 2553 |
int after = getWordType(target, con.start, con.limit, offset, opts); |
| 2554 |
go = after == WT_IGNORE |
| 2555 |
|| after == getPreviousWordType(target, con.start, con.limit, offset, opts); |
| 2556 |
} |
| 2557 |
if (!go) return -1; |
| 2558 |
break; |
| 2559 |
|
| 2560 |
case '<': |
| 2561 |
if (con.length == 0 || offset == con.limit) return -1; |
| 2562 |
if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER |
| 2563 |
|| getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) |
| 2564 |
return -1; |
| 2565 |
break; |
| 2566 |
|
| 2567 |
case '>': |
| 2568 |
if (con.length == 0 || offset == con.start) return -1; |
| 2569 |
if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER |
| 2570 |
|| getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) |
| 2571 |
return -1; |
| 2572 |
break; |
| 2573 |
} // switch anchor type |
| 2574 |
op = op.next; |
| 2575 |
break; |
| 2576 |
|
| 2577 |
case Op.BACKREFERENCE: |
| 2578 |
{ |
| 2579 |
int refno = op.getData(); |
| 2580 |
if (refno <= 0 || refno >= this.nofparen) |
| 2581 |
throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); |
| 2582 |
if (con.match.getBeginning(refno) < 0 |
| 2583 |
|| con.match.getEnd(refno) < 0) |
| 2584 |
return -1; // ******** |
| 2585 |
int o2 = con.match.getBeginning(refno); |
| 2586 |
int literallen = con.match.getEnd(refno)-o2; |
| 2587 |
if (!isSet(opts, IGNORE_CASE)) { |
| 2588 |
if (dx > 0) { |
| 2589 |
if (!regionMatches(target, offset, con.limit, o2, literallen)) |
| 2590 |
return -1; |
| 2591 |
offset += literallen; |
| 2592 |
} else { |
| 2593 |
if (!regionMatches(target, offset-literallen, con.limit, o2, literallen)) |
| 2594 |
return -1; |
| 2595 |
offset -= literallen; |
| 2596 |
} |
| 2597 |
} else { |
| 2598 |
if (dx > 0) { |
| 2599 |
if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen)) |
| 2600 |
return -1; |
| 2601 |
offset += literallen; |
| 2602 |
} else { |
| 2603 |
if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, |
| 2604 |
o2, literallen)) |
| 2605 |
return -1; |
| 2606 |
offset -= literallen; |
| 2607 |
} |
| 2608 |
} |
| 2609 |
} |
| 2610 |
op = op.next; |
| 2611 |
break; |
| 2612 |
case Op.STRING: |
| 2613 |
{ |
| 2614 |
String literal = op.getString(); |
| 2615 |
int literallen = literal.length(); |
| 2616 |
if (!isSet(opts, IGNORE_CASE)) { |
| 2617 |
if (dx > 0) { |
| 2618 |
if (!regionMatches(target, offset, con.limit, literal, literallen)) |
| 2619 |
return -1; |
| 2620 |
offset += literallen; |
| 2621 |
} else { |
| 2622 |
if (!regionMatches(target, offset-literallen, con.limit, literal, literallen)) |
| 2623 |
return -1; |
| 2624 |
offset -= literallen; |
| 2625 |
} |
| 2626 |
} else { |
| 2627 |
if (dx > 0) { |
| 2628 |
if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen)) |
| 2629 |
return -1; |
| 2630 |
offset += literallen; |
| 2631 |
} else { |
| 2632 |
if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, |
| 2633 |
literal, literallen)) |
| 2634 |
return -1; |
| 2635 |
offset -= literallen; |
| 2636 |
} |
| 2637 |
} |
| 2638 |
} |
| 2639 |
op = op.next; |
| 2640 |
break; |
| 2641 |
|
| 2642 |
case Op.CLOSURE: |
| 2643 |
{ |
| 2644 |
/* |
| 2645 |
* Saves current position to avoid |
| 2646 |
* zero-width repeats. |
| 2647 |
*/ |
| 2648 |
int id = op.getData(); |
| 2649 |
if (id >= 0) { |
| 2650 |
int previousOffset = con.offsets[id]; |
| 2651 |
if (previousOffset < 0 || previousOffset != offset) { |
| 2652 |
con.offsets[id] = offset; |
| 2653 |
} else { |
| 2654 |
con.offsets[id] = -1; |
| 2655 |
op = op.next; |
| 2656 |
break; |
| 2657 |
} |
| 2658 |
} |
| 2659 |
int ret = this. matchString (con, op.getChild(), offset, dx, opts); |
| 2660 |
if (id >= 0) con.offsets[id] = -1; |
| 2661 |
if (ret >= 0) return ret; |
| 2662 |
op = op.next; |
| 2663 |
} |
| 2664 |
break; |
| 2665 |
|
| 2666 |
case Op.QUESTION: |
| 2667 |
{ |
| 2668 |
int ret = this. matchString (con, op.getChild(), offset, dx, opts); |
| 2669 |
if (ret >= 0) return ret; |
| 2670 |
op = op.next; |
| 2671 |
} |
| 2672 |
break; |
| 2673 |
|
| 2674 |
case Op.NONGREEDYCLOSURE: |
| 2675 |
case Op.NONGREEDYQUESTION: |
| 2676 |
{ |
| 2677 |
int ret = this. matchString (con, op.next, offset, dx, opts); |
| 2678 |
if (ret >= 0) return ret; |
| 2679 |
op = op.getChild(); |
| 2680 |
} |
| 2681 |
break; |
| 2682 |
|
| 2683 |
case Op.UNION: |
| 2684 |
for (int i = 0; i < op.size(); i ++) { |
| 2685 |
int ret = this. matchString (con, op.elementAt(i), offset, dx, opts); |
| 2686 |
if (DEBUG) { |
| 2687 |
System.err.println("UNION: "+i+", ret="+ret); |
| 2688 |
} |
| 2689 |
if (ret >= 0) return ret; |
| 2690 |
} |
| 2691 |
return -1; |
| 2692 |
|
| 2693 |
case Op.CAPTURE: |
| 2694 |
int refno = op.getData(); |
| 2695 |
if (con.match != null && refno > 0) { |
| 2696 |
int save = con.match.getBeginning(refno); |
| 2697 |
con.match.setBeginning(refno, offset); |
| 2698 |
int ret = this. matchString (con, op.next, offset, dx, opts); |
| 2699 |
if (ret < 0) con.match.setBeginning(refno, save); |
| 2700 |
return ret; |
| 2701 |
} else if (con.match != null && refno < 0) { |
| 2702 |
int index = -refno; |
| 2703 |
int save = con.match.getEnd(index); |
| 2704 |
con.match.setEnd(index, offset); |
| 2705 |
int ret = this. matchString (con, op.next, offset, dx, opts); |
| 2706 |
if (ret < 0) con.match.setEnd(index, save); |
| 2707 |
return ret; |
| 2708 |
} |
| 2709 |
op = op.next; |
| 2710 |
break; |
| 2711 |
|
| 2712 |
case Op.LOOKAHEAD: |
| 2713 |
if (0 > this. matchString (con, op.getChild(), offset, 1, opts)) return -1; |
| 2714 |
op = op.next; |
| 2715 |
break; |
| 2716 |
case Op.NEGATIVELOOKAHEAD: |
| 2717 |
if (0 <= this. matchString (con, op.getChild(), offset, 1, opts)) return -1; |
| 2718 |
op = op.next; |
| 2719 |
break; |
| 2720 |
case Op.LOOKBEHIND: |
| 2721 |
if (0 > this. matchString (con, op.getChild(), offset, -1, opts)) return -1; |
| 2722 |
op = op.next; |
| 2723 |
break; |
| 2724 |
case Op.NEGATIVELOOKBEHIND: |
| 2725 |
if (0 <= this. matchString (con, op.getChild(), offset, -1, opts)) return -1; |
| 2726 |
op = op.next; |
| 2727 |
break; |
| 2728 |
|
| 2729 |
case Op.INDEPENDENT: |
| 2730 |
{ |
| 2731 |
int ret = this. matchString (con, op.getChild(), offset, dx, opts); |
| 2732 |
if (ret < 0) return ret; |
| 2733 |
offset = ret; |
| 2734 |
op = op.next; |
| 2735 |
} |
| 2736 |
break; |
| 2737 |
|
| 2738 |
case Op.MODIFIER: |
| 2739 |
{ |
| 2740 |
int localopts = opts; |
| 2741 |
localopts |= op.getData(); |
| 2742 |
localopts &= ~op.getData2(); |
| 2743 |
//System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16)); |
| 2744 |
int ret = this. matchString (con, op.getChild(), offset, dx, localopts); |
| 2745 |
if (ret < 0) return ret; |
| 2746 |
offset = ret; |
| 2747 |
op = op.next; |
| 2748 |
} |
| 2749 |
break; |
| 2750 |
|
| 2751 |
case Op.CONDITION: |
| 2752 |
{ |
| 2753 |
Op.ConditionOp cop = (Op.ConditionOp)op; |
| 2754 |
boolean matchp = false; |
| 2755 |
if (cop.refNumber > 0) { |
| 2756 |
if (cop.refNumber >= this.nofparen) |
| 2757 |
throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); |
| 2758 |
matchp = con.match.getBeginning(cop.refNumber) >= 0 |
| 2759 |
&& con.match.getEnd(cop.refNumber) >= 0; |
| 2760 |
} else { |
| 2761 |
matchp = 0 <= this. matchString (con, cop.condition, offset, dx, opts); |
| 2762 |
} |
| 2763 |
|
| 2764 |
if (matchp) { |
| 2765 |
op = cop.yes; |
| 2766 |
} else if (cop.no != null) { |
| 2767 |
op = cop.no; |
| 2768 |
} else { |
| 2769 |
op = cop.next; |
| 2770 |
} |
| 2771 |
} |
| 2772 |
break; |
| 2773 |
|
| 2774 |
default: |
| 2775 |
throw new RuntimeException("Unknown operation type: "+op.type); |
| 2776 |
} // switch (op.type) |
| 2777 |
} // while |
| 2778 |
} |
| 2779 |
|
| 2780 |
private static final int getPreviousWordType(String target, int begin, int end, |
| 2781 |
int offset, int opts) { |
| 2782 |
int ret = getWordType(target, begin, end, --offset, opts); |
| 2783 |
while (ret == WT_IGNORE) |
| 2784 |
ret = getWordType(target, begin, end, --offset, opts); |
| 2785 |
return ret; |
| 2786 |
} |
| 2787 |
|
| 2788 |
private static final int getWordType(String target, int begin, int end, |
| 2789 |
int offset, int opts) { |
| 2790 |
if (offset < begin || offset >= end) return WT_OTHER; |
| 2791 |
return getWordType0( target .charAt( offset ) , opts); |
| 2792 |
} |
| 2793 |
|
| 2794 |
|
| 2795 |
private static final boolean regionMatches(String text, int offset, int limit, |
| 2796 |
String part, int partlen) { |
| 2797 |
if (limit-offset < partlen) return false; |
| 2798 |
return text.regionMatches(offset, part, 0, partlen); |
| 2799 |
} |
| 2800 |
|
| 2801 |
private static final boolean regionMatches(String text, int offset, int limit, |
| 2802 |
int offset2, int partlen) { |
| 2803 |
if (limit-offset < partlen) return false; |
| 2804 |
return text.regionMatches(offset, text, offset2, partlen); |
| 2805 |
} |
| 2806 |
|
| 2807 |
private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit, |
| 2808 |
String part, int partlen) { |
| 2809 |
return text.regionMatches(true, offset, part, 0, partlen); |
| 2810 |
} |
| 2811 |
|
| 2812 |
private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit, |
| 2813 |
int offset2, int partlen) { |
| 2814 |
if (limit-offset < partlen) return false; |
| 2815 |
return text.regionMatches(true, offset, text, offset2, partlen); |
| 2816 |
} |
| 2817 |
|
| 2818 |
|
| 2819 |
|
| 2820 |
|
| 2821 |
|
| 2822 |
|
| 2823 |
|
| 2824 |
/** |
| 2825 |
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| 2826 |
* |
| 2827 |
* @return true if the target is matched to this regular expression. |
| 2828 |
*/ |
| 2829 |
public boolean matches(CharacterIterator target) { |
| 2830 |
return this.matches(target, (Match)null); |
| 2831 |
} |
| 2832 |
|
| 2833 |
|
| 2834 |
/** |
| 2835 |
* Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. |
| 2836 |
* |
| 2837 |
* @param match A Match instance for storing matching result. |
| 2838 |
* @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. |
| 2839 |
*/ |
| 2840 |
public boolean matches(CharacterIterator target, Match match) { |
| 2841 |
int start = target.getBeginIndex(); |
| 2842 |
int end = target.getEndIndex(); |
| 2843 |
|
| 2844 |
|
| 2845 |
|
| 2846 |
synchronized (this) { |
| 2847 |
if (this.operations == null) |
| 2848 |
this.prepare(); |
| 2849 |
if (this.context == null) |
| 2850 |
this.context = new Context(); |
| 2851 |
} |
| 2852 |
Context con = null; |
| 2853 |
synchronized (this.context) { |
| 2854 |
con = this.context.inuse ? new Context() : this.context; |
| 2855 |
con.reset(target, start, end, this.numberOfClosures); |
| 2856 |
} |
| 2857 |
if (match != null) { |
| 2858 |
match.setNumberOfGroups(this.nofparen); |
| 2859 |
match.setSource(target); |
| 2860 |
} else if (this.hasBackReferences) { |
| 2861 |
match = new Match(); |
| 2862 |
match.setNumberOfGroups(this.nofparen); |
| 2863 |
// Need not to call setSource() because |
| 2864 |
// a caller can not access this match instance. |
| 2865 |
} |
| 2866 |
con.match = match; |
| 2867 |
|
| 2868 |
if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { |
| 2869 |
int matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options); |
| 2870 |
//System.err.println("DEBUG: matchEnd="+matchEnd); |
| 2871 |
if (matchEnd == con.limit) { |
| 2872 |
if (con.match != null) { |
| 2873 |
con.match.setBeginning(0, con.start); |
| 2874 |
con.match.setEnd(0, matchEnd); |
| 2875 |
} |
| 2876 |
con.inuse = false; |
| 2877 |
return true; |
| 2878 |
} |
| 2879 |
return false; |
| 2880 |
} |
| 2881 |
|
| 2882 |
/* |
| 2883 |
* The pattern has only fixed string. |
| 2884 |
* The engine uses Boyer-Moore. |
| 2885 |
*/ |
| 2886 |
if (this.fixedStringOnly) { |
| 2887 |
//System.err.println("DEBUG: fixed-only: "+this.fixedString); |
| 2888 |
int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| 2889 |
if (o >= 0) { |
| 2890 |
if (con.match != null) { |
| 2891 |
con.match.setBeginning(0, o); |
| 2892 |
con.match.setEnd(0, o+this.fixedString.length()); |
| 2893 |
} |
| 2894 |
con.inuse = false; |
| 2895 |
return true; |
| 2896 |
} |
| 2897 |
con.inuse = false; |
| 2898 |
return false; |
| 2899 |
} |
| 2900 |
|
| 2901 |
/* |
| 2902 |
* The pattern contains a fixed string. |
| 2903 |
* The engine checks with Boyer-Moore whether the text contains the fixed string or not. |
| 2904 |
* If not, it return with false. |
| 2905 |
*/ |
| 2906 |
if (this.fixedString != null) { |
| 2907 |
int o = this.fixedStringTable.matches(target, con.start, con.limit); |
| 2908 |
if (o < 0) { |
| 2909 |
//System.err.println("Non-match in fixed-string search."); |
| 2910 |
con.inuse = false; |
| 2911 |
return false; |
| 2912 |
} |
| 2913 |
} |
| 2914 |
|
| 2915 |
int limit = con.limit-this.minlength; |
| 2916 |
int matchStart; |
| 2917 |
int matchEnd = -1; |
| 2918 |
|
| 2919 |
/* |
| 2920 |
* Checks whether the expression starts with ".*". |
| 2921 |
*/ |
| 2922 |
if (this.operations != null |
| 2923 |
&& this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { |
| 2924 |
if (isSet(this.options, SINGLE_LINE)) { |
| 2925 |
matchStart = con.start; |
| 2926 |
matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options); |
| 2927 |
} else { |
| 2928 |
boolean previousIsEOL = true; |
| 2929 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 2930 |
int ch = target .setIndex( matchStart ) ; |
| 2931 |
if (isEOLChar(ch)) { |
| 2932 |
previousIsEOL = true; |
| 2933 |
} else { |
| 2934 |
if (previousIsEOL) { |
| 2935 |
if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, |
| 2936 |
matchStart, 1, this.options))) |
| 2937 |
break; |
| 2938 |
} |
| 2939 |
previousIsEOL = false; |
| 2940 |
} |
| 2941 |
} |
| 2942 |
} |
| 2943 |
} |
| 2944 |
|
| 2945 |
/* |
| 2946 |
* Optimization against the first character. |
| 2947 |
*/ |
| 2948 |
else if (this.firstChar != null) { |
| 2949 |
//System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); |
| 2950 |
RangeToken range = this.firstChar; |
| 2951 |
if (RegularExpression.isSet(this.options, IGNORE_CASE)) { |
| 2952 |
range = this.firstChar.getCaseInsensitiveToken(); |
| 2953 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 2954 |
int ch = target .setIndex( matchStart ) ; |
| 2955 |
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { |
| 2956 |
ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) ); |
| 2957 |
if (!range.match(ch)) continue; |
| 2958 |
} else { |
| 2959 |
if (!range.match(ch)) { |
| 2960 |
char ch1 = Character.toUpperCase((char)ch); |
| 2961 |
if (!range.match(ch1)) |
| 2962 |
if (!range.match(Character.toLowerCase(ch1))) |
| 2963 |
continue; |
| 2964 |
} |
| 2965 |
} |
| 2966 |
if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, |
| 2967 |
matchStart, 1, this.options))) |
| 2968 |
break; |
| 2969 |
} |
| 2970 |
} else { |
| 2971 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 2972 |
int ch = target .setIndex( matchStart ) ; |
| 2973 |
if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) |
| 2974 |
ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) ); |
| 2975 |
if (!range.match(ch)) continue; |
| 2976 |
if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, |
| 2977 |
matchStart, 1, this.options))) |
| 2978 |
break; |
| 2979 |
} |
| 2980 |
} |
| 2981 |
} |
| 2982 |
|
| 2983 |
/* |
| 2984 |
* Straightforward matching. |
| 2985 |
*/ |
| 2986 |
else { |
| 2987 |
for (matchStart = con.start; matchStart <= limit; matchStart ++) { |
| 2988 |
if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, matchStart, 1, this.options))) |
| 2989 |
break; |
| 2990 |
} |
| 2991 |
} |
| 2992 |
|
| 2993 |
if (matchEnd >= 0) { |
| 2994 |
if (con.match != null) { |
| 2995 |
con.match.setBeginning(0, matchStart); |
| 2996 |
con.match.setEnd(0, matchEnd); |
| 2997 |
} |
| 2998 |
con.inuse = false; |
| 2999 |
return true; |
| 3000 |
} else { |
| 3001 |
con.inuse = false; |
| 3002 |
return false; |
| 3003 |
} |
| 3004 |
} |
| 3005 |
|
| 3006 |
/** |
| 3007 |
* @return -1 when not match; offset of the end of matched string when match. |
| 3008 |
*/ |
| 3009 |
private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) { |
| 3010 |
|
| 3011 |
|
| 3012 |
CharacterIterator target = con.ciTarget; |
| 3013 |
|
| 3014 |
|
| 3015 |
|
| 3016 |
|
| 3017 |
|
| 3018 |
|
| 3019 |
while (true) { |
| 3020 |
if (op == null) |
| 3021 |
return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; |
| 3022 |
if (offset > con.limit || offset < con.start) |
| 3023 |
return -1; |
| 3024 |
switch (op.type) { |
| 3025 |
case Op.CHAR: |
| 3026 |
if (isSet(opts, IGNORE_CASE)) { |
| 3027 |
int ch = op.getData(); |
| 3028 |
if (dx > 0) { |
| 3029 |
if (offset >= con.limit || !matchIgnoreCase(ch, target .setIndex( offset ) )) |
| 3030 |
return -1; |
| 3031 |
offset ++; |
| 3032 |
} else { |
| 3033 |
int o1 = offset-1; |
| 3034 |
if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .setIndex( o1 ) )) |
| 3035 |
return -1; |
| 3036 |
offset = o1; |
| 3037 |
} |
| 3038 |
} else { |
| 3039 |
int ch = op.getData(); |
| 3040 |
if (dx > 0) { |
| 3041 |
if (offset >= con.limit || ch != target .setIndex( offset ) ) |
| 3042 |
return -1; |
| 3043 |
offset ++; |
| 3044 |
} else { |
| 3045 |
int o1 = offset-1; |
| 3046 |
if (o1 >= con.limit || o1 < 0 || ch != target .setIndex( o1 ) ) |
| 3047 |
return -1; |
| 3048 |
offset = o1; |
| 3049 |
} |
| 3050 |
} |
| 3051 |
op = op.next; |
| 3052 |
break; |
| 3053 |
|
| 3054 |
case Op.DOT: |
| 3055 |
if (dx > 0) { |
| 3056 |
if (offset >= con.limit) |
| 3057 |
return -1; |
| 3058 |
int ch = target .setIndex( offset ) ; |
| 3059 |
if (isSet(opts, SINGLE_LINE)) { |
| 3060 |
if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) |
| 3061 |
offset ++; |
| 3062 |
} else { |
| 3063 |
if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) |
| 3064 |
ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) ); |
| 3065 |
if (isEOLChar(ch)) |
| 3066 |
return -1; |
| 3067 |
} |
| 3068 |
offset ++; |
| 3069 |
} else { |
| 3070 |
int o1 = offset-1; |
| 3071 |
if (o1 >= con.limit || o1 < 0) |
| 3072 |
return -1; |
| 3073 |
int ch = target .setIndex( o1 ) ; |
| 3074 |
if (isSet(opts, SINGLE_LINE)) { |
| 3075 |
if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) |
| 3076 |
o1 --; |
| 3077 |
} else { |
| 3078 |
if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) |
| 3079 |
ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch); |
| 3080 |
if (!isEOLChar(ch)) |
| 3081 |
return -1; |
| 3082 |
} |
| 3083 |
offset = o1; |
| 3084 |
} |
| 3085 |
op = op.next; |
| 3086 |
break; |
| 3087 |
|
| 3088 |
case Op.RANGE: |
| 3089 |
case Op.NRANGE: |
| 3090 |
if (dx > 0) { |
| 3091 |
if (offset >= con.limit) |
| 3092 |
return -1; |
| 3093 |
int ch = target .setIndex( offset ) ; |
| 3094 |
if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) |
| 3095 |
ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) ); |
| 3096 |
RangeToken tok = op.getToken(); |
| 3097 |
if (isSet(opts, IGNORE_CASE)) { |
| 3098 |
tok = tok.getCaseInsensitiveToken(); |
| 3099 |
if (!tok.match(ch)) { |
| 3100 |
if (ch >= 0x10000) return -1; |
| 3101 |
char uch; |
| 3102 |
if (!tok.match(uch = Character.toUpperCase((char)ch)) |
| 3103 |
&& !tok.match(Character.toLowerCase(uch))) |
| 3104 |
return -1; |
| 3105 |
} |
| 3106 |
} else { |
| 3107 |
if (!tok.match(ch)) return -1; |
| 3108 |
} |
| 3109 |
offset ++; |
| 3110 |
} else { |
| 3111 |
int o1 = offset-1; |
| 3112 |
if (o1 >= con.limit || o1 < 0) |
| 3113 |
return -1; |
| 3114 |
int ch = target .setIndex( o1 ) ; |
| 3115 |
if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) |
| 3116 |
ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch); |
| 3117 |
RangeToken tok = op.getToken(); |
| 3118 |
if (isSet(opts, IGNORE_CASE)) { |
| 3119 |
tok = tok.getCaseInsensitiveToken(); |
| 3120 |
if (!tok.match(ch)) { |
| 3121 |
if (ch >= 0x10000) return -1; |
| 3122 |
char uch; |
| 3123 |
if (!tok.match(uch = Character.toUpperCase((char)ch)) |
| 3124 |
&& !tok.match(Character.toLowerCase(uch))) |
| 3125 |
return -1; |
| 3126 |
} |
| 3127 |
} else { |
| 3128 |
if (!tok.match(ch)) return -1; |
| 3129 |
} |
| 3130 |
offset = o1; |
| 3131 |
} |
| 3132 |
op = op.next; |
| 3133 |
break; |
| 3134 |
|
| 3135 |
case Op.ANCHOR: |
| 3136 |
boolean go = false; |
| 3137 |
switch (op.getData()) { |
| 3138 |
case '^': |
| 3139 |
if (isSet(opts, MULTIPLE_LINES)) { |
| 3140 |
if (!(offset == con.start |
| 3141 |
|| offset > con.start && isEOLChar( target .setIndex( offset-1 ) ))) |
| 3142 |
return -1; |
| 3143 |
} else { |
| 3144 |
if (offset != con.start) |
| 3145 |
return -1; |
| 3146 |
} |
| 3147 |
break; |
| 3148 |
|
| 3149 |
case '@': // Internal use only. |
| 3150 |
// The @ always matches line beginnings. |
| 3151 |
if (!(offset == con.start |
| 3152 |
|| offset > con.start && isEOLChar( target .setIndex( offset-1 ) ))) |
| 3153 |
return -1; |
| 3154 |
break; |
| 3155 |
|
| 3156 |
case '$': |
| 3157 |
if (isSet(opts, MULTIPLE_LINES)) { |
| 3158 |
if (!(offset == con.limit |
| 3159 |
|| offset < con.limit && isEOLChar( target .setIndex( offset ) ))) |
| 3160 |
return -1; |
| 3161 |
} else { |
| 3162 |
if (!(offset == con.limit |
| 3163 |
|| offset+1 == con.limit && isEOLChar( target .setIndex( offset ) ) |
| 3164 |
|| offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN |
| 3165 |
&& target .setIndex( offset+1 ) == LINE_FEED)) |
| 3166 |
return -1; |
| 3167 |
} |
| 3168 |
break; |
| 3169 |
|
| 3170 |
case 'A': |
| 3171 |
if (offset != con.start) return -1; |
| 3172 |
break; |
| 3173 |
|
| 3174 |
case 'Z': |
| 3175 |
if (!(offset == con.limit |
| 3176 |
|| offset+1 == con.limit && isEOLChar( target .setIndex( offset ) ) |
| 3177 |
|| offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN |
| 3178 |
&& target .setIndex( offset+1 ) == LINE_FEED)) |
| 3179 |
return -1; |
| 3180 |
break; |
| 3181 |
|
| 3182 |
case 'z': |
| 3183 |
if (offset != con.limit) return -1; |
| 3184 |
break; |
| 3185 |
|
| 3186 |
case 'b': |
| 3187 |
if (con.length == 0) return -1; |
| 3188 |
{ |
| 3189 |
int after = getWordType(target, con.start, con.limit, offset, opts); |
| 3190 |
if (after == WT_IGNORE) return -1; |
| 3191 |
int before = getPreviousWordType(target, con.start, con.limit, offset, opts); |
| 3192 |
if (after == before) return -1; |
| 3193 |
} |
| 3194 |
break; |
| 3195 |
|
| 3196 |
case 'B': |
| 3197 |
if (con.length == 0) |
| 3198 |
go = true; |
| 3199 |
else { |
| 3200 |
int after = getWordType(target, con.start, con.limit, offset, opts); |
| 3201 |
go = after == WT_IGNORE |
| 3202 |
|| after == getPreviousWordType(target, con.start, con.limit, offset, opts); |
| 3203 |
} |
| 3204 |
if (!go) return -1; |
| 3205 |
break; |
| 3206 |
|
| 3207 |
case '<': |
| 3208 |
if (con.length == 0 || offset == con.limit) return -1; |
| 3209 |
if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER |
| 3210 |
|| getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) |
| 3211 |
return -1; |
| 3212 |
break; |
| 3213 |
|
| 3214 |
case '>': |
| 3215 |
if (con.length == 0 || offset == con.start) return -1; |
| 3216 |
if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER |
| 3217 |
|| getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) |
| 3218 |
return -1; |
| 3219 |
break; |
| 3220 |
} // switch anchor type |
| 3221 |
op = op.next; |
| 3222 |
break; |
| 3223 |
|
| 3224 |
case Op.BACKREFERENCE: |
| 3225 |
{ |
| 3226 |
int refno = op.getData(); |
| 3227 |
if (refno <= 0 || refno >= this.nofparen) |
| 3228 |
throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); |
| 3229 |
if (con.match.getBeginning(refno) < 0 |
| 3230 |
|| con.match.getEnd(refno) < 0) |
| 3231 |
return -1; // ******** |
| 3232 |
int o2 = con.match.getBeginning(refno); |
| 3233 |
int literallen = con.match.getEnd(refno)-o2; |
| 3234 |
if (!isSet(opts, IGNORE_CASE)) { |
| 3235 |
if (dx > 0) { |
| 3236 |
if (!regionMatches(target, offset, con.limit, o2, literallen)) |
| 3237 |
return -1; |
| 3238 |
offset += literallen; |
| 3239 |
} else { |
| 3240 |
if (!regionMatches(target, offset-literallen, con.limit, o2, literallen)) |
| 3241 |
return -1; |
| 3242 |
offset -= literallen; |
| 3243 |
} |
| 3244 |
} else { |
| 3245 |
if (dx > 0) { |
| 3246 |
if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen)) |
| 3247 |
return -1; |
| 3248 |
offset += literallen; |
| 3249 |
} else { |
| 3250 |
if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, |
| 3251 |
o2, literallen)) |
| 3252 |
return -1; |
| 3253 |
offset -= literallen; |
| 3254 |
} |
| 3255 |
} |
| 3256 |
} |
| 3257 |
op = op.next; |
| 3258 |
break; |
| 3259 |
case Op.STRING: |
| 3260 |
{ |
| 3261 |
String literal = op.getString(); |
| 3262 |
int literallen = literal.length(); |
| 3263 |
if (!isSet(opts, IGNORE_CASE)) { |
| 3264 |
if (dx > 0) { |
| 3265 |
if (!regionMatches(target, offset, con.limit, literal, literallen)) |
| 3266 |
return -1; |
| 3267 |
offset += literallen; |
| 3268 |
} else { |
| 3269 |
if (!regionMatches(target, offset-literallen, con.limit, literal, literallen)) |
| 3270 |
return -1; |
| 3271 |
offset -= literallen; |
| 3272 |
} |
| 3273 |
} else { |
| 3274 |
if (dx > 0) { |
| 3275 |
if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen)) |
| 3276 |
return -1; |
| 3277 |
offset += literallen; |
| 3278 |
} else { |
| 3279 |
if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, |
| 3280 |
literal, literallen)) |
| 3281 |
return -1; |
| 3282 |
offset -= literallen; |
| 3283 |
} |
| 3284 |
} |
| 3285 |
} |
| 3286 |
op = op.next; |
| 3287 |
break; |
| 3288 |
|
| 3289 |
case Op.CLOSURE: |
| 3290 |
{ |
| 3291 |
/* |
| 3292 |
* Saves current position to avoid |
| 3293 |
* zero-width repeats. |
| 3294 |
*/ |
| 3295 |
int id = op.getData(); |
| 3296 |
if (id >= 0) { |
| 3297 |
int previousOffset = con.offsets[id]; |
| 3298 |
if (previousOffset < 0 || previousOffset != offset) { |
| 3299 |
con.offsets[id] = offset; |
| 3300 |
} else { |
| 3301 |
con.offsets[id] = -1; |
| 3302 |
op = op.next; |
| 3303 |
break; |
| 3304 |
} |
| 3305 |
} |
| 3306 |
|
| 3307 |
int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); |
| 3308 |
if (id >= 0) con.offsets[id] = -1; |
| 3309 |
if (ret >= 0) return ret; |
| 3310 |
op = op.next; |
| 3311 |
} |
| 3312 |
break; |
| 3313 |
|
| 3314 |
case Op.QUESTION: |
| 3315 |
{ |
| 3316 |
int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); |
| 3317 |
if (ret >= 0) return ret; |
| 3318 |
op = op.next; |
| 3319 |
} |
| 3320 |
break; |
| 3321 |
|
| 3322 |
case Op.NONGREEDYCLOSURE: |
| 3323 |
case Op.NONGREEDYQUESTION: |
| 3324 |
{ |
| 3325 |
int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts); |
| 3326 |
if (ret >= 0) return ret; |
| 3327 |
op = op.getChild(); |
| 3328 |
} |
| 3329 |
break; |
| 3330 |
|
| 3331 |
case Op.UNION: |
| 3332 |
for (int i = 0; i < op.size(); i ++) { |
| 3333 |
int ret = this. matchCharacterIterator (con, op.elementAt(i), offset, dx, opts); |
| 3334 |
if (DEBUG) { |
| 3335 |
System.err.println("UNION: "+i+", ret="+ret); |
| 3336 |
} |
| 3337 |
if (ret >= 0) return ret; |
| 3338 |
} |
| 3339 |
return -1; |
| 3340 |
|
| 3341 |
case Op.CAPTURE: |
| 3342 |
int refno = op.getData(); |
| 3343 |
if (con.match != null && refno > 0) { |
| 3344 |
int save = con.match.getBeginning(refno); |
| 3345 |
con.match.setBeginning(refno, offset); |
| 3346 |
int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts); |
| 3347 |
if (ret < 0) con.match.setBeginning(refno, save); |
| 3348 |
return ret; |
| 3349 |
} else if (con.match != null && refno < 0) { |
| 3350 |
int index = -refno; |
| 3351 |
int save = con.match.getEnd(index); |
| 3352 |
con.match.setEnd(index, offset); |
| 3353 |
int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts); |
| 3354 |
if (ret < 0) con.match.setEnd(index, save); |
| 3355 |
return ret; |
| 3356 |
} |
| 3357 |
op = op.next; |
| 3358 |
break; |
| 3359 |
|
| 3360 |
case Op.LOOKAHEAD: |
| 3361 |
if (0 > this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1; |
| 3362 |
op = op.next; |
| 3363 |
break; |
| 3364 |
case Op.NEGATIVELOOKAHEAD: |
| 3365 |
if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1; |
| 3366 |
op = op.next; |
| 3367 |
break; |
| 3368 |
case Op.LOOKBEHIND: |
| 3369 |
if (0 > this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1; |
| 3370 |
op = op.next; |
| 3371 |
break; |
| 3372 |
case Op.NEGATIVELOOKBEHIND: |
| 3373 |
if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1; |
| 3374 |
op = op.next; |
| 3375 |
break; |
| 3376 |
|
| 3377 |
case Op.INDEPENDENT: |
| 3378 |
{ |
| 3379 |
int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); |
| 3380 |
if (ret < 0) return ret; |
| 3381 |
offset = ret; |
| 3382 |
op = op.next; |
| 3383 |
} |
| 3384 |
break; |
| 3385 |
|
| 3386 |
case Op.MODIFIER: |
| 3387 |
{ |
| 3388 |
int localopts = opts; |
| 3389 |
localopts |= op.getData(); |
| 3390 |
localopts &= ~op.getData2(); |
| 3391 |
//System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16)); |
| 3392 |
int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, localopts); |
| 3393 |
if (ret < 0) return ret; |
| 3394 |
offset = ret; |
| 3395 |
op = op.next; |
| 3396 |
} |
| 3397 |
break; |
| 3398 |
|
| 3399 |
case Op.CONDITION: |
| 3400 |
{ |
| 3401 |
Op.ConditionOp cop = (Op.ConditionOp)op; |
| 3402 |
boolean matchp = false; |
| 3403 |
if (cop.refNumber > 0) { |
| 3404 |
if (cop.refNumber >= this.nofparen) |
| 3405 |
throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); |
| 3406 |
matchp = con.match.getBeginning(cop.refNumber) >= 0 |
| 3407 |
&& con.match.getEnd(cop.refNumber) >= 0; |
| 3408 |
} else { |
| 3409 |
matchp = 0 <= this. matchCharacterIterator (con, cop.condition, offset, dx, opts); |
| 3410 |
} |
| 3411 |
|
| 3412 |
if (matchp) { |
| 3413 |
op = cop.yes; |
| 3414 |
} else if (cop.no != null) { |
| 3415 |
op = cop.no; |
| 3416 |
} else { |
| 3417 |
op = cop.next; |
| 3418 |
} |
| 3419 |
} |
| 3420 |
break; |
| 3421 |
|
| 3422 |
default: |
| 3423 |
throw new RuntimeException("Unknown operation type: "+op.type); |
| 3424 |
} // switch (op.type) |
| 3425 |
} // while |
| 3426 |
} |
| 3427 |
|
| 3428 |
private static final int getPreviousWordType(CharacterIterator target, int begin, int end, |
| 3429 |
int offset, int opts) { |
| 3430 |
int ret = getWordType(target, begin, end, --offset, opts); |
| 3431 |
while (ret == WT_IGNORE) |
| 3432 |
ret = getWordType(target, begin, end, --offset, opts); |
| 3433 |
return ret; |
| 3434 |
} |
| 3435 |
|
| 3436 |
private static final int getWordType(CharacterIterator target, int begin, int end, |
| 3437 |
int offset, int opts) { |
| 3438 |
if (offset < begin || offset >= end) return WT_OTHER; |
| 3439 |
return getWordType0( target .setIndex( offset ) , opts); |
| 3440 |
} |
| 3441 |
|
| 3442 |
|
| 3443 |
|
| 3444 |
private static final boolean regionMatches(CharacterIterator target, int offset, int limit, |
| 3445 |
String part, int partlen) { |
| 3446 |
if (offset < 0) return false; |
| 3447 |
if (limit-offset < partlen) |
| 3448 |
return false; |
| 3449 |
int i = 0; |
| 3450 |
while (partlen-- > 0) { |
| 3451 |
if ( target .setIndex( offset++ ) != part.charAt(i++)) |
| 3452 |
return false; |
| 3453 |
} |
| 3454 |
return true; |
| 3455 |
} |
| 3456 |
|
| 3457 |
private static final boolean regionMatches(CharacterIterator target, int offset, int limit, |
| 3458 |
int offset2, int partlen) { |
| 3459 |
if (offset < 0) return false; |
| 3460 |
if (limit-offset < partlen) |
| 3461 |
return false; |
| 3462 |
int i = offset2; |
| 3463 |
while (partlen-- > 0) { |
| 3464 |
if ( target .setIndex( offset++ ) != target .setIndex( i++ ) ) |
| 3465 |
return false; |
| 3466 |
} |
| 3467 |
return true; |
| 3468 |
} |
| 3469 |
|
| 3470 |
/** |
| 3471 |
* @see java.lang.String#regionMatches |
| 3472 |
*/ |
| 3473 |
private static final boolean regionMatchesIgnoreCase(CharacterIterator target, int offset, int limit, |
| 3474 |
String part, int partlen) { |
| 3475 |
if (offset < 0) return false; |
| 3476 |
if (limit-offset < partlen) |
| 3477 |
return false; |
| 3478 |
int i = 0; |
| 3479 |
while (partlen-- > 0) { |
| 3480 |
char ch1 = target .setIndex( offset++ ) ; |
| 3481 |
char ch2 = part.charAt(i++); |
| 3482 |
if (ch1 == ch2) |
| 3483 |
continue; |
| 3484 |
char uch1 = Character.toUpperCase(ch1); |
| 3485 |
char uch2 = Character.toUpperCase(ch2); |
| 3486 |
if (uch1 == uch2) |
| 3487 |
continue; |
| 3488 |
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) |
| 3489 |
return false; |
| 3490 |
} |
| 3491 |
return true; |
| 3492 |
} |
| 3493 |
|
| 3494 |
private static final boolean regionMatchesIgnoreCase(CharacterIterator target, int offset, int limit, |
| 3495 |
int offset2, int partlen) { |
| 3496 |
if (offset < 0) return false; |
| 3497 |
if (limit-offset < partlen) |
| 3498 |
return false; |
| 3499 |
int i = offset2; |
| 3500 |
while (partlen-- > 0) { |
| 3501 |
char ch1 = target .setIndex( offset++ ) ; |
| 3502 |
char ch2 = target .setIndex( i++ ) ; |
| 3503 |
if (ch1 == ch2) |
| 3504 |
continue; |
| 3505 |
char uch1 = Character.toUpperCase(ch1); |
| 3506 |
char uch2 = Character.toUpperCase(ch2); |
| 3507 |
if (uch1 == uch2) |
| 3508 |
continue; |
| 3509 |
if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) |
| 3510 |
return false; |
| 3511 |
} |
| 3512 |
return true; |
| 3513 |
} |
| 3514 |
|
| 3515 |
|
| 3516 |
|
| 3517 |
|
| 3518 |
// ================================================================ |
| 3519 |
|
| 3520 |
/** |
| 3521 |
* A regular expression. |
| 3522 |
* @serial |
| 3523 |
*/ |
| 3524 |
String regex; |
| 3525 |
/** |
| 3526 |
* @serial |
| 3527 |
*/ |
| 3528 |
int options; |
| 3529 |
|
| 3530 |
/** |
| 3531 |
* The number of parenthesis in the regular expression. |
| 3532 |
* @serial |
| 3533 |
*/ |
| 3534 |
int nofparen; |
| 3535 |
/** |
| 3536 |
* Internal representation of the regular expression. |
| 3537 |
* @serial |
| 3538 |
*/ |
| 3539 |
Token tokentree; |
| 3540 |
|
| 3541 |
boolean hasBackReferences = false; |
| 3542 |
|
| 3543 |
transient int minlength; |
| 3544 |
transient Op operations = null; |
| 3545 |
transient int numberOfClosures; |
| 3546 |
transient Context context = null; |
| 3547 |
transient RangeToken firstChar = null; |
| 3548 |
|
| 3549 |
transient String fixedString = null; |
| 3550 |
transient int fixedStringOptions; |
| 3551 |
transient BMPattern fixedStringTable = null; |
| 3552 |
transient boolean fixedStringOnly = false; |
| 3553 |
|
| 3554 |
|
| 3555 |
static final class Context { |
| 3556 |
CharacterIterator ciTarget; |
| 3557 |
String strTarget; |
| 3558 |
char[] charTarget; |
| 3559 |
int start; |
| 3560 |
int limit; |
| 3561 |
int length; |
| 3562 |
Match match; |
| 3563 |
boolean inuse = false; |
| 3564 |
int[] offsets; |
| 3565 |
|
| 3566 |
Context() { |
| 3567 |
} |
| 3568 |
|
| 3569 |
private void resetCommon(int nofclosures) { |
| 3570 |
this.length = this.limit-this.start; |
| 3571 |
this.inuse = true; |
| 3572 |
this.match = null; |
| 3573 |
if (this.offsets == null || this.offsets.length != nofclosures) |
| 3574 |
this.offsets = new int[nofclosures]; |
| 3575 |
for (int i = 0; i < nofclosures; i ++) this.offsets[i] = -1; |
| 3576 |
} |
| 3577 |
void reset(CharacterIterator target, int start, int limit, int nofclosures) { |
| 3578 |
this.ciTarget = target; |
| 3579 |
this.start = start; |
| 3580 |
this.limit = limit; |
| 3581 |
this.resetCommon(nofclosures); |
| 3582 |
} |
| 3583 |
void reset(String target, int start, int limit, int nofclosures) { |
| 3584 |
this.strTarget = target; |
| 3585 |
this.start = start; |
| 3586 |
this.limit = limit; |
| 3587 |
this.resetCommon(nofclosures); |
| 3588 |
} |
| 3589 |
void reset(char[] target, int start, int limit, int nofclosures) { |
| 3590 |
this.charTarget = target; |
| 3591 |
this.start = start; |
| 3592 |
this.limit = limit; |
| 3593 |
this.resetCommon(nofclosures); |
| 3594 |
} |
| 3595 |
} |
| 3596 |
|
| 3597 |
/** |
| 3598 |
* Prepares for matching. This method is called just before starting matching. |
| 3599 |
*/ |
| 3600 |
void prepare() { |
| 3601 |
if (Op.COUNT) Op.nofinstances = 0; |
| 3602 |
this.compile(this.tokentree); |
| 3603 |
/* |
| 3604 |
if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .* |
| 3605 |
Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@'); |
| 3606 |
anchor.next = this.operations; |
| 3607 |
this.operations = anchor; |
| 3608 |
} |
| 3609 |
*/ |
| 3610 |
if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances); |
| 3611 |
|
| 3612 |
this.minlength = this.tokentree.getMinLength(); |
| 3613 |
|
| 3614 |
this.firstChar = null; |
| 3615 |
if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) |
| 3616 |
&& !isSet(this.options, XMLSCHEMA_MODE)) { |
| 3617 |
RangeToken firstChar = Token.createRange(); |
| 3618 |
int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options); |
| 3619 |
if (fresult == Token.FC_TERMINAL) { |
| 3620 |
firstChar.compactRanges(); |
| 3621 |
this.firstChar = firstChar; |
| 3622 |
if (DEBUG) |
| 3623 |
System.err.println("DEBUG: Use the first character optimization: "+firstChar); |
| 3624 |
} |
| 3625 |
} |
| 3626 |
|
| 3627 |
if (this.operations != null |
| 3628 |
&& (this.operations.type == Op.STRING || this.operations.type == Op.CHAR) |
| 3629 |
&& this.operations.next == null) { |
| 3630 |
if (DEBUG) |
| 3631 |
System.err.print(" *** Only fixed string! *** "); |
| 3632 |
this.fixedStringOnly = true; |
| 3633 |
if (this.operations.type == Op.STRING) |
| 3634 |
this.fixedString = this.operations.getString(); |
| 3635 |
else if (this.operations.getData() >= 0x10000) { // Op.CHAR |
| 3636 |
this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData()); |
| 3637 |
} else { |
| 3638 |
char[] ac = new char[1]; |
| 3639 |
ac[0] = (char)this.operations.getData(); |
| 3640 |
this.fixedString = new String(ac); |
| 3641 |
} |
| 3642 |
this.fixedStringOptions = this.options; |
| 3643 |
this.fixedStringTable = new BMPattern(this.fixedString, 256, |
| 3644 |
isSet(this.fixedStringOptions, IGNORE_CASE)); |
| 3645 |
} else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION) |
| 3646 |
&& !isSet(this.options, XMLSCHEMA_MODE)) { |
| 3647 |
Token.FixedStringContainer container = new Token.FixedStringContainer(); |
| 3648 |
this.tokentree.findFixedString(container, this.options); |
| 3649 |
this.fixedString = container.token == null ? null : container.token.getString(); |
| 3650 |
this.fixedStringOptions = container.options; |
| 3651 |
if (this.fixedString != null && this.fixedString.length() < 2) |
| 3652 |
this.fixedString = null; |
| 3653 |
// This pattern has a fixed string of which length is more than one. |
| 3654 |
if (this.fixedString != null) { |
| 3655 |
this.fixedStringTable = new BMPattern(this.fixedString, 256, |
| 3656 |
isSet(this.fixedStringOptions, IGNORE_CASE)); |
| 3657 |
if (DEBUG) { |
| 3658 |
System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length() |
| 3659 |
+"/" //+this.fixedString |
| 3660 |
+"/"+REUtil.createOptionString(this.fixedStringOptions)); |
| 3661 |
System.err.print("String: "); |
| 3662 |
REUtil.dumpString(this.fixedString); |
| 3663 |
} |
| 3664 |
} |
| 3665 |
} |
| 3666 |
} |
| 3667 |
|
| 3668 |
/** |
| 3669 |
* An option. |
| 3670 |
* If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span> |
| 3671 |
* captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span> |
| 3672 |
* does not capture. |
| 3673 |
* |
| 3674 |
* @see #RegularExpression(java.lang.String,int) |
| 3675 |
* @see #setPattern(java.lang.String,int) |
| 3676 |
static final int MARK_PARENS = 1<<0; |
| 3677 |
*/ |
| 3678 |
|
| 3679 |
/** |
| 3680 |
* "i" |
| 3681 |
*/ |
| 3682 |
static final int IGNORE_CASE = 1<<1; |
| 3683 |
|
| 3684 |
/** |
| 3685 |
* "s" |
| 3686 |
*/ |
| 3687 |
static final int SINGLE_LINE = 1<<2; |
| 3688 |
|
| 3689 |
/** |
| 3690 |
* "m" |
| 3691 |
*/ |
| 3692 |
static final int MULTIPLE_LINES = 1<<3; |
| 3693 |
|
| 3694 |
/** |
| 3695 |
* "x" |
| 3696 |
*/ |
| 3697 |
static final int EXTENDED_COMMENT = 1<<4; |
| 3698 |
|
| 3699 |
/** |
| 3700 |
* This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>. |
| 3701 |
* |
| 3702 |
* @see #RegularExpression(java.lang.String,int) |
| 3703 |
* @see #setPattern(java.lang.String,int) |
| 3704 |
* @see #UNICODE_WORD_BOUNDARY |
| 3705 |
*/ |
| 3706 |
static final int USE_UNICODE_CATEGORY = 1<<5; // "u" |
| 3707 |
|
| 3708 |
/** |
| 3709 |
* An option. |
| 3710 |
* This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \< \></kbd></span>. |
| 3711 |
* <p>By default, the engine considers a position between a word character |
| 3712 |
* (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character |
| 3713 |
* is a word boundary. |
| 3714 |
* <p>By this option, the engine checks word boundaries with the method of |
| 3715 |
* 'Unicode Regular Expression Guidelines' Revision 4. |
| 3716 |
* |
| 3717 |
* @see #RegularExpression(java.lang.String,int) |
| 3718 |
* @see #setPattern(java.lang.String,int) |
| 3719 |
*/ |
| 3720 |
static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w" |
| 3721 |
|
| 3722 |
/** |
| 3723 |
* "H" |
| 3724 |
*/ |
| 3725 |
static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7; |
| 3726 |
/** |
| 3727 |
* "F" |
| 3728 |
*/ |
| 3729 |
static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8; |
| 3730 |
/** |
| 3731 |
* "X". XML Schema mode. |
| 3732 |
*/ |
| 3733 |
static final int XMLSCHEMA_MODE = 1<<9; |
| 3734 |
/** |
| 3735 |
* ",". |
| 3736 |
*/ |
| 3737 |
static final int SPECIAL_COMMA = 1<<10; |
| 3738 |
|
| 3739 |
|
| 3740 |
private static final boolean isSet(int options, int flag) { |
| 3741 |
return (options & flag) == flag; |
| 3742 |
} |
| 3743 |
|
| 3744 |
/**
 * Creates a new RegularExpression instance without options.
 *
 * @param regex A regular expression
 * @exception ParseException <VAR>regex</VAR> does not conform to the syntax.
 */
public RegularExpression(String regex) throws ParseException {
    this.setPattern(regex, (String) null);
}

/**
 * Creates a new RegularExpression instance with options.
 *
 * @param regex A regular expression
 * @param options A String consisting of "i" "m" "s" "u" "w" "," "X"
 * @exception ParseException <VAR>regex</VAR> does not conform to the syntax.
 */
public RegularExpression(String regex, String options) throws ParseException {
    this.setPattern(regex, options);
}

/**
 * Creates an instance from an already parsed pattern; no parsing is done here.
 *
 * @param regex             the pattern source text
 * @param tok               the parsed token tree for {@code regex}
 * @param parens            the number of parenthesis in the pattern
 * @param hasBackReferences whether the pattern contains back references
 * @param options           option flags
 */
RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
    this.regex = regex;
    this.options = options;
    this.tokentree = tok;
    this.nofparen = parens;
    this.hasBackReferences = hasBackReferences;
}
| 3772 |
|
| 3773 |
/** |
| 3774 |
* |
| 3775 |
*/ |
| 3776 |
public void setPattern(String newPattern) throws ParseException { |
| 3777 |
this.setPattern(newPattern, this.options); |
| 3778 |
} |
| 3779 |
|
| 3780 |
private void setPattern(String newPattern, int options) throws ParseException { |
| 3781 |
this.regex = newPattern; |
| 3782 |
this.options = options; |
| 3783 |
RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE) |
| 3784 |
? new ParserForXMLSchema() : new RegexParser(); |
| 3785 |
this.tokentree = rp.parse(this.regex, this.options); |
| 3786 |
this.nofparen = rp.parennumber; |
| 3787 |
this.hasBackReferences = rp.hasBackReferences; |
| 3788 |
|
| 3789 |
this.operations = null; |
| 3790 |
this.context = null; |
| 3791 |
} |
| 3792 |
/** |
| 3793 |
* |
| 3794 |
*/ |
| 3795 |
public void setPattern(String newPattern, String options) throws ParseException { |
| 3796 |
this.setPattern(newPattern, REUtil.parseOptions(options)); |
| 3797 |
} |
| 3798 |
|
| 3799 |
/** |
| 3800 |
* |
| 3801 |
*/ |
| 3802 |
public String getPattern() { |
| 3803 |
return this.regex; |
| 3804 |
} |
| 3805 |
|
| 3806 |
/** |
| 3807 |
* Represents this instence in String. |
| 3808 |
*/ |
| 3809 |
public String toString() { |
| 3810 |
return this.tokentree.toString(this.options); |
| 3811 |
} |
| 3812 |
|
| 3813 |
/** |
| 3814 |
* Returns a option string. |
| 3815 |
* The order of letters in it may be different from a string specified |
| 3816 |
* in a constructor or <code>setPattern()</code>. |
| 3817 |
* |
| 3818 |
* @see #RegularExpression(String, String) |
| 3819 |
* @see #setPattern(java.lang.String,java.lang.String) |
| 3820 |
*/ |
| 3821 |
public String getOptions() { |
| 3822 |
return REUtil.createOptionString(this.options); |
| 3823 |
} |
| 3824 |
|
| 3825 |
/** |
| 3826 |
* Return true if patterns are the same and the options are equivalent. |
| 3827 |
*/ |
| 3828 |
public boolean equals(Object obj) { |
| 3829 |
if (obj == null) return false; |
| 3830 |
if (!(obj instanceof RegularExpression)) |
| 3831 |
return false; |
| 3832 |
RegularExpression r = (RegularExpression)obj; |
| 3833 |
return this.regex.equals(r.regex) && this.options == r.options; |
| 3834 |
} |
| 3835 |
|
| 3836 |
boolean equals(String pattern, int options) { |
| 3837 |
return this.regex.equals(pattern) && this.options == options; |
| 3838 |
} |
| 3839 |
|
| 3840 |
/** |
| 3841 |
* |
| 3842 |
*/ |
| 3843 |
public int hashCode() { |
| 3844 |
return (this.regex+"/"+this.getOptions()).hashCode(); |
| 3845 |
} |
| 3846 |
|
| 3847 |
/** |
| 3848 |
* Return the number of regular expression groups. |
| 3849 |
* This method returns 1 when the regular expression has no capturing-parenthesis. |
| 3850 |
* |
| 3851 |
*/ |
| 3852 |
public int getNumberOfGroups() { |
| 3853 |
return this.nofparen; |
| 3854 |
} |
| 3855 |
|
| 3856 |
// ================================================================ |
| 3857 |
|
| 3858 |
private static final int WT_IGNORE = 0; |
| 3859 |
private static final int WT_LETTER = 1; |
| 3860 |
private static final int WT_OTHER = 2; |
| 3861 |
private static final int getWordType0(char ch, int opts) { |
| 3862 |
if (!isSet(opts, UNICODE_WORD_BOUNDARY)) { |
| 3863 |
if (isSet(opts, USE_UNICODE_CATEGORY)) { |
| 3864 |
return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER; |
| 3865 |
} |
| 3866 |
return isWordChar(ch) ? WT_LETTER : WT_OTHER; |
| 3867 |
} |
| 3868 |
|
| 3869 |
switch (Character.getType(ch)) { |
| 3870 |
case Character.UPPERCASE_LETTER: // L |
| 3871 |
case Character.LOWERCASE_LETTER: // L |
| 3872 |
case Character.TITLECASE_LETTER: // L |
| 3873 |
case Character.MODIFIER_LETTER: // L |
| 3874 |
case Character.OTHER_LETTER: // L |
| 3875 |
case Character.LETTER_NUMBER: // N |
| 3876 |
case Character.DECIMAL_DIGIT_NUMBER: // N |
| 3877 |
case Character.OTHER_NUMBER: // N |
| 3878 |
case Character.COMBINING_SPACING_MARK: // Mc |
| 3879 |
return WT_LETTER; |
| 3880 |
|
| 3881 |
case Character.FORMAT: // Cf |
| 3882 |
case Character.NON_SPACING_MARK: // Mn |
| 3883 |
case Character.ENCLOSING_MARK: // Mc |
| 3884 |
return WT_IGNORE; |
| 3885 |
|
| 3886 |
case Character.CONTROL: // Cc |
| 3887 |
switch (ch) { |
| 3888 |
case '\t': |
| 3889 |
case '\n': |
| 3890 |
case '\u000B': |
| 3891 |
case '\f': |
| 3892 |
case '\r': |
| 3893 |
return WT_OTHER; |
| 3894 |
default: |
| 3895 |
return WT_IGNORE; |
| 3896 |
} |
| 3897 |
|
| 3898 |
default: |
| 3899 |
return WT_OTHER; |
| 3900 |
} |
| 3901 |
} |
| 3902 |
|
| 3903 |
// ================================================================ |
| 3904 |
|
| 3905 |
static final int LINE_FEED = 0x000A; |
| 3906 |
static final int CARRIAGE_RETURN = 0x000D; |
| 3907 |
static final int LINE_SEPARATOR = 0x2028; |
| 3908 |
static final int PARAGRAPH_SEPARATOR = 0x2029; |
| 3909 |
|
| 3910 |
private static final boolean isEOLChar(int ch) { |
| 3911 |
return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR |
| 3912 |
|| ch == PARAGRAPH_SEPARATOR; |
| 3913 |
} |
| 3914 |
|
| 3915 |
private static final boolean isWordChar(int ch) { // Legacy word characters |
| 3916 |
if (ch == '_') return true; |
| 3917 |
if (ch < '0') return false; |
| 3918 |
if (ch > 'z') return false; |
| 3919 |
if (ch <= '9') return true; |
| 3920 |
if (ch < 'A') return false; |
| 3921 |
if (ch <= 'Z') return true; |
| 3922 |
if (ch < 'a') return false; |
| 3923 |
return true; |
| 3924 |
} |
| 3925 |
|
| 3926 |
private static final boolean matchIgnoreCase(int chardata, int ch) { |
| 3927 |
if (chardata == ch) return true; |
| 3928 |
if (chardata > 0xffff || ch > 0xffff) return false; |
| 3929 |
char uch1 = Character.toUpperCase((char)chardata); |
| 3930 |
char uch2 = Character.toUpperCase((char)ch); |
| 3931 |
if (uch1 == uch2) return true; |
| 3932 |
return Character.toLowerCase(uch1) == Character.toLowerCase(uch2); |
| 3933 |
} |
| 3934 |
} |
| 3935 |
|
| 3936 |
public static class ParseException extends RuntimeException { |
| 3937 |
int location; |
| 3938 |
|
| 3939 |
/* |
| 3940 |
public ParseException(String mes) { |
| 3941 |
this(mes, -1); |
| 3942 |
} |
| 3943 |
*/ |
| 3944 |
/** |
| 3945 |
* |
| 3946 |
*/ |
| 3947 |
public ParseException(String mes, int location) { |
| 3948 |
super(mes); |
| 3949 |
this.location = location; |
| 3950 |
} |
| 3951 |
|
| 3952 |
/** |
| 3953 |
* |
| 3954 |
* @return -1 if location information is not available. |
| 3955 |
*/ |
| 3956 |
public int getLocation() { |
| 3957 |
return this.location; |
| 3958 |
} |
| 3959 |
} |
| 3960 |
|
| 3961 |
static class Op { |
| 3962 |
static final int DOT = 0; |
| 3963 |
static final int CHAR = 1; // Single character |
| 3964 |
static final int RANGE = 3; // [a-zA-Z] |
| 3965 |
static final int NRANGE = 4; // [^a-zA-Z] |
| 3966 |
static final int ANCHOR = 5; // ^ $ ... |
| 3967 |
static final int STRING = 6; // literal String |
| 3968 |
static final int CLOSURE = 7; // X* |
| 3969 |
static final int NONGREEDYCLOSURE = 8; // X*? |
| 3970 |
static final int QUESTION = 9; // X? |
| 3971 |
static final int NONGREEDYQUESTION = 10; // X?? |
| 3972 |
static final int UNION = 11; // X|Y |
| 3973 |
static final int CAPTURE = 15; // ( and ) |
| 3974 |
static final int BACKREFERENCE = 16; // \1 \2 ... |
| 3975 |
static final int LOOKAHEAD = 20; // (?=...) |
| 3976 |
static final int NEGATIVELOOKAHEAD = 21; // (?!...) |
| 3977 |
static final int LOOKBEHIND = 22; // (?<=...) |
| 3978 |
static final int NEGATIVELOOKBEHIND = 23; // (?<!...) |
| 3979 |
static final int INDEPENDENT = 24; // (?>...) |
| 3980 |
static final int MODIFIER = 25; // (?ims-ims:...) |
| 3981 |
static final int CONDITION = 26; // (?(..)yes|no) |
| 3982 |
|
| 3983 |
static int nofinstances = 0; |
| 3984 |
static final boolean COUNT = false; |
| 3985 |
|
| 3986 |
static Op createDot() { |
| 3987 |
if (Op.COUNT) Op.nofinstances ++; |
| 3988 |
return new Op(Op.DOT); |
| 3989 |
} |
| 3990 |
static CharOp createChar(int data) { |
| 3991 |
if (Op.COUNT) Op.nofinstances ++; |
| 3992 |
return new CharOp(Op.CHAR, data); |
| 3993 |
} |
| 3994 |
static CharOp createAnchor(int data) { |
| 3995 |
if (Op.COUNT) Op.nofinstances ++; |
| 3996 |
return new CharOp(Op.ANCHOR, data); |
| 3997 |
} |
| 3998 |
static CharOp createCapture(int number, Op next) { |
| 3999 |
if (Op.COUNT) Op.nofinstances ++; |
| 4000 |
CharOp op = new CharOp(Op.CAPTURE, number); |
| 4001 |
op.next = next; |
| 4002 |
return op; |
| 4003 |
} |
| 4004 |
static UnionOp createUnion(int size) { |
| 4005 |
if (Op.COUNT) Op.nofinstances ++; |
| 4006 |
//System.err.println("Creates UnionOp"); |
| 4007 |
return new UnionOp(Op.UNION, size); |
| 4008 |
} |
| 4009 |
static ChildOp createClosure(int id) { |
| 4010 |
if (Op.COUNT) Op.nofinstances ++; |
| 4011 |
return new ModifierOp(Op.CLOSURE, id, -1); |
| 4012 |
} |
| 4013 |
static ChildOp createNonGreedyClosure() { |
| 4014 |
if (Op.COUNT) Op.nofinstances ++; |
| 4015 |
return new ChildOp(Op.NONGREEDYCLOSURE); |
| 4016 |
} |
| 4017 |
static ChildOp createQuestion(boolean nongreedy) { |
| 4018 |
if (Op.COUNT) Op.nofinstances ++; |
| 4019 |
return new ChildOp(nongreedy ? Op.NONGREEDYQUESTION : Op.QUESTION); |
| 4020 |
} |
| 4021 |
static RangeOp createRange(Token tok) { |
| 4022 |
if (Op.COUNT) Op.nofinstances ++; |
| 4023 |
return new RangeOp(Op.RANGE, tok); |
| 4024 |
} |
| 4025 |
static ChildOp createLook(int type, Op next, Op branch) { |
| 4026 |
if (Op.COUNT) Op.nofinstances ++; |
| 4027 |
ChildOp op = new ChildOp(type); |
| 4028 |
op.setChild(branch); |
| 4029 |
op.next = next; |
| 4030 |
return op; |
| 4031 |
} |
| 4032 |
static CharOp createBackReference(int refno) { |
| 4033 |
if (Op.COUNT) Op.nofinstances ++; |
| 4034 |
return new CharOp(Op.BACKREFERENCE, refno); |
| 4035 |
} |
| 4036 |
static StringOp createString(String literal) { |
| 4037 |
if (Op.COUNT) Op.nofinstances ++; |
| 4038 |
return new StringOp(Op.STRING, literal); |
| 4039 |
} |
| 4040 |
static ChildOp createIndependent(Op next, Op branch) { |
| 4041 |
if (Op.COUNT) Op.nofinstances ++; |
| 4042 |
ChildOp op = new ChildOp(Op.INDEPENDENT); |
| 4043 |
op.setChild(branch); |
| 4044 |
op.next = next; |
| 4045 |
return op; |
| 4046 |
} |
| 4047 |
static ModifierOp createModifier(Op next, Op branch, int add, int mask) { |
| 4048 |
if (Op.COUNT) Op.nofinstances ++; |
| 4049 |
ModifierOp op = new ModifierOp(Op.MODIFIER, add, mask); |
| 4050 |
op.setChild(branch); |
| 4051 |
op.next = next; |
| 4052 |
return op; |
| 4053 |
} |
| 4054 |
static ConditionOp createCondition(Op next, int ref, Op conditionflow, Op yesflow, Op noflow) { |
| 4055 |
if (Op.COUNT) Op.nofinstances ++; |
| 4056 |
ConditionOp op = new ConditionOp(Op.CONDITION, ref, conditionflow, yesflow, noflow); |
| 4057 |
op.next = next; |
| 4058 |
return op; |
| 4059 |
} |
| 4060 |
|
| 4061 |
int type; |
| 4062 |
Op next = null; |
| 4063 |
|
| 4064 |
protected Op(int type) { |
| 4065 |
this.type = type; |
| 4066 |
} |
| 4067 |
|
| 4068 |
int size() { // for UNION |
| 4069 |
return 0; |
| 4070 |
} |
| 4071 |
Op elementAt(int index) { // for UNIoN |
| 4072 |
throw new RuntimeException("Internal Error: type="+this.type); |
| 4073 |
} |
| 4074 |
Op getChild() { // for CLOSURE, QUESTION |
| 4075 |
throw new RuntimeException("Internal Error: type="+this.type); |
| 4076 |
} |
| 4077 |
// ModifierOp |
| 4078 |
int getData() { // CharOp for CHAR, BACKREFERENCE, CAPTURE, ANCHOR, |
| 4079 |
throw new RuntimeException("Internal Error: type="+this.type); |
| 4080 |
} |
| 4081 |
int getData2() { // ModifierOp |
| 4082 |
throw new RuntimeException("Internal Error: type="+this.type); |
| 4083 |
} |
| 4084 |
RangeToken getToken() { // RANGE, NRANGE |
| 4085 |
throw new RuntimeException("Internal Error: type="+this.type); |
| 4086 |
} |
| 4087 |
String getString() { // STRING |
| 4088 |
throw new RuntimeException("Internal Error: type="+this.type); |
| 4089 |
} |
| 4090 |
|
| 4091 |
// ================================================================ |
| 4092 |
static class CharOp extends Op { |
| 4093 |
int charData; |
| 4094 |
CharOp(int type, int data) { |
| 4095 |
super(type); |
| 4096 |
this.charData = data; |
| 4097 |
} |
| 4098 |
int getData() { |
| 4099 |
return this.charData; |
| 4100 |
} |
| 4101 |
} |
| 4102 |
|
| 4103 |
// ================================================================ |
| 4104 |
static class UnionOp extends Op { |
| 4105 |
Vector branches; |
| 4106 |
UnionOp(int type, int size) { |
| 4107 |
super(type); |
| 4108 |
this.branches = new Vector(size); |
| 4109 |
} |
| 4110 |
void addElement(Op op) { |
| 4111 |
this.branches.addElement(op); |
| 4112 |
} |
| 4113 |
int size() { |
| 4114 |
return this.branches.size(); |
| 4115 |
} |
| 4116 |
Op elementAt(int index) { |
| 4117 |
return (Op)this.branches.elementAt(index); |
| 4118 |
} |
| 4119 |
} |
| 4120 |
|
| 4121 |
// ================================================================ |
| 4122 |
static class ChildOp extends Op { |
| 4123 |
Op child; |
| 4124 |
ChildOp(int type) { |
| 4125 |
super(type); |
| 4126 |
} |
| 4127 |
void setChild(Op child) { |
| 4128 |
this.child = child; |
| 4129 |
} |
| 4130 |
Op getChild() { |
| 4131 |
return this.child; |
| 4132 |
} |
| 4133 |
} |
| 4134 |
// ================================================================ |
| 4135 |
static class ModifierOp extends ChildOp { |
| 4136 |
int v1; |
| 4137 |
int v2; |
| 4138 |
ModifierOp(int type, int v1, int v2) { |
| 4139 |
super(type); |
| 4140 |
this.v1 = v1; |
| 4141 |
this.v2 = v2; |
| 4142 |
} |
| 4143 |
int getData() { |
| 4144 |
return this.v1; |
| 4145 |
} |
| 4146 |
int getData2() { |
| 4147 |
return this.v2; |
| 4148 |
} |
| 4149 |
} |
| 4150 |
// ================================================================ |
| 4151 |
static class RangeOp extends Op { |
| 4152 |
Token tok; |
| 4153 |
RangeOp(int type, Token tok) { |
| 4154 |
super(type); |
| 4155 |
this.tok = tok; |
| 4156 |
} |
| 4157 |
RangeToken getToken() { |
| 4158 |
return (RangeToken)this.tok; |
| 4159 |
} |
| 4160 |
} |
| 4161 |
// ================================================================ |
| 4162 |
static class StringOp extends Op { |
| 4163 |
String string; |
| 4164 |
StringOp(int type, String literal) { |
| 4165 |
super(type); |
| 4166 |
this.string = literal; |
| 4167 |
} |
| 4168 |
String getString() { |
| 4169 |
return this.string; |
| 4170 |
} |
| 4171 |
} |
| 4172 |
// ================================================================ |
| 4173 |
static class ConditionOp extends Op { |
| 4174 |
int refNumber; |
| 4175 |
Op condition; |
| 4176 |
Op yes; |
| 4177 |
Op no; |
| 4178 |
ConditionOp(int type, int refno, Op conditionflow, Op yesflow, Op noflow) { |
| 4179 |
super(type); |
| 4180 |
this.refNumber = refno; |
| 4181 |
this.condition = conditionflow; |
| 4182 |
this.yes = yesflow; |
| 4183 |
this.no = noflow; |
| 4184 |
} |
| 4185 |
} |
| 4186 |
} |
| 4187 |
|
| 4188 |
final static class RangeToken extends Token implements java.io.Serializable { |
| 4189 |
|
| 4190 |
int[] ranges; |
| 4191 |
boolean sorted; |
| 4192 |
boolean compacted; |
| 4193 |
RangeToken icaseCache = null; |
| 4194 |
int[] map = null; |
| 4195 |
int nonMapIndex; |
| 4196 |
|
| 4197 |
/**
 * Creates a token with no ranges yet; it starts out flagged as neither
 * sorted nor compacted.
 *
 * @param type token kind, passed to the superclass
 */
RangeToken(int type) {
    super(type);
    this.sorted = false;
    this.compacted = false;
}
| 4201 |
|
| 4202 |
// for RANGE or NRANGE |
| 4203 |
protected void addRange(int start, int end) { |
| 4204 |
this.icaseCache = null; |
| 4205 |
//System.err.println("Token#addRange(): "+start+" "+end); |
| 4206 |
int r1, r2; |
| 4207 |
if (start <= end) { |
| 4208 |
r1 = start; |
| 4209 |
r2 = end; |
| 4210 |
} else { |
| 4211 |
r1 = end; |
| 4212 |
r2 = start; |
| 4213 |
} |
| 4214 |
|
| 4215 |
int pos = 0; |
| 4216 |
if (this.ranges == null) { |
| 4217 |
this.ranges = new int[2]; |
| 4218 |
this.ranges[0] = r1; |
| 4219 |
this.ranges[1] = r2; |
| 4220 |
this.setSorted(true); |
| 4221 |
} else { |
| 4222 |
pos = this.ranges.length; |
| 4223 |
if (this.ranges[pos-1]+1 == r1) { |
| 4224 |
this.ranges[pos-1] = r2; |
| 4225 |
return; |
| 4226 |
} |
| 4227 |
int[] temp = new int[pos+2]; |
| 4228 |
System.arraycopy(this.ranges, 0, temp, 0, pos); |
| 4229 |
this.ranges = temp; |
| 4230 |
if (this.ranges[pos-1] >= r1) |
| 4231 |
this.setSorted(false); |
| 4232 |
this.ranges[pos++] = r1; |
| 4233 |
this.ranges[pos] = r2; |
| 4234 |
if (!this.sorted) |
| 4235 |
this.sortRanges(); |
| 4236 |
} |
| 4237 |
} |
| 4238 |
|
| 4239 |
private final boolean isSorted() { |
| 4240 |
return this.sorted; |
| 4241 |
} |
| 4242 |
private final void setSorted(boolean sort) { |
| 4243 |
this.sorted = sort; |
| 4244 |
if (!sort) this.compacted = false; |
| 4245 |
} |
| 4246 |
private final boolean isCompacted() { |
| 4247 |
return this.compacted; |
| 4248 |
} |
| 4249 |
private final void setCompacted() { |
| 4250 |
this.compacted = true; |
| 4251 |
} |
| 4252 |
|
| 4253 |
protected void sortRanges() { |
| 4254 |
if (this.isSorted()) |
| 4255 |
return; |
| 4256 |
if (this.ranges == null) |
| 4257 |
return; |
| 4258 |
//System.err.println("Do sorting: "+this.ranges.length); |
| 4259 |
|
| 4260 |
// Bubble sort |
| 4261 |
// Why? -- In many cases, |
| 4262 |
// this.ranges has few elements. |
| 4263 |
for (int i = this.ranges.length-4; i >= 0; i -= 2) { |
| 4264 |
for (int j = 0; j <= i; j += 2) { |
| 4265 |
if (this.ranges[j] > this.ranges[j+2] |
| 4266 |
|| this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) { |
| 4267 |
int tmp; |
| 4268 |
tmp = this.ranges[j+2]; |
| 4269 |
this.ranges[j+2] = this.ranges[j]; |
| 4270 |
this.ranges[j] = tmp; |
| 4271 |
tmp = this.ranges[j+3]; |
| 4272 |
this.ranges[j+3] = this.ranges[j+1]; |
| 4273 |
this.ranges[j+1] = tmp; |
| 4274 |
} |
| 4275 |
} |
| 4276 |
} |
| 4277 |
this.setSorted(true); |
| 4278 |
} |
| 4279 |
|
| 4280 |
/** |
| 4281 |
* this.ranges is sorted. |
| 4282 |
*/ |
| 4283 |
protected void compactRanges() { |
| 4284 |
boolean DEBUG = false; |
| 4285 |
if (this.ranges == null || this.ranges.length <= 2) |
| 4286 |
return; |
| 4287 |
if (this.isCompacted()) |
| 4288 |
return; |
| 4289 |
int base = 0; // Index of writing point |
| 4290 |
int target = 0; // Index of processing point |
| 4291 |
|
| 4292 |
while (target < this.ranges.length) { |
| 4293 |
if (base != target) { |
| 4294 |
this.ranges[base] = this.ranges[target++]; |
| 4295 |
this.ranges[base+1] = this.ranges[target++]; |
| 4296 |
} else |
| 4297 |
target += 2; |
| 4298 |
int baseend = this.ranges[base+1]; |
| 4299 |
while (target < this.ranges.length) { |
| 4300 |
if (baseend+1 < this.ranges[target]) |
| 4301 |
break; |
| 4302 |
if (baseend+1 == this.ranges[target]) { |
| 4303 |
if (DEBUG) |
| 4304 |
System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base] |
| 4305 |
+", "+this.ranges[base+1] |
| 4306 |
+"], ["+this.ranges[target] |
| 4307 |
+", "+this.ranges[target+1] |
| 4308 |
+"] -> ["+this.ranges[base] |
| 4309 |
+", "+this.ranges[target+1] |
| 4310 |
+"]"); |
| 4311 |
this.ranges[base+1] = this.ranges[target+1]; |
| 4312 |
baseend = this.ranges[base+1]; |
| 4313 |
target += 2; |
| 4314 |
} else if (baseend >= this.ranges[target+1]) { |
| 4315 |
if (DEBUG) |
| 4316 |
System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base] |
| 4317 |
+", "+this.ranges[base+1] |
| 4318 |
+"], ["+this.ranges[target] |
| 4319 |
+", "+this.ranges[target+1] |
| 4320 |
+"] -> ["+this.ranges[base] |
| 4321 |
+", "+this.ranges[base+1] |
| 4322 |
+"]"); |
| 4323 |
target += 2; |
| 4324 |
} else if (baseend < this.ranges[target+1]) { |
| 4325 |
if (DEBUG) |
| 4326 |
System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base] |
| 4327 |
+", "+this.ranges[base+1] |
| 4328 |
+"], ["+this.ranges[target] |
| 4329 |
+", "+this.ranges[target+1] |
| 4330 |
+"] -> ["+this.ranges[base] |
| 4331 |
+", "+this.ranges[target+1] |
| 4332 |
+"]"); |
| 4333 |
this.ranges[base+1] = this.ranges[target+1]; |
| 4334 |
baseend = this.ranges[base+1]; |
| 4335 |
target += 2; |
| 4336 |
} else { |
| 4337 |
throw new RuntimeException("Token#compactRanges(): Internel Error: [" |
| 4338 |
+this.ranges[base] |
| 4339 |
+","+this.ranges[base+1] |
| 4340 |
+"] ["+this.ranges[target] |
| 4341 |
+","+this.ranges[target+1]+"]"); |
| 4342 |
} |
| 4343 |
} // while |
| 4344 |
base += 2; |
| 4345 |
} |
| 4346 |
|
| 4347 |
if (base != this.ranges.length) { |
| 4348 |
int[] result = new int[base]; |
| 4349 |
System.arraycopy(this.ranges, 0, result, 0, base); |
| 4350 |
this.ranges = result; |
| 4351 |
} |
| 4352 |
this.setCompacted(); |
| 4353 |
} |
| 4354 |
|
| 4355 |
protected void mergeRanges(Token token) { |
| 4356 |
RangeToken tok = (RangeToken)token; |
| 4357 |
this.sortRanges(); |
| 4358 |
tok.sortRanges(); |
| 4359 |
if (tok.ranges == null) |
| 4360 |
return; |
| 4361 |
this.icaseCache = null; |
| 4362 |
this.setSorted(true); |
| 4363 |
if (this.ranges == null) { |
| 4364 |
this.ranges = new int[tok.ranges.length]; |
| 4365 |
System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length); |
| 4366 |
return; |
| 4367 |
} |
| 4368 |
int[] result = new int[this.ranges.length+tok.ranges.length]; |
| 4369 |
for (int i = 0, j = 0, k = 0; i < this.ranges.length || j < tok.ranges.length;) { |
| 4370 |
if (i >= this.ranges.length) { |
| 4371 |
result[k++] = tok.ranges[j++]; |
| 4372 |
result[k++] = tok.ranges[j++]; |
| 4373 |
} else if (j >= tok.ranges.length) { |
| 4374 |
result[k++] = this.ranges[i++]; |
| 4375 |
result[k++] = this.ranges[i++]; |
| 4376 |
} else if (tok.ranges[j] < this.ranges[i] |
| 4377 |
|| tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) { |
| 4378 |
result[k++] = tok.ranges[j++]; |
| 4379 |
result[k++] = tok.ranges[j++]; |
| 4380 |
} else { |
| 4381 |
result[k++] = this.ranges[i++]; |
| 4382 |
result[k++] = this.ranges[i++]; |
| 4383 |
} |
| 4384 |
} |
| 4385 |
this.ranges = result; |
| 4386 |
} |
| 4387 |
|
| 4388 |
protected void subtractRanges(Token token) { |
| 4389 |
if (token.type == NRANGE) { |
| 4390 |
this.intersectRanges(token); |
| 4391 |
return; |
| 4392 |
} |
| 4393 |
RangeToken tok = (RangeToken)token; |
| 4394 |
if (tok.ranges == null || this.ranges == null) |
| 4395 |
return; |
| 4396 |
this.icaseCache = null; |
| 4397 |
this.sortRanges(); |
| 4398 |
this.compactRanges(); |
| 4399 |
tok.sortRanges(); |
| 4400 |
tok.compactRanges(); |
| 4401 |
|
| 4402 |
//System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length); |
| 4403 |
|
| 4404 |
int[] result = new int[this.ranges.length+tok.ranges.length]; |
| 4405 |
int wp = 0, src = 0, sub = 0; |
| 4406 |
while (src < this.ranges.length && sub < tok.ranges.length) { |
| 4407 |
int srcbegin = this.ranges[src]; |
| 4408 |
int srcend = this.ranges[src+1]; |
| 4409 |
int subbegin = tok.ranges[sub]; |
| 4410 |
int subend = tok.ranges[sub+1]; |
| 4411 |
if (srcend < subbegin) { // Not overlapped |
| 4412 |
// src: o-----o |
| 4413 |
// sub: o-----o |
| 4414 |
// res: o-----o |
| 4415 |
// Reuse sub |
| 4416 |
result[wp++] = this.ranges[src++]; |
| 4417 |
result[wp++] = this.ranges[src++]; |
| 4418 |
} else if (srcend >= subbegin |
| 4419 |
&& srcbegin <= subend) { // Overlapped |
| 4420 |
// src: o--------o |
| 4421 |
// sub: o----o |
| 4422 |
// sub: o----o |
| 4423 |
// sub: o----o |
| 4424 |
// sub: o------------o |
| 4425 |
if (subbegin <= srcbegin && srcend <= subend) { |
| 4426 |
// src: o--------o |
| 4427 |
// sub: o------------o |
| 4428 |
// res: empty |
| 4429 |
// Reuse sub |
| 4430 |
src += 2; |
| 4431 |
} else if (subbegin <= srcbegin) { |
| 4432 |
// src: o--------o |
| 4433 |
// sub: o----o |
| 4434 |
// res: o-----o |
| 4435 |
// Reuse src(=res) |
| 4436 |
this.ranges[src] = subend+1; |
| 4437 |
sub += 2; |
| 4438 |
} else if (srcend <= subend) { |
| 4439 |
// src: o--------o |
| 4440 |
// sub: o----o |
| 4441 |
// res: o-----o |
| 4442 |
// Reuse sub |
| 4443 |
result[wp++] = srcbegin; |
| 4444 |
result[wp++] = subbegin-1; |
| 4445 |
src += 2; |
| 4446 |
} else { |
| 4447 |
// src: o--------o |
| 4448 |
// sub: o----o |
| 4449 |
// res: o-o o-o |
| 4450 |
// Reuse src(=right res) |
| 4451 |
result[wp++] = srcbegin; |
| 4452 |
result[wp++] = subbegin-1; |
| 4453 |
this.ranges[src] = subend+1; |
| 4454 |
sub += 2; |
| 4455 |
} |
| 4456 |
} else if (subend < srcbegin) { |
| 4457 |
// Not overlapped |
| 4458 |
// src: o-----o |
| 4459 |
// sub: o----o |
| 4460 |
sub += 2; |
| 4461 |
} else { |
| 4462 |
throw new RuntimeException("Token#subtractRanges(): Internal Error: ["+this.ranges[src] |
| 4463 |
+","+this.ranges[src+1] |
| 4464 |
+"] - ["+tok.ranges[sub] |
| 4465 |
+","+tok.ranges[sub+1] |
| 4466 |
+"]"); |
| 4467 |
} |
| 4468 |
} |
| 4469 |
while (src < this.ranges.length) { |
| 4470 |
result[wp++] = this.ranges[src++]; |
| 4471 |
result[wp++] = this.ranges[src++]; |
| 4472 |
} |
| 4473 |
this.ranges = new int[wp]; |
| 4474 |
System.arraycopy(result, 0, this.ranges, 0, wp); |
| 4475 |
// this.ranges is sorted and compacted. |
| 4476 |
} |
| 4477 |
|
| 4478 |
/** |
| 4479 |
* @param tok Ignore whether it is NRANGE or not. |
| 4480 |
*/ |
| 4481 |
protected void intersectRanges(Token token) { |
| 4482 |
RangeToken tok = (RangeToken)token; |
| 4483 |
if (tok.ranges == null || this.ranges == null) |
| 4484 |
return; |
| 4485 |
this.icaseCache = null; |
| 4486 |
this.sortRanges(); |
| 4487 |
this.compactRanges(); |
| 4488 |
tok.sortRanges(); |
| 4489 |
tok.compactRanges(); |
| 4490 |
|
| 4491 |
int[] result = new int[this.ranges.length+tok.ranges.length]; |
| 4492 |
int wp = 0, src1 = 0, src2 = 0; |
| 4493 |
while (src1 < this.ranges.length && src2 < tok.ranges.length) { |
| 4494 |
int src1begin = this.ranges[src1]; |
| 4495 |
int src1end = this.ranges[src1+1]; |
| 4496 |
int src2begin = tok.ranges[src2]; |
| 4497 |
int src2end = tok.ranges[src2+1]; |
| 4498 |
if (src1end < src2begin) { // Not overlapped |
| 4499 |
// src1: o-----o |
| 4500 |
// src2: o-----o |
| 4501 |
// res: empty |
| 4502 |
// Reuse src2 |
| 4503 |
src1 += 2; |
| 4504 |
} else if (src1end >= src2begin |
| 4505 |
&& src1begin <= src2end) { // Overlapped |
| 4506 |
// src1: o--------o |
| 4507 |
// src2: o----o |
| 4508 |
// src2: o----o |
| 4509 |
// src2: o----o |
| 4510 |
// src2: o------------o |
| 4511 |
if (src2begin <= src2begin && src1end <= src2end) { |
| 4512 |
// src1: o--------o |
| 4513 |
// src2: o------------o |
| 4514 |
// res: o--------o |
| 4515 |
// Reuse src2 |
| 4516 |
result[wp++] = src1begin; |
| 4517 |
result[wp++] = src1end; |
| 4518 |
src1 += 2; |
| 4519 |
} else if (src2begin <= src1begin) { |
| 4520 |
// src1: o--------o |
| 4521 |
// src2: o----o |
| 4522 |
// res: o--o |
| 4523 |
// Reuse the rest of src1 |
| 4524 |
result[wp++] = src1begin; |
| 4525 |
result[wp++] = src2end; |
| 4526 |
this.ranges[src1] = src2end+1; |
| 4527 |
src2 += 2; |
| 4528 |
} else if (src1end <= src2end) { |
| 4529 |
// src1: o--------o |
| 4530 |
// src2: o----o |
| 4531 |
// res: o--o |
| 4532 |
// Reuse src2 |
| 4533 |
result[wp++] = src2begin; |
| 4534 |
result[wp++] = src1end; |
| 4535 |
src1 += 2; |
| 4536 |
} else { |
| 4537 |
// src1: o--------o |
| 4538 |
// src2: o----o |
| 4539 |
// res: o----o |
| 4540 |
// Reuse the rest of src1 |
| 4541 |
result[wp++] = src2begin; |
| 4542 |
result[wp++] = src2end; |
| 4543 |
this.ranges[src1] = src2end+1; |
| 4544 |
} |
| 4545 |
} else if (src2end < src1begin) { |
| 4546 |
// Not overlapped |
| 4547 |
// src1: o-----o |
| 4548 |
// src2: o----o |
| 4549 |
src2 += 2; |
| 4550 |
} else { |
| 4551 |
throw new RuntimeException("Token#intersectRanges(): Internal Error: [" |
| 4552 |
+this.ranges[src1] |
| 4553 |
+","+this.ranges[src1+1] |
| 4554 |
+"] & ["+tok.ranges[src2] |
| 4555 |
+","+tok.ranges[src2+1] |
| 4556 |
+"]"); |
| 4557 |
} |
| 4558 |
} |
| 4559 |
while (src1 < this.ranges.length) { |
| 4560 |
result[wp++] = this.ranges[src1++]; |
| 4561 |
result[wp++] = this.ranges[src1++]; |
| 4562 |
} |
| 4563 |
this.ranges = new int[wp]; |
| 4564 |
System.arraycopy(result, 0, this.ranges, 0, wp); |
| 4565 |
// this.ranges is sorted and compacted. |
| 4566 |
} |
| 4567 |
|
| 4568 |
/** |
| 4569 |
* for RANGE: Creates complement. |
| 4570 |
* for NRANGE: Creates the same meaning RANGE. |
| 4571 |
*/ |
| 4572 |
static Token complementRanges(Token token) { |
| 4573 |
if (token.type != RANGE && token.type != NRANGE) |
| 4574 |
throw new IllegalArgumentException("Token#complementRanges(): must be RANGE: "+token.type); |
| 4575 |
RangeToken tok = (RangeToken)token; |
| 4576 |
tok.sortRanges(); |
| 4577 |
tok.compactRanges(); |
| 4578 |
int len = tok.ranges.length+2; |
| 4579 |
if (tok.ranges[0] == 0) |
| 4580 |
len -= 2; |
| 4581 |
int last = tok.ranges[tok.ranges.length-1]; |
| 4582 |
if (last == UTF16_MAX) |
| 4583 |
len -= 2; |
| 4584 |
RangeToken ret = Token.createRange(); |
| 4585 |
ret.ranges = new int[len]; |
| 4586 |
int wp = 0; |
| 4587 |
if (tok.ranges[0] > 0) { |
| 4588 |
ret.ranges[wp++] = 0; |
| 4589 |
ret.ranges[wp++] = tok.ranges[0]-1; |
| 4590 |
} |
| 4591 |
for (int i = 1; i < tok.ranges.length-2; i += 2) { |
| 4592 |
ret.ranges[wp++] = tok.ranges[i]+1; |
| 4593 |
ret.ranges[wp++] = tok.ranges[i+1]-1; |
| 4594 |
} |
| 4595 |
if (last != UTF16_MAX) { |
| 4596 |
ret.ranges[wp++] = last+1; |
| 4597 |
ret.ranges[wp] = UTF16_MAX; |
| 4598 |
} |
| 4599 |
ret.setCompacted(); |
| 4600 |
return ret; |
| 4601 |
} |
| 4602 |
|
| 4603 |
synchronized RangeToken getCaseInsensitiveToken() { |
| 4604 |
if (this.icaseCache != null) |
| 4605 |
return this.icaseCache; |
| 4606 |
|
| 4607 |
RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange(); |
| 4608 |
for (int i = 0; i < this.ranges.length; i += 2) { |
| 4609 |
for (int ch = this.ranges[i]; ch <= this.ranges[i+1]; ch ++) { |
| 4610 |
if (ch > 0xffff) |
| 4611 |
uppers.addRange(ch, ch); |
| 4612 |
else { |
| 4613 |
char uch = Character.toUpperCase((char)ch); |
| 4614 |
uppers.addRange(uch, uch); |
| 4615 |
} |
| 4616 |
} |
| 4617 |
} |
| 4618 |
RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange(); |
| 4619 |
for (int i = 0; i < uppers.ranges.length; i += 2) { |
| 4620 |
for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i+1]; ch ++) { |
| 4621 |
if (ch > 0xffff) |
| 4622 |
lowers.addRange(ch, ch); |
| 4623 |
else { |
| 4624 |
char uch = Character.toUpperCase((char)ch); |
| 4625 |
lowers.addRange(uch, uch); |
| 4626 |
} |
| 4627 |
} |
| 4628 |
} |
| 4629 |
lowers.mergeRanges(uppers); |
| 4630 |
lowers.mergeRanges(this); |
| 4631 |
lowers.compactRanges(); |
| 4632 |
|
| 4633 |
this.icaseCache = lowers; |
| 4634 |
return lowers; |
| 4635 |
} |
| 4636 |
|
| 4637 |
void dumpRanges() { |
| 4638 |
System.err.print("RANGE: "); |
| 4639 |
if (this.ranges == null) |
| 4640 |
System.err.println(" NULL"); |
| 4641 |
for (int i = 0; i < this.ranges.length; i += 2) { |
| 4642 |
System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] "); |
| 4643 |
} |
| 4644 |
System.err.println(""); |
| 4645 |
} |
| 4646 |
|
| 4647 |
boolean match(int ch) { |
| 4648 |
if (this.map == null) this.createMap(); |
| 4649 |
boolean ret; |
| 4650 |
if (this.type == RANGE) { |
| 4651 |
if (ch < MAPSIZE) |
| 4652 |
return (this.map[ch/32] & (1<<(ch&0x1f))) != 0; |
| 4653 |
ret = false; |
| 4654 |
for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) { |
| 4655 |
if (this.ranges[i] <= ch && ch <= this.ranges[i+1]) |
| 4656 |
return true; |
| 4657 |
} |
| 4658 |
} else { |
| 4659 |
if (ch < MAPSIZE) |
| 4660 |
return (this.map[ch/32] & (1<<(ch&0x1f))) == 0; |
| 4661 |
ret = true; |
| 4662 |
for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) { |
| 4663 |
if (this.ranges[i] <= ch && ch <= this.ranges[i+1]) |
| 4664 |
return false; |
| 4665 |
} |
| 4666 |
} |
| 4667 |
return ret; |
| 4668 |
} |
| 4669 |
|
| 4670 |
private static final int MAPSIZE = 256; |
| 4671 |
private void createMap() { |
| 4672 |
int asize = MAPSIZE/32; // 32 is the number of bits in `int'. |
| 4673 |
this.map = new int[asize]; |
| 4674 |
this.nonMapIndex = this.ranges.length; |
| 4675 |
for (int i = 0; i < asize; i ++) this.map[i] = 0; |
| 4676 |
for (int i = 0; i < this.ranges.length; i += 2) { |
| 4677 |
int s = this.ranges[i]; |
| 4678 |
int e = this.ranges[i+1]; |
| 4679 |
if (s < MAPSIZE) { |
| 4680 |
for (int j = s; j <= e && j < MAPSIZE; j ++) |
| 4681 |
this.map[j/32] |= 1<<(j&0x1f); // s&0x1f : 0-31 |
| 4682 |
} else { |
| 4683 |
this.nonMapIndex = i; |
| 4684 |
break; |
| 4685 |
} |
| 4686 |
if (e >= MAPSIZE) { |
| 4687 |
this.nonMapIndex = i; |
| 4688 |
break; |
| 4689 |
} |
| 4690 |
} |
| 4691 |
//for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16)); |
| 4692 |
} |
| 4693 |
|
| 4694 |
public String toString(int options) { |
| 4695 |
String ret; |
| 4696 |
if (this.type == RANGE) { |
| 4697 |
if (this == Token.token_dot) |
| 4698 |
ret = "."; |
| 4699 |
else if (this == Token.token_0to9) |
| 4700 |
ret = "\\d"; |
| 4701 |
else if (this == Token.token_wordchars) |
| 4702 |
ret = "\\w"; |
| 4703 |
else if (this == Token.token_spaces) |
| 4704 |
ret = "\\s"; |
| 4705 |
else { |
| 4706 |
StringBuffer sb = new StringBuffer(); |
| 4707 |
sb.append("["); |
| 4708 |
for (int i = 0; i < this.ranges.length; i += 2) { |
| 4709 |
if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(","); |
| 4710 |
if (this.ranges[i] == this.ranges[i+1]) { |
| 4711 |
sb.append(escapeCharInCharClass(this.ranges[i])); |
| 4712 |
} else { |
| 4713 |
sb.append(escapeCharInCharClass(this.ranges[i])); |
| 4714 |
sb.append('-'); |
| 4715 |
sb.append(escapeCharInCharClass(this.ranges[i+1])); |
| 4716 |
} |
| 4717 |
} |
| 4718 |
sb.append("]"); |
| 4719 |
ret = sb.toString(); |
| 4720 |
} |
| 4721 |
} else { |
| 4722 |
if (this == Token.token_not_0to9) |
| 4723 |
ret = "\\D"; |
| 4724 |
else if (this == Token.token_not_wordchars) |
| 4725 |
ret = "\\W"; |
| 4726 |
else if (this == Token.token_not_spaces) |
| 4727 |
ret = "\\S"; |
| 4728 |
else { |
| 4729 |
StringBuffer sb = new StringBuffer(); |
| 4730 |
sb.append("[^"); |
| 4731 |
for (int i = 0; i < this.ranges.length; i += 2) { |
| 4732 |
if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(","); |
| 4733 |
if (this.ranges[i] == this.ranges[i+1]) { |
| 4734 |
sb.append(escapeCharInCharClass(this.ranges[i])); |
| 4735 |
} else { |
| 4736 |
sb.append(escapeCharInCharClass(this.ranges[i])); |
| 4737 |
sb.append('-'); |
| 4738 |
sb.append(escapeCharInCharClass(this.ranges[i+1])); |
| 4739 |
} |
| 4740 |
} |
| 4741 |
sb.append("]"); |
| 4742 |
ret = sb.toString(); |
| 4743 |
} |
| 4744 |
} |
| 4745 |
return ret; |
| 4746 |
} |
| 4747 |
|
| 4748 |
private static String escapeCharInCharClass(int ch) { |
| 4749 |
String ret; |
| 4750 |
switch (ch) { |
| 4751 |
case '[': case ']': case '-': case '^': |
| 4752 |
case ',': case '\\': |
| 4753 |
ret = "\\"+(char)ch; |
| 4754 |
break; |
| 4755 |
case '\f': ret = "\\f"; break; |
| 4756 |
case '\n': ret = "\\n"; break; |
| 4757 |
case '\r': ret = "\\r"; break; |
| 4758 |
case '\t': ret = "\\t"; break; |
| 4759 |
case 0x1b: ret = "\\e"; break; |
| 4760 |
//case 0x0b: ret = "\\v"; break; |
| 4761 |
default: |
| 4762 |
if (ch < 0x20) { |
| 4763 |
String pre = "0"+Integer.toHexString(ch); |
| 4764 |
ret = "\\x"+pre.substring(pre.length()-2, pre.length()); |
| 4765 |
} else if (ch >= 0x10000) { |
| 4766 |
String pre = "0"+Integer.toHexString(ch); |
| 4767 |
ret = "\\v"+pre.substring(pre.length()-6, pre.length()); |
| 4768 |
} else |
| 4769 |
ret = ""+(char)ch; |
| 4770 |
} |
| 4771 |
return ret; |
| 4772 |
} |
| 4773 |
|
| 4774 |
} |
| 4775 |
|
| 4776 |
static class RegexParser { |
| 4777 |
static final int T_CHAR = 0; |
| 4778 |
static final int T_EOF = 1; |
| 4779 |
static final int T_OR = 2; // '|' |
| 4780 |
static final int T_STAR = 3; // '*' |
| 4781 |
static final int T_PLUS = 4; // '+' |
| 4782 |
static final int T_QUESTION = 5; // '?' |
| 4783 |
static final int T_LPAREN = 6; // '(' |
| 4784 |
static final int T_RPAREN = 7; // ')' |
| 4785 |
static final int T_DOT = 8; // '.' |
| 4786 |
static final int T_LBRACKET = 9; // '[' |
| 4787 |
static final int T_BACKSOLIDUS = 10; // '\' |
| 4788 |
static final int T_CARET = 11; // '^' |
| 4789 |
static final int T_DOLLAR = 12; // '$' |
| 4790 |
static final int T_LPAREN2 = 13; // '(?:' |
| 4791 |
static final int T_LOOKAHEAD = 14; // '(?=' |
| 4792 |
static final int T_NEGATIVELOOKAHEAD = 15; // '(?!' |
| 4793 |
static final int T_LOOKBEHIND = 16; // '(?<=' |
| 4794 |
static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!' |
| 4795 |
static final int T_INDEPENDENT = 18; // '(?>' |
| 4796 |
static final int T_SET_OPERATIONS = 19; // '(?[' |
| 4797 |
static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class |
| 4798 |
static final int T_COMMENT = 21; // '(?#' |
| 4799 |
static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z] |
| 4800 |
static final int T_CONDITION = 23; // '(?(' |
| 4801 |
static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class |
| 4802 |
|
| 4803 |
static class ReferencePosition { |
| 4804 |
int refNumber; |
| 4805 |
int position; |
| 4806 |
ReferencePosition(int n, int pos) { |
| 4807 |
this.refNumber = n; |
| 4808 |
this.position = pos; |
| 4809 |
} |
| 4810 |
} |
| 4811 |
|
| 4812 |
int offset; |
| 4813 |
String regex; |
| 4814 |
int regexlen; |
| 4815 |
int options; |
| 4816 |
ResourceBundle resources; |
| 4817 |
int chardata; |
| 4818 |
int nexttoken; |
| 4819 |
static protected final int S_NORMAL = 0; |
| 4820 |
static protected final int S_INBRACKETS = 1; |
| 4821 |
static protected final int S_INXBRACKETS = 2; |
| 4822 |
int context = S_NORMAL; |
| 4823 |
int parennumber = 1; |
| 4824 |
boolean hasBackReferences; |
| 4825 |
Vector references = null; |
| 4826 |
|
| 4827 |
/** Creates a parser; no locale-dependent setup is performed. */
public RegexParser() {
    // Intentionally empty: the former setLocale(Locale.getDefault()) call is disabled.
}
| 4830 |
/**
 * Creates a parser. The argument is accepted but not used.
 *
 * @param locale ignored
 */
public RegexParser(Locale locale) {
    // Intentionally empty: the former setLocale(locale) call is disabled.
}
| 4833 |
|
| 4834 |
public void setLocale(Locale locale) { |
| 4835 |
} |
| 4836 |
|
| 4837 |
final ParseException ex(String key, int loc) { |
| 4838 |
return new ParseException(EcorePlugin.INSTANCE.getString(key), loc); |
| 4839 |
} |
| 4840 |
|
| 4841 |
private final boolean isSet(int flag) { |
| 4842 |
return (this.options & flag) == flag; |
| 4843 |
} |
| 4844 |
|
| 4845 |
synchronized Token parse(String regex, int options) throws ParseException { |
| 4846 |
this.options = options; |
| 4847 |
this.offset = 0; |
| 4848 |
this.setContext(S_NORMAL); |
| 4849 |
this.parennumber = 1; |
| 4850 |
this.hasBackReferences = false; |
| 4851 |
this.regex = regex; |
| 4852 |
if (this.isSet(RegularExpression.EXTENDED_COMMENT)) |
| 4853 |
this.regex = REUtil.stripExtendedComment(this.regex); |
| 4854 |
this.regexlen = this.regex.length(); |
| 4855 |
|
| 4856 |
|
| 4857 |
this.next(); |
| 4858 |
Token ret = this.parseRegex(); |
| 4859 |
if (this.offset != this.regexlen) |
| 4860 |
throw ex("parser.parse.1", this.offset); |
| 4861 |
if (this.references != null) { |
| 4862 |
for (int i = 0; i < this.references.size(); i ++) { |
| 4863 |
ReferencePosition position = (ReferencePosition)this.references.elementAt(i); |
| 4864 |
if (this.parennumber <= position.refNumber) |
| 4865 |
throw ex("parser.parse.2", position.position); |
| 4866 |
} |
| 4867 |
this.references.removeAllElements(); |
| 4868 |
} |
| 4869 |
return ret; |
| 4870 |
} |
| 4871 |
|
| 4872 |
/* |
| 4873 |
public RegularExpression createRegex(String regex, int options) throws ParseException { |
| 4874 |
Token tok = this.parse(regex, options); |
| 4875 |
return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options); |
| 4876 |
} |
| 4877 |
*/ |
| 4878 |
|
| 4879 |
protected final void setContext(int con) { |
| 4880 |
this.context = con; |
| 4881 |
} |
| 4882 |
|
| 4883 |
final int read() { |
| 4884 |
return this.nexttoken; |
| 4885 |
} |
| 4886 |
|
| 4887 |
final void next() { |
| 4888 |
if (this.offset >= this.regexlen) { |
| 4889 |
this.chardata = -1; |
| 4890 |
this.nexttoken = T_EOF; |
| 4891 |
return; |
| 4892 |
} |
| 4893 |
|
| 4894 |
int ret; |
| 4895 |
int ch = this.regex.charAt(this.offset++); |
| 4896 |
this.chardata = ch; |
| 4897 |
|
| 4898 |
if (this.context == S_INBRACKETS) { |
| 4899 |
// In a character class, this.chardata has one character, that is to say, |
| 4900 |
// a pair of surrogates is composed and stored to this.chardata. |
| 4901 |
switch (ch) { |
| 4902 |
case '\\': |
| 4903 |
ret = T_BACKSOLIDUS; |
| 4904 |
if (this.offset >= this.regexlen) |
| 4905 |
throw ex("parser.next.1", this.offset-1); |
| 4906 |
this.chardata = this.regex.charAt(this.offset++); |
| 4907 |
break; |
| 4908 |
|
| 4909 |
case '-': |
| 4910 |
if (this.isSet(RegularExpression.XMLSCHEMA_MODE) |
| 4911 |
&& this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') { |
| 4912 |
this.offset++; |
| 4913 |
ret = T_XMLSCHEMA_CC_SUBTRACTION; |
| 4914 |
} else |
| 4915 |
ret = T_CHAR; |
| 4916 |
break; |
| 4917 |
|
| 4918 |
case '[': |
| 4919 |
if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) |
| 4920 |
&& this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') { |
| 4921 |
this.offset++; |
| 4922 |
ret = T_POSIX_CHARCLASS_START; |
| 4923 |
break; |
| 4924 |
} // Through down |
| 4925 |
default: |
| 4926 |
if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) { |
| 4927 |
int low = this.regex.charAt(this.offset); |
| 4928 |
if (REUtil.isLowSurrogate(low)) { |
| 4929 |
this.chardata = REUtil.composeFromSurrogates(ch, low); |
| 4930 |
this.offset ++; |
| 4931 |
} |
| 4932 |
} |
| 4933 |
ret = T_CHAR; |
| 4934 |
} |
| 4935 |
this.nexttoken = ret; |
| 4936 |
return; |
| 4937 |
} |
| 4938 |
|
| 4939 |
switch (ch) { |
| 4940 |
case '|': ret = T_OR; break; |
| 4941 |
case '*': ret = T_STAR; break; |
| 4942 |
case '+': ret = T_PLUS; break; |
| 4943 |
case '?': ret = T_QUESTION; break; |
| 4944 |
case ')': ret = T_RPAREN; break; |
| 4945 |
case '.': ret = T_DOT; break; |
| 4946 |
case '[': ret = T_LBRACKET; break; |
| 4947 |
case '^': ret = T_CARET; break; |
| 4948 |
case '$': ret = T_DOLLAR; break; |
| 4949 |
case '(': |
| 4950 |
ret = T_LPAREN; |
| 4951 |
if (this.offset >= this.regexlen) |
| 4952 |
break; |
| 4953 |
if (this.regex.charAt(this.offset) != '?') |
| 4954 |
break; |
| 4955 |
if (++this.offset >= this.regexlen) |
| 4956 |
throw ex("parser.next.2", this.offset-1); |
| 4957 |
ch = this.regex.charAt(this.offset++); |
| 4958 |
switch (ch) { |
| 4959 |
case ':': ret = T_LPAREN2; break; |
| 4960 |
case '=': ret = T_LOOKAHEAD; break; |
| 4961 |
case '!': ret = T_NEGATIVELOOKAHEAD; break; |
| 4962 |
case '[': ret = T_SET_OPERATIONS; break; |
| 4963 |
case '>': ret = T_INDEPENDENT; break; |
| 4964 |
case '<': |
| 4965 |
if (this.offset >= this.regexlen) |
| 4966 |
throw ex("parser.next.2", this.offset-3); |
| 4967 |
ch = this.regex.charAt(this.offset++); |
| 4968 |
if (ch == '=') { |
| 4969 |
ret = T_LOOKBEHIND; |
| 4970 |
} else if (ch == '!') { |
| 4971 |
ret = T_NEGATIVELOOKBEHIND; |
| 4972 |
} else |
| 4973 |
throw ex("parser.next.3", this.offset-3); |
| 4974 |
break; |
| 4975 |
case '#': |
| 4976 |
while (this.offset < this.regexlen) { |
| 4977 |
ch = this.regex.charAt(this.offset++); |
| 4978 |
if (ch == ')') break; |
| 4979 |
} |
| 4980 |
if (ch != ')') |
| 4981 |
throw ex("parser.next.4", this.offset-1); |
| 4982 |
ret = T_COMMENT; |
| 4983 |
break; |
| 4984 |
default: |
| 4985 |
if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options |
| 4986 |
this.offset --; |
| 4987 |
ret = T_MODIFIERS; |
| 4988 |
break; |
| 4989 |
} else if (ch == '(') { // conditional |
| 4990 |
ret = T_CONDITION; // this.offsets points the next of '('. |
| 4991 |
break; |
| 4992 |
} |
| 4993 |
throw ex("parser.next.2", this.offset-2); |
| 4994 |
} |
| 4995 |
break; |
| 4996 |
|
| 4997 |
case '\\': |
| 4998 |
ret = T_BACKSOLIDUS; |
| 4999 |
if (this.offset >= this.regexlen) |
| 5000 |
throw ex("parser.next.1", this.offset-1); |
| 5001 |
this.chardata = this.regex.charAt(this.offset++); |
| 5002 |
break; |
| 5003 |
|
| 5004 |
default: |
| 5005 |
ret = T_CHAR; |
| 5006 |
} |
| 5007 |
this.nexttoken = ret; |
| 5008 |
} |
| 5009 |
|
| 5010 |
/** |
| 5011 |
* regex ::= term (`|` term)* |
| 5012 |
* term ::= factor+ |
| 5013 |
* factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' |
| 5014 |
* | atom (('*' | '+' | '?' | minmax ) '?'? )?) |
| 5015 |
* | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' |
| 5016 |
* atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9] |
| 5017 |
* | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block |
| 5018 |
*/ |
| 5019 |
Token parseRegex() throws ParseException { |
| 5020 |
Token tok = this.parseTerm(); |
| 5021 |
Token parent = null; |
| 5022 |
while (this.read() == T_OR) { |
| 5023 |
this.next(); // '|' |
| 5024 |
if (parent == null) { |
| 5025 |
parent = Token.createUnion(); |
| 5026 |
parent.addChild(tok); |
| 5027 |
tok = parent; |
| 5028 |
} |
| 5029 |
tok.addChild(this.parseTerm()); |
| 5030 |
} |
| 5031 |
return tok; |
| 5032 |
} |
| 5033 |
|
| 5034 |
/** |
| 5035 |
* term ::= factor+ |
| 5036 |
*/ |
| 5037 |
Token parseTerm() throws ParseException { |
| 5038 |
int ch = this.read(); |
| 5039 |
if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) { |
| 5040 |
return Token.createEmpty(); |
| 5041 |
} else { |
| 5042 |
Token tok = this.parseFactor(); |
| 5043 |
Token concat = null; |
| 5044 |
while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) { |
| 5045 |
if (concat == null) { |
| 5046 |
concat = Token.createConcat(); |
| 5047 |
concat.addChild(tok); |
| 5048 |
tok = concat; |
| 5049 |
} |
| 5050 |
concat.addChild(this.parseFactor()); |
| 5051 |
//tok = Token.createConcat(tok, this.parseFactor()); |
| 5052 |
} |
| 5053 |
return tok; |
| 5054 |
} |
| 5055 |
} |
| 5056 |
|
| 5057 |
// ---------------------------------------------------------------- |
| 5058 |
|
| 5059 |
Token processCaret() throws ParseException { |
| 5060 |
this.next(); |
| 5061 |
return Token.token_linebeginning; |
| 5062 |
} |
| 5063 |
Token processDollar() throws ParseException { |
| 5064 |
this.next(); |
| 5065 |
return Token.token_lineend; |
| 5066 |
} |
| 5067 |
Token processLookahead() throws ParseException { |
| 5068 |
this.next(); |
| 5069 |
Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex()); |
| 5070 |
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| 5071 |
this.next(); // ')' |
| 5072 |
return tok; |
| 5073 |
} |
| 5074 |
Token processNegativelookahead() throws ParseException { |
| 5075 |
this.next(); |
| 5076 |
Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex()); |
| 5077 |
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| 5078 |
this.next(); // ')' |
| 5079 |
return tok; |
| 5080 |
} |
| 5081 |
Token processLookbehind() throws ParseException { |
| 5082 |
this.next(); |
| 5083 |
Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex()); |
| 5084 |
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| 5085 |
this.next(); // ')' |
| 5086 |
return tok; |
| 5087 |
} |
| 5088 |
Token processNegativelookbehind() throws ParseException { |
| 5089 |
this.next(); |
| 5090 |
Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex()); |
| 5091 |
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| 5092 |
this.next(); // ')' |
| 5093 |
return tok; |
| 5094 |
} |
| 5095 |
Token processBacksolidus_A() throws ParseException { |
| 5096 |
this.next(); |
| 5097 |
return Token.token_stringbeginning; |
| 5098 |
} |
| 5099 |
Token processBacksolidus_Z() throws ParseException { |
| 5100 |
this.next(); |
| 5101 |
return Token.token_stringend2; |
| 5102 |
} |
| 5103 |
Token processBacksolidus_z() throws ParseException { |
| 5104 |
this.next(); |
| 5105 |
return Token.token_stringend; |
| 5106 |
} |
| 5107 |
Token processBacksolidus_b() throws ParseException { |
| 5108 |
this.next(); |
| 5109 |
return Token.token_wordedge; |
| 5110 |
} |
| 5111 |
Token processBacksolidus_B() throws ParseException { |
| 5112 |
this.next(); |
| 5113 |
return Token.token_not_wordedge; |
| 5114 |
} |
| 5115 |
Token processBacksolidus_lt() throws ParseException { |
| 5116 |
this.next(); |
| 5117 |
return Token.token_wordbeginning; |
| 5118 |
} |
| 5119 |
Token processBacksolidus_gt() throws ParseException { |
| 5120 |
this.next(); |
| 5121 |
return Token.token_wordend; |
| 5122 |
} |
| 5123 |
Token processStar(Token tok) throws ParseException { |
| 5124 |
this.next(); |
| 5125 |
if (this.read() == T_QUESTION) { |
| 5126 |
this.next(); |
| 5127 |
return Token.createNGClosure(tok); |
| 5128 |
} else |
| 5129 |
return Token.createClosure(tok); |
| 5130 |
} |
| 5131 |
Token processPlus(Token tok) throws ParseException { |
| 5132 |
// X+ -> XX* |
| 5133 |
this.next(); |
| 5134 |
if (this.read() == T_QUESTION) { |
| 5135 |
this.next(); |
| 5136 |
return Token.createConcat(tok, Token.createNGClosure(tok)); |
| 5137 |
} else |
| 5138 |
return Token.createConcat(tok, Token.createClosure(tok)); |
| 5139 |
} |
| 5140 |
Token processQuestion(Token tok) throws ParseException { |
| 5141 |
// X? -> X| |
| 5142 |
this.next(); |
| 5143 |
Token par = Token.createUnion(); |
| 5144 |
if (this.read() == T_QUESTION) { |
| 5145 |
this.next(); |
| 5146 |
par.addChild(Token.createEmpty()); |
| 5147 |
par.addChild(tok); |
| 5148 |
} else { |
| 5149 |
par.addChild(tok); |
| 5150 |
par.addChild(Token.createEmpty()); |
| 5151 |
} |
| 5152 |
return par; |
| 5153 |
} |
| 5154 |
boolean checkQuestion(int off) { |
| 5155 |
return off < this.regexlen && this.regex.charAt(off) == '?'; |
| 5156 |
} |
| 5157 |
Token processParen() throws ParseException { |
| 5158 |
this.next(); |
| 5159 |
int p = this.parennumber++; |
| 5160 |
Token tok = Token.createParen(this.parseRegex(), p); |
| 5161 |
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| 5162 |
this.next(); // Skips ')' |
| 5163 |
return tok; |
| 5164 |
} |
| 5165 |
Token processParen2() throws ParseException { |
| 5166 |
this.next(); |
| 5167 |
Token tok = Token.createParen(this.parseRegex(), 0); |
| 5168 |
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| 5169 |
this.next(); // Skips ')' |
| 5170 |
return tok; |
| 5171 |
} |
| 5172 |
Token processCondition() throws ParseException { |
| 5173 |
// this.offset points the next of '(' |
| 5174 |
if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset); |
| 5175 |
// Parses a condition. |
| 5176 |
int refno = -1; |
| 5177 |
Token condition = null; |
| 5178 |
int ch = this.regex.charAt(this.offset); |
| 5179 |
if ('1' <= ch && ch <= '9') { |
| 5180 |
refno = ch-'0'; |
| 5181 |
this.hasBackReferences = true; |
| 5182 |
if (this.references == null) this.references = new Vector(); |
| 5183 |
this.references.addElement(new ReferencePosition(refno, this.offset)); |
| 5184 |
this.offset ++; |
| 5185 |
if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset); |
| 5186 |
this.offset ++; |
| 5187 |
} else { |
| 5188 |
if (ch == '?') this.offset --; // Points '('. |
| 5189 |
this.next(); |
| 5190 |
condition = this.parseFactor(); |
| 5191 |
switch (condition.type) { |
| 5192 |
case Token.LOOKAHEAD: |
| 5193 |
case Token.NEGATIVELOOKAHEAD: |
| 5194 |
case Token.LOOKBEHIND: |
| 5195 |
case Token.NEGATIVELOOKBEHIND: |
| 5196 |
break; |
| 5197 |
case Token.ANCHOR: |
| 5198 |
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| 5199 |
break; |
| 5200 |
default: |
| 5201 |
throw ex("parser.factor.5", this.offset); |
| 5202 |
} |
| 5203 |
} |
| 5204 |
// Parses yes/no-patterns. |
| 5205 |
this.next(); |
| 5206 |
Token yesPattern = this.parseRegex(); |
| 5207 |
Token noPattern = null; |
| 5208 |
if (yesPattern.type == Token.UNION) { |
| 5209 |
if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset); |
| 5210 |
noPattern = yesPattern.getChild(1); |
| 5211 |
yesPattern = yesPattern.getChild(0); |
| 5212 |
} |
| 5213 |
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| 5214 |
this.next(); |
| 5215 |
return Token.createCondition(refno, condition, yesPattern, noPattern); |
| 5216 |
} |
| 5217 |
Token processModifiers() throws ParseException { |
| 5218 |
// this.offset points the next of '?'. |
| 5219 |
// modifiers ::= [imsw]* ('-' [imsw]*)? ':' |
| 5220 |
int add = 0, mask = 0, ch = -1; |
| 5221 |
while (this.offset < this.regexlen) { |
| 5222 |
ch = this.regex.charAt(this.offset); |
| 5223 |
int v = REUtil.getOptionValue(ch); |
| 5224 |
if (v == 0) break; // '-' or ':'? |
| 5225 |
add |= v; |
| 5226 |
this.offset ++; |
| 5227 |
} |
| 5228 |
if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); |
| 5229 |
if (ch == '-') { |
| 5230 |
this.offset ++; |
| 5231 |
while (this.offset < this.regexlen) { |
| 5232 |
ch = this.regex.charAt(this.offset); |
| 5233 |
int v = REUtil.getOptionValue(ch); |
| 5234 |
if (v == 0) break; // ':'? |
| 5235 |
mask |= v; |
| 5236 |
this.offset ++; |
| 5237 |
} |
| 5238 |
if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); |
| 5239 |
} |
| 5240 |
Token tok; |
| 5241 |
if (ch == ':') { |
| 5242 |
this.offset ++; |
| 5243 |
this.next(); |
| 5244 |
tok = Token.createModifierGroup(this.parseRegex(), add, mask); |
| 5245 |
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| 5246 |
this.next(); |
| 5247 |
} else if (ch == ')') { // such as (?-i) |
| 5248 |
this.offset ++; |
| 5249 |
this.next(); |
| 5250 |
tok = Token.createModifierGroup(this.parseRegex(), add, mask); |
| 5251 |
} else |
| 5252 |
throw ex("parser.factor.3", this.offset); |
| 5253 |
|
| 5254 |
return tok; |
| 5255 |
} |
| 5256 |
Token processIndependent() throws ParseException { |
| 5257 |
this.next(); |
| 5258 |
Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex()); |
| 5259 |
if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| 5260 |
this.next(); // Skips ')' |
| 5261 |
return tok; |
| 5262 |
} |
| 5263 |
Token processBacksolidus_c() throws ParseException { |
| 5264 |
int ch2; // Must be in 0x0040-0x005f |
| 5265 |
if (this.offset >= this.regexlen |
| 5266 |
|| ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040) |
| 5267 |
throw ex("parser.atom.1", this.offset-1); |
| 5268 |
this.next(); |
| 5269 |
return Token.createChar(ch2-0x40); |
| 5270 |
} |
| 5271 |
Token processBacksolidus_C() throws ParseException { |
| 5272 |
throw ex("parser.process.1", this.offset); |
| 5273 |
} |
| 5274 |
Token processBacksolidus_i() throws ParseException { |
| 5275 |
Token tok = Token.createChar('i'); |
| 5276 |
this.next(); |
| 5277 |
return tok; |
| 5278 |
} |
| 5279 |
Token processBacksolidus_I() throws ParseException { |
| 5280 |
throw ex("parser.process.1", this.offset); |
| 5281 |
} |
| 5282 |
Token processBacksolidus_g() throws ParseException { |
| 5283 |
this.next(); |
| 5284 |
return Token.getGraphemePattern(); |
| 5285 |
} |
| 5286 |
Token processBacksolidus_X() throws ParseException { |
| 5287 |
this.next(); |
| 5288 |
return Token.getCombiningCharacterSequence(); |
| 5289 |
} |
| 5290 |
Token processBackreference() throws ParseException { |
| 5291 |
int refnum = this.chardata-'0'; |
| 5292 |
Token tok = Token.createBackReference(refnum); |
| 5293 |
this.hasBackReferences = true; |
| 5294 |
if (this.references == null) this.references = new Vector(); |
| 5295 |
this.references.addElement(new ReferencePosition(refnum, this.offset-2)); |
| 5296 |
this.next(); |
| 5297 |
return tok; |
| 5298 |
} |
| 5299 |
|
| 5300 |
// ---------------------------------------------------------------- |
| 5301 |
|
| 5302 |
/** |
| 5303 |
* factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' |
| 5304 |
* | atom (('*' | '+' | '?' | minmax ) '?'? )?) |
| 5305 |
* | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' |
| 5306 |
* | '(?#' [^)]* ')' |
| 5307 |
* minmax ::= '{' min (',' max?)? '}' |
| 5308 |
* min ::= [0-9]+ |
| 5309 |
* max ::= [0-9]+ |
| 5310 |
*/ |
| 5311 |
Token parseFactor() throws ParseException { |
| 5312 |
int ch = this.read(); |
| 5313 |
Token tok; |
| 5314 |
switch (ch) { |
| 5315 |
case T_CARET: return this.processCaret(); |
| 5316 |
case T_DOLLAR: return this.processDollar(); |
| 5317 |
case T_LOOKAHEAD: return this.processLookahead(); |
| 5318 |
case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); |
| 5319 |
case T_LOOKBEHIND: return this.processLookbehind(); |
| 5320 |
case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); |
| 5321 |
|
| 5322 |
case T_COMMENT: |
| 5323 |
this.next(); |
| 5324 |
return Token.createEmpty(); |
| 5325 |
|
| 5326 |
case T_BACKSOLIDUS: |
| 5327 |
switch (this.chardata) { |
| 5328 |
case 'A': return this.processBacksolidus_A(); |
| 5329 |
case 'Z': return this.processBacksolidus_Z(); |
| 5330 |
case 'z': return this.processBacksolidus_z(); |
| 5331 |
case 'b': return this.processBacksolidus_b(); |
| 5332 |
case 'B': return this.processBacksolidus_B(); |
| 5333 |
case '<': return this.processBacksolidus_lt(); |
| 5334 |
case '>': return this.processBacksolidus_gt(); |
| 5335 |
} |
| 5336 |
// through down |
| 5337 |
} |
| 5338 |
tok = this.parseAtom(); |
| 5339 |
ch = this.read(); |
| 5340 |
switch (ch) { |
| 5341 |
case T_STAR: return this.processStar(tok); |
| 5342 |
case T_PLUS: return this.processPlus(tok); |
| 5343 |
case T_QUESTION: return this.processQuestion(tok); |
| 5344 |
case T_CHAR: |
| 5345 |
if (this.chardata == '{' && this.offset < this.regexlen) { |
| 5346 |
|
| 5347 |
int off = this.offset; // this.offset -> next of '{' |
| 5348 |
int min = 0, max = -1; |
| 5349 |
|
| 5350 |
if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { |
| 5351 |
|
| 5352 |
min = ch -'0'; |
| 5353 |
while (off < this.regexlen |
| 5354 |
&& (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { |
| 5355 |
min = min*10 +ch-'0'; |
| 5356 |
if (min < 0) |
| 5357 |
throw ex("parser.quantifier.5", this.offset); |
| 5358 |
} |
| 5359 |
} |
| 5360 |
else { |
| 5361 |
throw ex("parser.quantifier.1", this.offset); |
| 5362 |
} |
| 5363 |
|
| 5364 |
max = min; |
| 5365 |
if (ch == ',') { |
| 5366 |
|
| 5367 |
if (off >= this.regexlen) { |
| 5368 |
throw ex("parser.quantifier.3", this.offset); |
| 5369 |
} |
| 5370 |
else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { |
| 5371 |
|
| 5372 |
max = ch -'0'; // {min,max} |
| 5373 |
while (off < this.regexlen |
| 5374 |
&& (ch = this.regex.charAt(off++)) >= '0' |
| 5375 |
&& ch <= '9') { |
| 5376 |
max = max*10 +ch-'0'; |
| 5377 |
if (max < 0) |
| 5378 |
throw ex("parser.quantifier.5", this.offset); |
| 5379 |
} |
| 5380 |
|
| 5381 |
if (min > max) |
| 5382 |
throw ex("parser.quantifier.4", this.offset); |
| 5383 |
} |
| 5384 |
else { // assume {min,} |
| 5385 |
max = -1; |
| 5386 |
} |
| 5387 |
} |
| 5388 |
|
| 5389 |
if (ch != '}') |
| 5390 |
throw ex("parser.quantifier.2", this.offset); |
| 5391 |
|
| 5392 |
if (this.checkQuestion(off)) { // off -> next of '}' |
| 5393 |
tok = Token.createNGClosure(tok); |
| 5394 |
this.offset = off+1; |
| 5395 |
} else { |
| 5396 |
tok = Token.createClosure(tok); |
| 5397 |
this.offset = off; |
| 5398 |
} |
| 5399 |
|
| 5400 |
tok.setMin(min); |
| 5401 |
tok.setMax(max); |
| 5402 |
//System.err.println("CLOSURE: "+min+", "+max); |
| 5403 |
this.next(); |
| 5404 |
} |
| 5405 |
} |
| 5406 |
return tok; |
| 5407 |
} |
| 5408 |
|
| 5409 |
/** |
| 5410 |
* atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] |
| 5411 |
* | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block |
| 5412 |
* | '(?>' regex ')' |
| 5413 |
* char ::= '\\' | '\' [efnrt] | bmp-code | character-1 |
| 5414 |
*/ |
| 5415 |
Token parseAtom() throws ParseException { |
| 5416 |
int ch = this.read(); |
| 5417 |
Token tok = null; |
| 5418 |
switch (ch) { |
| 5419 |
case T_LPAREN: return this.processParen(); |
| 5420 |
case T_LPAREN2: return this.processParen2(); // '(?:' |
| 5421 |
case T_CONDITION: return this.processCondition(); // '(?(' |
| 5422 |
case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... ) |
| 5423 |
case T_INDEPENDENT: return this.processIndependent(); |
| 5424 |
case T_DOT: |
| 5425 |
this.next(); // Skips '.' |
| 5426 |
tok = Token.token_dot; |
| 5427 |
break; |
| 5428 |
|
| 5429 |
/** |
| 5430 |
* char-class ::= '[' ( '^'? range ','?)+ ']' |
| 5431 |
* range ::= '\d' | '\w' | '\s' | category-block | range-char |
| 5432 |
* | range-char '-' range-char |
| 5433 |
* range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 |
| 5434 |
* bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] |
| 5435 |
*/ |
| 5436 |
case T_LBRACKET: return this.parseCharacterClass(true); |
| 5437 |
case T_SET_OPERATIONS: return this.parseSetOperations(); |
| 5438 |
|
| 5439 |
case T_BACKSOLIDUS: |
| 5440 |
switch (this.chardata) { |
| 5441 |
case 'd': case 'D': |
| 5442 |
case 'w': case 'W': |
| 5443 |
case 's': case 'S': |
| 5444 |
tok = this.getTokenForShorthand(this.chardata); |
| 5445 |
this.next(); |
| 5446 |
return tok; |
| 5447 |
|
| 5448 |
case 'e': case 'f': case 'n': case 'r': |
| 5449 |
case 't': case 'u': case 'v': case 'x': |
| 5450 |
{ |
| 5451 |
int ch2 = this.decodeEscaped(); |
| 5452 |
if (ch2 < 0x10000) { |
| 5453 |
tok = Token.createChar(ch2); |
| 5454 |
} else { |
| 5455 |
tok = Token.createString(REUtil.decomposeToSurrogates(ch2)); |
| 5456 |
} |
| 5457 |
} |
| 5458 |
break; |
| 5459 |
|
| 5460 |
case 'c': return this.processBacksolidus_c(); |
| 5461 |
case 'C': return this.processBacksolidus_C(); |
| 5462 |
case 'i': return this.processBacksolidus_i(); |
| 5463 |
case 'I': return this.processBacksolidus_I(); |
| 5464 |
case 'g': return this.processBacksolidus_g(); |
| 5465 |
case 'X': return this.processBacksolidus_X(); |
| 5466 |
case '1': case '2': case '3': case '4': |
| 5467 |
case '5': case '6': case '7': case '8': case '9': |
| 5468 |
return this.processBackreference(); |
| 5469 |
|
| 5470 |
case 'P': |
| 5471 |
case 'p': |
| 5472 |
int pstart = this.offset; |
| 5473 |
tok = processBacksolidus_pP(this.chardata); |
| 5474 |
if (tok == null) throw this.ex("parser.atom.5", pstart); |
| 5475 |
break; |
| 5476 |
|
| 5477 |
default: |
| 5478 |
tok = Token.createChar(this.chardata); |
| 5479 |
} |
| 5480 |
this.next(); |
| 5481 |
break; |
| 5482 |
|
| 5483 |
case T_CHAR: |
| 5484 |
if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}') |
| 5485 |
throw this.ex("parser.atom.4", this.offset-1); |
| 5486 |
tok = Token.createChar(this.chardata); |
| 5487 |
int high = this.chardata; |
| 5488 |
this.next(); |
| 5489 |
if (REUtil.isHighSurrogate(high) |
| 5490 |
&& this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) { |
| 5491 |
char[] sur = new char[2]; |
| 5492 |
sur[0] = (char)high; |
| 5493 |
sur[1] = (char)this.chardata; |
| 5494 |
tok = Token.createParen(Token.createString(new String(sur)), 0); |
| 5495 |
this.next(); |
| 5496 |
} |
| 5497 |
break; |
| 5498 |
|
| 5499 |
default: |
| 5500 |
throw this.ex("parser.atom.4", this.offset-1); |
| 5501 |
} |
| 5502 |
return tok; |
| 5503 |
} |
| 5504 |
|
| 5505 |
protected RangeToken processBacksolidus_pP(int c) throws ParseException { |
| 5506 |
|
| 5507 |
this.next(); |
| 5508 |
if (this.read() != T_CHAR || this.chardata != '{') |
| 5509 |
throw this.ex("parser.atom.2", this.offset-1); |
| 5510 |
|
| 5511 |
// handle category escape |
| 5512 |
boolean positive = c == 'p'; |
| 5513 |
int namestart = this.offset; |
| 5514 |
int nameend = this.regex.indexOf('}', namestart); |
| 5515 |
|
| 5516 |
if (nameend < 0) |
| 5517 |
throw this.ex("parser.atom.3", this.offset); |
| 5518 |
|
| 5519 |
String pname = this.regex.substring(namestart, nameend); |
| 5520 |
this.offset = nameend+1; |
| 5521 |
|
| 5522 |
return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE)); |
| 5523 |
} |
| 5524 |
|
| 5525 |
int processCIinCharacterClass(RangeToken tok, int c) { |
| 5526 |
return this.decodeEscaped(); |
| 5527 |
} |
| 5528 |
|
| 5529 |
/** |
| 5530 |
* char-class ::= '[' ( '^'? range ','?)+ ']' |
| 5531 |
* range ::= '\d' | '\w' | '\s' | category-block | range-char |
| 5532 |
* | range-char '-' range-char |
| 5533 |
* range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 |
| 5534 |
* bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] |
| 5535 |
*/ |
| 5536 |
protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException { |
| 5537 |
this.setContext(S_INBRACKETS); |
| 5538 |
this.next(); // '[' |
| 5539 |
boolean nrange = false; |
| 5540 |
RangeToken base = null; |
| 5541 |
RangeToken tok; |
| 5542 |
if (this.read() == T_CHAR && this.chardata == '^') { |
| 5543 |
nrange = true; |
| 5544 |
this.next(); // '^' |
| 5545 |
if (useNrange) { |
| 5546 |
tok = Token.createNRange(); |
| 5547 |
} else { |
| 5548 |
base = Token.createRange(); |
| 5549 |
base.addRange(0, Token.UTF16_MAX); |
| 5550 |
tok = Token.createRange(); |
| 5551 |
} |
| 5552 |
} else { |
| 5553 |
tok = Token.createRange(); |
| 5554 |
} |
| 5555 |
int type; |
| 5556 |
boolean firstloop = true; |
| 5557 |
while ((type = this.read()) != T_EOF) { |
| 5558 |
if (type == T_CHAR && this.chardata == ']' && !firstloop) |
| 5559 |
break; |
| 5560 |
firstloop = false; |
| 5561 |
int c = this.chardata; |
| 5562 |
boolean end = false; |
| 5563 |
if (type == T_BACKSOLIDUS) { |
| 5564 |
switch (c) { |
| 5565 |
case 'd': case 'D': |
| 5566 |
case 'w': case 'W': |
| 5567 |
case 's': case 'S': |
| 5568 |
tok.mergeRanges(this.getTokenForShorthand(c)); |
| 5569 |
end = true; |
| 5570 |
break; |
| 5571 |
|
| 5572 |
case 'i': case 'I': |
| 5573 |
case 'c': case 'C': |
| 5574 |
c = this.processCIinCharacterClass(tok, c); |
| 5575 |
if (c < 0) end = true; |
| 5576 |
break; |
| 5577 |
|
| 5578 |
case 'p': |
| 5579 |
case 'P': |
| 5580 |
int pstart = this.offset; |
| 5581 |
RangeToken tok2 = this.processBacksolidus_pP(c); |
| 5582 |
if (tok2 == null) throw this.ex("parser.atom.5", pstart); |
| 5583 |
tok.mergeRanges(tok2); |
| 5584 |
end = true; |
| 5585 |
break; |
| 5586 |
|
| 5587 |
default: |
| 5588 |
c = this.decodeEscaped(); |
| 5589 |
} // \ + c |
| 5590 |
} // backsolidus |
| 5591 |
// POSIX Character class such as [:alnum:] |
| 5592 |
else if (type == T_POSIX_CHARCLASS_START) { |
| 5593 |
int nameend = this.regex.indexOf(':', this.offset); |
| 5594 |
if (nameend < 0) throw this.ex("parser.cc.1", this.offset); |
| 5595 |
boolean positive = true; |
| 5596 |
if (this.regex.charAt(this.offset) == '^') { |
| 5597 |
this.offset ++; |
| 5598 |
positive = false; |
| 5599 |
} |
| 5600 |
String name = this.regex.substring(this.offset, nameend); |
| 5601 |
RangeToken range = Token.getRange(name, positive, |
| 5602 |
this.isSet(RegularExpression.XMLSCHEMA_MODE)); |
| 5603 |
if (range == null) throw this.ex("parser.cc.3", this.offset); |
| 5604 |
tok.mergeRanges(range); |
| 5605 |
end = true; |
| 5606 |
if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']') |
| 5607 |
throw this.ex("parser.cc.1", nameend); |
| 5608 |
this.offset = nameend+2; |
| 5609 |
} |
| 5610 |
this.next(); |
| 5611 |
if (!end) { // if not shorthands... |
| 5612 |
if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'. |
| 5613 |
tok.addRange(c, c); |
| 5614 |
} else { |
| 5615 |
this.next(); // Skips '-' |
| 5616 |
if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset); |
| 5617 |
if (type == T_CHAR && this.chardata == ']') { |
| 5618 |
tok.addRange(c, c); |
| 5619 |
tok.addRange('-', '-'); |
| 5620 |
} else { |
| 5621 |
int rangeend = this.chardata; |
| 5622 |
if (type == T_BACKSOLIDUS) |
| 5623 |
rangeend = this.decodeEscaped(); |
| 5624 |
this.next(); |
| 5625 |
tok.addRange(c, rangeend); |
| 5626 |
} |
| 5627 |
} |
| 5628 |
} |
| 5629 |
if (this.isSet(RegularExpression.SPECIAL_COMMA) |
| 5630 |
&& this.read() == T_CHAR && this.chardata == ',') |
| 5631 |
this.next(); |
| 5632 |
} |
| 5633 |
if (this.read() == T_EOF) |
| 5634 |
throw this.ex("parser.cc.2", this.offset); |
| 5635 |
if (!useNrange && nrange) { |
| 5636 |
base.subtractRanges(tok); |
| 5637 |
tok = base; |
| 5638 |
} |
| 5639 |
tok.sortRanges(); |
| 5640 |
tok.compactRanges(); |
| 5641 |
//tok.dumpRanges(); |
| 5642 |
/* |
| 5643 |
if (this.isSet(RegularExpression.IGNORE_CASE)) |
| 5644 |
tok = RangeToken.createCaseInsensitiveToken(tok); |
| 5645 |
*/ |
| 5646 |
this.setContext(S_NORMAL); |
| 5647 |
this.next(); // Skips ']' |
| 5648 |
|
| 5649 |
return tok; |
| 5650 |
} |
| 5651 |
|
| 5652 |
/** |
| 5653 |
* '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')' |
| 5654 |
*/ |
| 5655 |
protected RangeToken parseSetOperations() throws ParseException { |
| 5656 |
RangeToken tok = this.parseCharacterClass(false); |
| 5657 |
int type; |
| 5658 |
while ((type = this.read()) != T_RPAREN) { |
| 5659 |
int ch = this.chardata; |
| 5660 |
if (type == T_CHAR && (ch == '-' || ch == '&') |
| 5661 |
|| type == T_PLUS) { |
| 5662 |
this.next(); |
| 5663 |
if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1); |
| 5664 |
RangeToken t2 = this.parseCharacterClass(false); |
| 5665 |
if (type == T_PLUS) |
| 5666 |
tok.mergeRanges(t2); |
| 5667 |
else if (ch == '-') |
| 5668 |
tok.subtractRanges(t2); |
| 5669 |
else if (ch == '&') |
| 5670 |
tok.intersectRanges(t2); |
| 5671 |
else |
| 5672 |
throw new RuntimeException("ASSERT"); |
| 5673 |
} else { |
| 5674 |
throw ex("parser.ope.2", this.offset-1); |
| 5675 |
} |
| 5676 |
} |
| 5677 |
this.next(); |
| 5678 |
return tok; |
| 5679 |
} |
| 5680 |
|
| 5681 |
Token getTokenForShorthand(int ch) { |
| 5682 |
Token tok; |
| 5683 |
switch (ch) { |
| 5684 |
case 'd': |
| 5685 |
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| 5686 |
? Token.getRange("Nd", true) : Token.token_0to9; |
| 5687 |
break; |
| 5688 |
case 'D': |
| 5689 |
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| 5690 |
? Token.getRange("Nd", false) : Token.token_not_0to9; |
| 5691 |
break; |
| 5692 |
case 'w': |
| 5693 |
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| 5694 |
? Token.getRange("IsWord", true) : Token.token_wordchars; |
| 5695 |
break; |
| 5696 |
case 'W': |
| 5697 |
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| 5698 |
? Token.getRange("IsWord", false) : Token.token_not_wordchars; |
| 5699 |
break; |
| 5700 |
case 's': |
| 5701 |
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| 5702 |
? Token.getRange("IsSpace", true) : Token.token_spaces; |
| 5703 |
break; |
| 5704 |
case 'S': |
| 5705 |
tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| 5706 |
? Token.getRange("IsSpace", false) : Token.token_not_spaces; |
| 5707 |
break; |
| 5708 |
|
| 5709 |
default: |
| 5710 |
throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16)); |
| 5711 |
} |
| 5712 |
return tok; |
| 5713 |
} |
| 5714 |
|
| 5715 |
/** |
| 5716 |
*/ |
| 5717 |
int decodeEscaped() throws ParseException { |
| 5718 |
if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1); |
| 5719 |
int c = this.chardata; |
| 5720 |
switch (c) { |
| 5721 |
case 'e': c = 0x1b; break; // ESCAPE U+001B |
| 5722 |
case 'f': c = '\f'; break; // FORM FEED U+000C |
| 5723 |
case 'n': c = '\n'; break; // LINE FEED U+000A |
| 5724 |
case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D |
| 5725 |
case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009 |
| 5726 |
//case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B |
| 5727 |
case 'x': |
| 5728 |
this.next(); |
| 5729 |
if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); |
| 5730 |
if (this.chardata == '{') { |
| 5731 |
int v1 = 0; |
| 5732 |
int uv = 0; |
| 5733 |
do { |
| 5734 |
this.next(); |
| 5735 |
if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); |
| 5736 |
if ((v1 = hexChar(this.chardata)) < 0) |
| 5737 |
break; |
| 5738 |
if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); |
| 5739 |
uv = uv*16+v1; |
| 5740 |
} while (true); |
| 5741 |
if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1); |
| 5742 |
if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); |
| 5743 |
c = uv; |
| 5744 |
} else { |
| 5745 |
int v1 = 0; |
| 5746 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5747 |
throw ex("parser.descape.1", this.offset-1); |
| 5748 |
int uv = v1; |
| 5749 |
this.next(); |
| 5750 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5751 |
throw ex("parser.descape.1", this.offset-1); |
| 5752 |
uv = uv*16+v1; |
| 5753 |
c = uv; |
| 5754 |
} |
| 5755 |
break; |
| 5756 |
|
| 5757 |
case 'u': |
| 5758 |
int v1 = 0; |
| 5759 |
this.next(); |
| 5760 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5761 |
throw ex("parser.descape.1", this.offset-1); |
| 5762 |
int uv = v1; |
| 5763 |
this.next(); |
| 5764 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5765 |
throw ex("parser.descape.1", this.offset-1); |
| 5766 |
uv = uv*16+v1; |
| 5767 |
this.next(); |
| 5768 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5769 |
throw ex("parser.descape.1", this.offset-1); |
| 5770 |
uv = uv*16+v1; |
| 5771 |
this.next(); |
| 5772 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5773 |
throw ex("parser.descape.1", this.offset-1); |
| 5774 |
uv = uv*16+v1; |
| 5775 |
c = uv; |
| 5776 |
break; |
| 5777 |
|
| 5778 |
case 'v': |
| 5779 |
this.next(); |
| 5780 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5781 |
throw ex("parser.descape.1", this.offset-1); |
| 5782 |
uv = v1; |
| 5783 |
this.next(); |
| 5784 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5785 |
throw ex("parser.descape.1", this.offset-1); |
| 5786 |
uv = uv*16+v1; |
| 5787 |
this.next(); |
| 5788 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5789 |
throw ex("parser.descape.1", this.offset-1); |
| 5790 |
uv = uv*16+v1; |
| 5791 |
this.next(); |
| 5792 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5793 |
throw ex("parser.descape.1", this.offset-1); |
| 5794 |
uv = uv*16+v1; |
| 5795 |
this.next(); |
| 5796 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5797 |
throw ex("parser.descape.1", this.offset-1); |
| 5798 |
uv = uv*16+v1; |
| 5799 |
this.next(); |
| 5800 |
if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| 5801 |
throw ex("parser.descape.1", this.offset-1); |
| 5802 |
uv = uv*16+v1; |
| 5803 |
if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1); |
| 5804 |
c = uv; |
| 5805 |
break; |
| 5806 |
case 'A': |
| 5807 |
case 'Z': |
| 5808 |
case 'z': |
| 5809 |
throw ex("parser.descape.5", this.offset-2); |
| 5810 |
default: |
| 5811 |
} |
| 5812 |
return c; |
| 5813 |
} |
| 5814 |
|
| 5815 |
static private final int hexChar(int ch) { |
| 5816 |
if (ch < '0') return -1; |
| 5817 |
if (ch > 'f') return -1; |
| 5818 |
if (ch <= '9') return ch-'0'; |
| 5819 |
if (ch < 'A') return -1; |
| 5820 |
if (ch <= 'F') return ch-'A'+10; |
| 5821 |
if (ch < 'a') return -1; |
| 5822 |
return ch-'a'+10; |
| 5823 |
} |
| 5824 |
} |
| 5825 |
|
| 5826 |
|
| 5827 |
static class Token implements java.io.Serializable { |
| 5828 |
static final boolean COUNTTOKENS = true; |
| 5829 |
static int tokens = 0; |
| 5830 |
|
| 5831 |
static final int CHAR = 0; // Literal char |
| 5832 |
static final int DOT = 11; // . |
| 5833 |
static final int CONCAT = 1; // XY |
| 5834 |
static final int UNION = 2; // X|Y|Z |
| 5835 |
static final int CLOSURE = 3; // X* |
| 5836 |
static final int RANGE = 4; // [a-zA-Z] etc. |
| 5837 |
static final int NRANGE = 5; // [^a-zA-Z] etc. |
| 5838 |
static final int PAREN = 6; // (X) or (?:X) |
| 5839 |
static final int EMPTY = 7; // |
| 5840 |
static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z |
| 5841 |
static final int NONGREEDYCLOSURE = 9; // *? +? |
| 5842 |
static final int STRING = 10; // strings |
| 5843 |
static final int BACKREFERENCE = 12; // back references |
| 5844 |
static final int LOOKAHEAD = 20; // (?=...) |
| 5845 |
static final int NEGATIVELOOKAHEAD = 21; // (?!...) |
| 5846 |
static final int LOOKBEHIND = 22; // (?<=...) |
| 5847 |
static final int NEGATIVELOOKBEHIND = 23; // (?<!...) |
| 5848 |
static final int INDEPENDENT = 24; // (?>...) |
| 5849 |
static final int MODIFIERGROUP = 25; // (?ims-ims:...) |
| 5850 |
static final int CONDITION = 26; // (?(...)yes|no) |
| 5851 |
|
| 5852 |
static final int UTF16_MAX = 0x10ffff; |
| 5853 |
|
| 5854 |
int type; |
| 5855 |
|
| 5856 |
static Token token_dot; |
| 5857 |
static Token token_0to9; |
| 5858 |
static Token token_wordchars; |
| 5859 |
static Token token_not_0to9; |
| 5860 |
static Token token_not_wordchars; |
| 5861 |
static Token token_spaces; |
| 5862 |
static Token token_not_spaces; |
| 5863 |
static Token token_empty; |
| 5864 |
static Token token_linebeginning; |
| 5865 |
static Token token_linebeginning2; |
| 5866 |
static Token token_lineend; |
| 5867 |
static Token token_stringbeginning; |
| 5868 |
static Token token_stringend; |
| 5869 |
static Token token_stringend2; |
| 5870 |
static Token token_wordedge; |
| 5871 |
static Token token_not_wordedge; |
| 5872 |
static Token token_wordbeginning; |
| 5873 |
static Token token_wordend; |
| 5874 |
static { |
| 5875 |
Token.token_empty = new Token(Token.EMPTY); |
| 5876 |
|
| 5877 |
Token.token_linebeginning = Token.createAnchor('^'); |
| 5878 |
Token.token_linebeginning2 = Token.createAnchor('@'); |
| 5879 |
Token.token_lineend = Token.createAnchor('$'); |
| 5880 |
Token.token_stringbeginning = Token.createAnchor('A'); |
| 5881 |
Token.token_stringend = Token.createAnchor('z'); |
| 5882 |
Token.token_stringend2 = Token.createAnchor('Z'); |
| 5883 |
Token.token_wordedge = Token.createAnchor('b'); |
| 5884 |
Token.token_not_wordedge = Token.createAnchor('B'); |
| 5885 |
Token.token_wordbeginning = Token.createAnchor('<'); |
| 5886 |
Token.token_wordend = Token.createAnchor('>'); |
| 5887 |
|
| 5888 |
Token.token_dot = new Token(Token.DOT); |
| 5889 |
|
| 5890 |
Token.token_0to9 = Token.createRange(); |
| 5891 |
Token.token_0to9.addRange('0', '9'); |
| 5892 |
Token.token_wordchars = Token.createRange(); |
| 5893 |
Token.token_wordchars.addRange('0', '9'); |
| 5894 |
Token.token_wordchars.addRange('A', 'Z'); |
| 5895 |
Token.token_wordchars.addRange('_', '_'); |
| 5896 |
Token.token_wordchars.addRange('a', 'z'); |
| 5897 |
Token.token_spaces = Token.createRange(); |
| 5898 |
Token.token_spaces.addRange('\t', '\t'); |
| 5899 |
Token.token_spaces.addRange('\n', '\n'); |
| 5900 |
Token.token_spaces.addRange('\f', '\f'); |
| 5901 |
Token.token_spaces.addRange('\r', '\r'); |
| 5902 |
Token.token_spaces.addRange(' ', ' '); |
| 5903 |
|
| 5904 |
Token.token_not_0to9 = Token.complementRanges(Token.token_0to9); |
| 5905 |
Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars); |
| 5906 |
Token.token_not_spaces = Token.complementRanges(Token.token_spaces); |
| 5907 |
} |
| 5908 |
|
| 5909 |
static Token.ParenToken createLook(int type, Token child) { |
| 5910 |
if (COUNTTOKENS) Token.tokens ++; |
| 5911 |
return new Token.ParenToken(type, child, 0); |
| 5912 |
} |
| 5913 |
static Token.ParenToken createParen(Token child, int pnumber) { |
| 5914 |
if (COUNTTOKENS) Token.tokens ++; |
| 5915 |
return new Token.ParenToken(Token.PAREN, child, pnumber); |
| 5916 |
} |
| 5917 |
static Token.ClosureToken createClosure(Token tok) { |
| 5918 |
if (COUNTTOKENS) Token.tokens ++; |
| 5919 |
return new Token.ClosureToken(Token.CLOSURE, tok); |
| 5920 |
} |
| 5921 |
static Token.ClosureToken createNGClosure(Token tok) { |
| 5922 |
if (COUNTTOKENS) Token.tokens ++; |
| 5923 |
return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok); |
| 5924 |
} |
| 5925 |
static Token.ConcatToken createConcat(Token tok1, Token tok2) { |
| 5926 |
if (COUNTTOKENS) Token.tokens ++; |
| 5927 |
return new Token.ConcatToken(tok1, tok2); |
| 5928 |
} |
| 5929 |
static Token.UnionToken createConcat() { |
| 5930 |
if (COUNTTOKENS) Token.tokens ++; |
| 5931 |
return new Token.UnionToken(Token.CONCAT); // *** It is not a bug. |
| 5932 |
} |
| 5933 |
static Token.UnionToken createUnion() { |
| 5934 |
if (COUNTTOKENS) Token.tokens ++; |
| 5935 |
return new Token.UnionToken(Token.UNION); |
| 5936 |
} |
| 5937 |
static Token createEmpty() { |
| 5938 |
return Token.token_empty; |
| 5939 |
} |
| 5940 |
static RangeToken createRange() { |
| 5941 |
if (COUNTTOKENS) Token.tokens ++; |
| 5942 |
return new RangeToken(Token.RANGE); |
| 5943 |
} |
| 5944 |
static RangeToken createNRange() { |
| 5945 |
if (COUNTTOKENS) Token.tokens ++; |
| 5946 |
return new RangeToken(Token.NRANGE); |
| 5947 |
} |
| 5948 |
static Token.CharToken createChar(int ch) { |
| 5949 |
if (COUNTTOKENS) Token.tokens ++; |
| 5950 |
return new Token.CharToken(Token.CHAR, ch); |
| 5951 |
} |
| 5952 |
static private Token.CharToken createAnchor(int ch) { |
| 5953 |
if (COUNTTOKENS) Token.tokens ++; |
| 5954 |
return new Token.CharToken(Token.ANCHOR, ch); |
| 5955 |
} |
| 5956 |
static Token.StringToken createBackReference(int refno) { |
| 5957 |
if (COUNTTOKENS) Token.tokens ++; |
| 5958 |
return new Token.StringToken(Token.BACKREFERENCE, null, refno); |
| 5959 |
} |
| 5960 |
static Token.StringToken createString(String str) { |
| 5961 |
if (COUNTTOKENS) Token.tokens ++; |
| 5962 |
return new Token.StringToken(Token.STRING, str, 0); |
| 5963 |
} |
| 5964 |
static Token.ModifierToken createModifierGroup(Token child, int add, int mask) { |
| 5965 |
if (COUNTTOKENS) Token.tokens ++; |
| 5966 |
return new Token.ModifierToken(child, add, mask); |
| 5967 |
} |
| 5968 |
static Token.ConditionToken createCondition(int refno, Token condition, |
| 5969 |
Token yespat, Token nopat) { |
| 5970 |
if (COUNTTOKENS) Token.tokens ++; |
| 5971 |
return new Token.ConditionToken(refno, condition, yespat, nopat); |
| 5972 |
} |
| 5973 |
|
| 5974 |
/**
 * Creates a token of the given kind.
 *
 * @param type value for the type field; stored as is, without validation
 */
protected Token(int type) {
    this.type = type;
}
| 5977 |
|
| 5978 |
/** |
| 5979 |
* A number of children. |
| 5980 |
*/ |
| 5981 |
int size() { |
| 5982 |
return 0; |
| 5983 |
} |
| 5984 |
Token getChild(int index) { |
| 5985 |
return null; |
| 5986 |
} |
| 5987 |
void addChild(Token tok) { |
| 5988 |
throw new RuntimeException("Not supported."); |
| 5989 |
} |
| 5990 |
|
| 5991 |
// for RANGE or NRANGE |
| 5992 |
protected void addRange(int start, int end) { |
| 5993 |
throw new RuntimeException("Not supported."); |
| 5994 |
} |
| 5995 |
protected void sortRanges() { |
| 5996 |
throw new RuntimeException("Not supported."); |
| 5997 |
} |
| 5998 |
protected void compactRanges() { |
| 5999 |
throw new RuntimeException("Not supported."); |
| 6000 |
} |
| 6001 |
protected void mergeRanges(Token tok) { |
| 6002 |
throw new RuntimeException("Not supported."); |
| 6003 |
} |
| 6004 |
protected void subtractRanges(Token tok) { |
| 6005 |
throw new RuntimeException("Not supported."); |
| 6006 |
} |
| 6007 |
protected void intersectRanges(Token tok) { |
| 6008 |
throw new RuntimeException("Not supported."); |
| 6009 |
} |
| 6010 |
static Token complementRanges(Token tok) { |
| 6011 |
return RangeToken.complementRanges(tok); |
| 6012 |
} |
| 6013 |
|
| 6014 |
|
| 6015 |
void setMin(int min) { // for CLOSURE |
| 6016 |
} |
| 6017 |
void setMax(int max) { // for CLOSURE |
| 6018 |
} |
| 6019 |
int getMin() { // for CLOSURE |
| 6020 |
return -1; |
| 6021 |
} |
| 6022 |
int getMax() { // for CLOSURE |
| 6023 |
return -1; |
| 6024 |
} |
| 6025 |
int getReferenceNumber() { // for STRING |
| 6026 |
return 0; |
| 6027 |
} |
| 6028 |
String getString() { // for STRING |
| 6029 |
return null; |
| 6030 |
} |
| 6031 |
|
| 6032 |
int getParenNumber() { |
| 6033 |
return 0; |
| 6034 |
} |
| 6035 |
int getChar() { |
| 6036 |
return -1; |
| 6037 |
} |
| 6038 |
|
| 6039 |
public String toString() { |
| 6040 |
return this.toString(0); |
| 6041 |
} |
| 6042 |
public String toString(int options) { |
| 6043 |
return this.type == Token.DOT ? "." : ""; |
| 6044 |
} |
| 6045 |
|
| 6046 |
/** |
| 6047 |
* How many characters are needed? |
| 6048 |
*/ |
| 6049 |
final int getMinLength() { |
| 6050 |
switch (this.type) { |
| 6051 |
case CONCAT: |
| 6052 |
int sum = 0; |
| 6053 |
for (int i = 0; i < this.size(); i ++) |
| 6054 |
sum += this.getChild(i).getMinLength(); |
| 6055 |
return sum; |
| 6056 |
|
| 6057 |
case CONDITION: |
| 6058 |
case UNION: |
| 6059 |
if (this.size() == 0) |
| 6060 |
return 0; |
| 6061 |
int ret = this.getChild(0).getMinLength(); |
| 6062 |
for (int i = 1; i < this.size(); i ++) { |
| 6063 |
int min = this.getChild(i).getMinLength(); |
| 6064 |
if (min < ret) ret = min; |
| 6065 |
} |
| 6066 |
return ret; |
| 6067 |
|
| 6068 |
case CLOSURE: |
| 6069 |
case NONGREEDYCLOSURE: |
| 6070 |
if (this.getMin() >= 0) |
| 6071 |
return this.getMin() * this.getChild(0).getMinLength(); |
| 6072 |
return 0; |
| 6073 |
|
| 6074 |
case EMPTY: |
| 6075 |
case ANCHOR: |
| 6076 |
return 0; |
| 6077 |
|
| 6078 |
case DOT: |
| 6079 |
case CHAR: |
| 6080 |
case RANGE: |
| 6081 |
case NRANGE: |
| 6082 |
return 1; |
| 6083 |
|
| 6084 |
case INDEPENDENT: |
| 6085 |
case PAREN: |
| 6086 |
case MODIFIERGROUP: |
| 6087 |
return this.getChild(0).getMinLength(); |
| 6088 |
|
| 6089 |
case BACKREFERENCE: |
| 6090 |
return 0; // ******* |
| 6091 |
|
| 6092 |
case STRING: |
| 6093 |
return this.getString().length(); |
| 6094 |
|
| 6095 |
case LOOKAHEAD: |
| 6096 |
case NEGATIVELOOKAHEAD: |
| 6097 |
case LOOKBEHIND: |
| 6098 |
case NEGATIVELOOKBEHIND: |
| 6099 |
return 0; // ***** Really? |
| 6100 |
|
| 6101 |
default: |
| 6102 |
throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type); |
| 6103 |
} |
| 6104 |
} |
| 6105 |
|
| 6106 |
final int getMaxLength() { |
| 6107 |
switch (this.type) { |
| 6108 |
case CONCAT: |
| 6109 |
int sum = 0; |
| 6110 |
for (int i = 0; i < this.size(); i ++) { |
| 6111 |
int d = this.getChild(i).getMaxLength(); |
| 6112 |
if (d < 0) return -1; |
| 6113 |
sum += d; |
| 6114 |
} |
| 6115 |
return sum; |
| 6116 |
|
| 6117 |
case CONDITION: |
| 6118 |
case UNION: |
| 6119 |
if (this.size() == 0) |
| 6120 |
return 0; |
| 6121 |
int ret = this.getChild(0).getMaxLength(); |
| 6122 |
for (int i = 1; ret >= 0 && i < this.size(); i ++) { |
| 6123 |
int max = this.getChild(i).getMaxLength(); |
| 6124 |
if (max < 0) { // infinity |
| 6125 |
ret = -1; |
| 6126 |
break; |
| 6127 |
} |
| 6128 |
if (max > ret) ret = max; |
| 6129 |
} |
| 6130 |
return ret; |
| 6131 |
|
| 6132 |
case CLOSURE: |
| 6133 |
case NONGREEDYCLOSURE: |
| 6134 |
if (this.getMax() >= 0) |
| 6135 |
// When this.child.getMaxLength() < 0, |
| 6136 |
// this returns minus value |
| 6137 |
return this.getMax() * this.getChild(0).getMaxLength(); |
| 6138 |
return -1; |
| 6139 |
|
| 6140 |
case EMPTY: |
| 6141 |
case ANCHOR: |
| 6142 |
return 0; |
| 6143 |
|
| 6144 |
case CHAR: |
| 6145 |
return 1; |
| 6146 |
case DOT: |
| 6147 |
case RANGE: |
| 6148 |
case NRANGE: |
| 6149 |
return 2; |
| 6150 |
|
| 6151 |
case INDEPENDENT: |
| 6152 |
case PAREN: |
| 6153 |
case MODIFIERGROUP: |
| 6154 |
return this.getChild(0).getMaxLength(); |
| 6155 |
|
| 6156 |
case BACKREFERENCE: |
| 6157 |
return -1; // ****** |
| 6158 |
|
| 6159 |
case STRING: |
| 6160 |
return this.getString().length(); |
| 6161 |
|
| 6162 |
case LOOKAHEAD: |
| 6163 |
case NEGATIVELOOKAHEAD: |
| 6164 |
case LOOKBEHIND: |
| 6165 |
case NEGATIVELOOKBEHIND: |
| 6166 |
return 0; // ***** Really? |
| 6167 |
|
| 6168 |
default: |
| 6169 |
throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type); |
| 6170 |
} |
| 6171 |
} |
| 6172 |
|
| 6173 |
static final int FC_CONTINUE = 0; |
| 6174 |
static final int FC_TERMINAL = 1; |
| 6175 |
static final int FC_ANY = 2; |
| 6176 |
private static final boolean isSet(int options, int flag) { |
| 6177 |
return (options & flag) == flag; |
| 6178 |
} |
| 6179 |
final int analyzeFirstCharacter(RangeToken result, int options) { |
| 6180 |
switch (this.type) { |
| 6181 |
case CONCAT: |
| 6182 |
int ret = FC_CONTINUE; |
| 6183 |
for (int i = 0; i < this.size(); i ++) |
| 6184 |
if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) |
| 6185 |
break; |
| 6186 |
return ret; |
| 6187 |
|
| 6188 |
case UNION: |
| 6189 |
if (this.size() == 0) |
| 6190 |
return FC_CONTINUE; |
| 6191 |
/* |
| 6192 |
* a|b|c -> FC_TERMINAL |
| 6193 |
* a|.|c -> FC_ANY |
| 6194 |
* a|b| -> FC_CONTINUE |
| 6195 |
*/ |
| 6196 |
int ret2 = FC_CONTINUE; |
| 6197 |
boolean hasEmpty = false; |
| 6198 |
for (int i = 0; i < this.size(); i ++) { |
| 6199 |
ret2 = this.getChild(i).analyzeFirstCharacter(result, options); |
| 6200 |
if (ret2 == FC_ANY) |
| 6201 |
break; |
| 6202 |
else if (ret2 == FC_CONTINUE) |
| 6203 |
hasEmpty = true; |
| 6204 |
} |
| 6205 |
return hasEmpty ? FC_CONTINUE : ret2; |
| 6206 |
|
| 6207 |
case CONDITION: |
| 6208 |
int ret3 = this.getChild(0).analyzeFirstCharacter(result, options); |
| 6209 |
if (this.size() == 1) return FC_CONTINUE; |
| 6210 |
if (ret3 == FC_ANY) return ret3; |
| 6211 |
int ret4 = this.getChild(1).analyzeFirstCharacter(result, options); |
| 6212 |
if (ret4 == FC_ANY) return ret4; |
| 6213 |
return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL; |
| 6214 |
|
| 6215 |
case CLOSURE: |
| 6216 |
case NONGREEDYCLOSURE: |
| 6217 |
this.getChild(0).analyzeFirstCharacter(result, options); |
| 6218 |
return FC_CONTINUE; |
| 6219 |
|
| 6220 |
case EMPTY: |
| 6221 |
case ANCHOR: |
| 6222 |
return FC_CONTINUE; |
| 6223 |
|
| 6224 |
case CHAR: |
| 6225 |
int ch = this.getChar(); |
| 6226 |
result.addRange(ch, ch); |
| 6227 |
if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { |
| 6228 |
ch = Character.toUpperCase((char)ch); |
| 6229 |
result.addRange(ch, ch); |
| 6230 |
ch = Character.toLowerCase((char)ch); |
| 6231 |
result.addRange(ch, ch); |
| 6232 |
} |
| 6233 |
return FC_TERMINAL; |
| 6234 |
|
| 6235 |
case DOT: // **** |
| 6236 |
if (isSet(options, RegularExpression.SINGLE_LINE)) { |
| 6237 |
return FC_CONTINUE; // **** We can not optimize. |
| 6238 |
} else { |
| 6239 |
return FC_CONTINUE; |
| 6240 |
/* |
| 6241 |
result.addRange(0, RegularExpression.LINE_FEED-1); |
| 6242 |
result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1); |
| 6243 |
result.addRange(RegularExpression.CARRIAGE_RETURN+1, |
| 6244 |
RegularExpression.LINE_SEPARATOR-1); |
| 6245 |
result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX); |
| 6246 |
return 1; |
| 6247 |
*/ |
| 6248 |
} |
| 6249 |
|
| 6250 |
case RANGE: |
| 6251 |
if (isSet(options, RegularExpression.IGNORE_CASE)) { |
| 6252 |
result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken()); |
| 6253 |
} else { |
| 6254 |
result.mergeRanges(this); |
| 6255 |
} |
| 6256 |
return FC_TERMINAL; |
| 6257 |
|
| 6258 |
case NRANGE: // **** |
| 6259 |
if (isSet(options, RegularExpression.IGNORE_CASE)) { |
| 6260 |
result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken())); |
| 6261 |
} else { |
| 6262 |
result.mergeRanges(Token.complementRanges(this)); |
| 6263 |
} |
| 6264 |
return FC_TERMINAL; |
| 6265 |
|
| 6266 |
case INDEPENDENT: |
| 6267 |
case PAREN: |
| 6268 |
return this.getChild(0).analyzeFirstCharacter(result, options); |
| 6269 |
|
| 6270 |
case MODIFIERGROUP: |
| 6271 |
options |= ((ModifierToken)this).getOptions(); |
| 6272 |
options &= ~((ModifierToken)this).getOptionsMask(); |
| 6273 |
return this.getChild(0).analyzeFirstCharacter(result, options); |
| 6274 |
|
| 6275 |
case BACKREFERENCE: |
| 6276 |
result.addRange(0, UTF16_MAX); // **** We can not optimize. |
| 6277 |
return FC_ANY; |
| 6278 |
|
| 6279 |
case STRING: |
| 6280 |
int cha = this.getString().charAt(0); |
| 6281 |
int ch2; |
| 6282 |
if (REUtil.isHighSurrogate(cha) |
| 6283 |
&& this.getString().length() >= 2 |
| 6284 |
&& REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) |
| 6285 |
cha = REUtil.composeFromSurrogates(cha, ch2); |
| 6286 |
result.addRange(cha, cha); |
| 6287 |
if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { |
| 6288 |
cha = Character.toUpperCase((char)cha); |
| 6289 |
result.addRange(cha, cha); |
| 6290 |
cha = Character.toLowerCase((char)cha); |
| 6291 |
result.addRange(cha, cha); |
| 6292 |
} |
| 6293 |
return FC_TERMINAL; |
| 6294 |
|
| 6295 |
case LOOKAHEAD: |
| 6296 |
case NEGATIVELOOKAHEAD: |
| 6297 |
case LOOKBEHIND: |
| 6298 |
case NEGATIVELOOKBEHIND: |
| 6299 |
return FC_CONTINUE; |
| 6300 |
|
| 6301 |
default: |
| 6302 |
throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type); |
| 6303 |
} |
| 6304 |
} |
| 6305 |
|
| 6306 |
private final boolean isShorterThan(Token tok) { |
| 6307 |
if (tok == null) return false; |
| 6308 |
/* |
| 6309 |
int mylength; |
| 6310 |
if (this.type == STRING) mylength = this.getString().length(); |
| 6311 |
else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1; |
| 6312 |
else throw new RuntimeException("Internal Error: Illegal type: "+this.type); |
| 6313 |
int otherlength; |
| 6314 |
if (tok.type == STRING) otherlength = tok.getString().length(); |
| 6315 |
else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1; |
| 6316 |
else throw new RuntimeException("Internal Error: Illegal type: "+tok.type); |
| 6317 |
*/ |
| 6318 |
int mylength; |
| 6319 |
if (this.type == STRING) mylength = this.getString().length(); |
| 6320 |
else throw new RuntimeException("Internal Error: Illegal type: "+this.type); |
| 6321 |
int otherlength; |
| 6322 |
if (tok.type == STRING) otherlength = tok.getString().length(); |
| 6323 |
else throw new RuntimeException("Internal Error: Illegal type: "+tok.type); |
| 6324 |
return mylength < otherlength; |
| 6325 |
} |
| 6326 |
|
| 6327 |
static class FixedStringContainer { |
| 6328 |
Token token = null; |
| 6329 |
int options = 0; |
| 6330 |
FixedStringContainer() { |
| 6331 |
} |
| 6332 |
} |
| 6333 |
|
| 6334 |
final void findFixedString(FixedStringContainer container, int options) { |
| 6335 |
switch (this.type) { |
| 6336 |
case CONCAT: |
| 6337 |
Token prevToken = null; |
| 6338 |
int prevOptions = 0; |
| 6339 |
for (int i = 0; i < this.size(); i ++) { |
| 6340 |
this.getChild(i).findFixedString(container, options); |
| 6341 |
if (prevToken == null || prevToken.isShorterThan(container.token)) { |
| 6342 |
prevToken = container.token; |
| 6343 |
prevOptions = container.options; |
| 6344 |
} |
| 6345 |
} |
| 6346 |
container.token = prevToken; |
| 6347 |
container.options = prevOptions; |
| 6348 |
return; |
| 6349 |
|
| 6350 |
case UNION: |
| 6351 |
case CLOSURE: |
| 6352 |
case NONGREEDYCLOSURE: |
| 6353 |
case EMPTY: |
| 6354 |
case ANCHOR: |
| 6355 |
case RANGE: |
| 6356 |
case DOT: |
| 6357 |
case NRANGE: |
| 6358 |
case BACKREFERENCE: |
| 6359 |
case LOOKAHEAD: |
| 6360 |
case NEGATIVELOOKAHEAD: |
| 6361 |
case LOOKBEHIND: |
| 6362 |
case NEGATIVELOOKBEHIND: |
| 6363 |
case CONDITION: |
| 6364 |
container.token = null; |
| 6365 |
return; |
| 6366 |
|
| 6367 |
case CHAR: // Ignore CHAR tokens. |
| 6368 |
container.token = null; // ** |
| 6369 |
return; // ** |
| 6370 |
|
| 6371 |
case STRING: |
| 6372 |
container.token = this; |
| 6373 |
container.options = options; |
| 6374 |
return; |
| 6375 |
|
| 6376 |
case INDEPENDENT: |
| 6377 |
case PAREN: |
| 6378 |
this.getChild(0).findFixedString(container, options); |
| 6379 |
return; |
| 6380 |
|
| 6381 |
case MODIFIERGROUP: |
| 6382 |
options |= ((ModifierToken)this).getOptions(); |
| 6383 |
options &= ~((ModifierToken)this).getOptionsMask(); |
| 6384 |
this.getChild(0).findFixedString(container, options); |
| 6385 |
return; |
| 6386 |
|
| 6387 |
default: |
| 6388 |
throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type); |
| 6389 |
} |
| 6390 |
} |
| 6391 |
|
| 6392 |
boolean match(int ch) { |
| 6393 |
throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); |
| 6394 |
} |
| 6395 |
|
| 6396 |
// ------------------------------------------------------ |
| 6397 |
private final static Hashtable categories = new Hashtable(); |
| 6398 |
private final static Hashtable categories2 = new Hashtable(); |
| 6399 |
private static final String[] categoryNames = { |
| 6400 |
"Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", |
| 6401 |
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs", |
| 6402 |
"Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28 |
| 6403 |
"Pi", "Pf", // 29, 30 |
| 6404 |
"L", "M", "N", "Z", "C", "P", "S", // 31-37 |
| 6405 |
}; |
| 6406 |
|
| 6407 |
// Schema Rec. {Datatypes} - Punctuation |
| 6408 |
static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote |
| 6409 |
static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote |
| 6410 |
static final int CHAR_LETTER = 31; |
| 6411 |
static final int CHAR_MARK = 32; |
| 6412 |
static final int CHAR_NUMBER = 33; |
| 6413 |
static final int CHAR_SEPARATOR = 34; |
| 6414 |
static final int CHAR_OTHER = 35; |
| 6415 |
static final int CHAR_PUNCTUATION = 36; |
| 6416 |
static final int CHAR_SYMBOL = 37; |
| 6417 |
|
| 6418 |
//blockNames in UNICODE 3.1 that supported by XML Schema REC |
| 6419 |
private static final String[] blockNames = { |
| 6420 |
/*0000..007F;*/ "Basic Latin", |
| 6421 |
/*0080..00FF;*/ "Latin-1 Supplement", |
| 6422 |
/*0100..017F;*/ "Latin Extended-A", |
| 6423 |
/*0180..024F;*/ "Latin Extended-B", |
| 6424 |
/*0250..02AF;*/ "IPA Extensions", |
| 6425 |
/*02B0..02FF;*/ "Spacing Modifier Letters", |
| 6426 |
/*0300..036F;*/ "Combining Diacritical Marks", |
| 6427 |
/*0370..03FF;*/ "Greek", |
| 6428 |
/*0400..04FF;*/ "Cyrillic", |
| 6429 |
/*0530..058F;*/ "Armenian", |
| 6430 |
/*0590..05FF;*/ "Hebrew", |
| 6431 |
/*0600..06FF;*/ "Arabic", |
| 6432 |
/*0700..074F;*/ "Syriac", |
| 6433 |
/*0780..07BF;*/ "Thaana", |
| 6434 |
/*0900..097F;*/ "Devanagari", |
| 6435 |
/*0980..09FF;*/ "Bengali", |
| 6436 |
/*0A00..0A7F;*/ "Gurmukhi", |
| 6437 |
/*0A80..0AFF;*/ "Gujarati", |
| 6438 |
/*0B00..0B7F;*/ "Oriya", |
| 6439 |
/*0B80..0BFF;*/ "Tamil", |
| 6440 |
/*0C00..0C7F;*/ "Telugu", |
| 6441 |
/*0C80..0CFF;*/ "Kannada", |
| 6442 |
/*0D00..0D7F;*/ "Malayalam", |
| 6443 |
/*0D80..0DFF;*/ "Sinhala", |
| 6444 |
/*0E00..0E7F;*/ "Thai", |
| 6445 |
/*0E80..0EFF;*/ "Lao", |
| 6446 |
/*0F00..0FFF;*/ "Tibetan", |
| 6447 |
/*1000..109F;*/ "Myanmar", |
| 6448 |
/*10A0..10FF;*/ "Georgian", |
| 6449 |
/*1100..11FF;*/ "Hangul Jamo", |
| 6450 |
/*1200..137F;*/ "Ethiopic", |
| 6451 |
/*13A0..13FF;*/ "Cherokee", |
| 6452 |
/*1400..167F;*/ "Unified Canadian Aboriginal Syllabics", |
| 6453 |
/*1680..169F;*/ "Ogham", |
| 6454 |
/*16A0..16FF;*/ "Runic", |
| 6455 |
/*1780..17FF;*/ "Khmer", |
| 6456 |
/*1800..18AF;*/ "Mongolian", |
| 6457 |
/*1E00..1EFF;*/ "Latin Extended Additional", |
| 6458 |
/*1F00..1FFF;*/ "Greek Extended", |
| 6459 |
/*2000..206F;*/ "General Punctuation", |
| 6460 |
/*2070..209F;*/ "Superscripts and Subscripts", |
| 6461 |
/*20A0..20CF;*/ "Currency Symbols", |
| 6462 |
/*20D0..20FF;*/ "Combining Marks for Symbols", |
| 6463 |
/*2100..214F;*/ "Letterlike Symbols", |
| 6464 |
/*2150..218F;*/ "Number Forms", |
| 6465 |
/*2190..21FF;*/ "Arrows", |
| 6466 |
/*2200..22FF;*/ "Mathematical Operators", |
| 6467 |
/*2300..23FF;*/ "Miscellaneous Technical", |
| 6468 |
/*2400..243F;*/ "Control Pictures", |
| 6469 |
/*2440..245F;*/ "Optical Character Recognition", |
| 6470 |
/*2460..24FF;*/ "Enclosed Alphanumerics", |
| 6471 |
/*2500..257F;*/ "Box Drawing", |
| 6472 |
/*2580..259F;*/ "Block Elements", |
| 6473 |
/*25A0..25FF;*/ "Geometric Shapes", |
| 6474 |
/*2600..26FF;*/ "Miscellaneous Symbols", |
| 6475 |
/*2700..27BF;*/ "Dingbats", |
| 6476 |
/*2800..28FF;*/ "Braille Patterns", |
| 6477 |
/*2E80..2EFF;*/ "CJK Radicals Supplement", |
| 6478 |
/*2F00..2FDF;*/ "Kangxi Radicals", |
| 6479 |
/*2FF0..2FFF;*/ "Ideographic Description Characters", |
| 6480 |
/*3000..303F;*/ "CJK Symbols and Punctuation", |
| 6481 |
/*3040..309F;*/ "Hiragana", |
| 6482 |
/*30A0..30FF;*/ "Katakana", |
| 6483 |
/*3100..312F;*/ "Bopomofo", |
| 6484 |
/*3130..318F;*/ "Hangul Compatibility Jamo", |
| 6485 |
/*3190..319F;*/ "Kanbun", |
| 6486 |
/*31A0..31BF;*/ "Bopomofo Extended", |
| 6487 |
/*3200..32FF;*/ "Enclosed CJK Letters and Months", |
| 6488 |
/*3300..33FF;*/ "CJK Compatibility", |
| 6489 |
/*3400..4DB5;*/ "CJK Unified Ideographs Extension A", |
| 6490 |
/*4E00..9FFF;*/ "CJK Unified Ideographs", |
| 6491 |
/*A000..A48F;*/ "Yi Syllables", |
| 6492 |
/*A490..A4CF;*/ "Yi Radicals", |
| 6493 |
/*AC00..D7A3;*/ "Hangul Syllables", |
| 6494 |
/*E000..F8FF;*/ "Private Use", |
| 6495 |
/*F900..FAFF;*/ "CJK Compatibility Ideographs", |
| 6496 |
/*FB00..FB4F;*/ "Alphabetic Presentation Forms", |
| 6497 |
/*FB50..FDFF;*/ "Arabic Presentation Forms-A", |
| 6498 |
/*FE20..FE2F;*/ "Combining Half Marks", |
| 6499 |
/*FE30..FE4F;*/ "CJK Compatibility Forms", |
| 6500 |
/*FE50..FE6F;*/ "Small Form Variants", |
| 6501 |
/*FE70..FEFE;*/ "Arabic Presentation Forms-B", |
| 6502 |
/*FEFF..FEFF;*/ "Specials", |
| 6503 |
/*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms", |
| 6504 |
//missing Specials add manually |
| 6505 |
/*10300..1032F;*/ "Old Italic", // 84 |
| 6506 |
/*10330..1034F;*/ "Gothic", |
| 6507 |
/*10400..1044F;*/ "Deseret", |
| 6508 |
/*1D000..1D0FF;*/ "Byzantine Musical Symbols", |
| 6509 |
/*1D100..1D1FF;*/ "Musical Symbols", |
| 6510 |
/*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols", |
| 6511 |
/*20000..2A6D6;*/ "CJK Unified Ideographs Extension B", |
| 6512 |
/*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement", |
| 6513 |
/*E0000..E007F;*/ "Tags", |
| 6514 |
//missing 2 private use add manually |
| 6515 |
|
| 6516 |
}; |
| 6517 |
//ADD THOSE MANUALLY |
| 6518 |
//F0000..FFFFD; "Private Use", |
| 6519 |
//100000..10FFFD; "Private Use" |
| 6520 |
//FFF0..FFFD; "Specials", |
| 6521 |
static final String blockRanges = |
| 6522 |
"\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F" |
| 6523 |
+"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF" |
| 6524 |
+"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF" |
| 6525 |
+"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF" |
| 6526 |
+"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF" |
| 6527 |
+"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF" |
| 6528 |
+"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF" |
| 6529 |
+"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F" |
| 6530 |
+"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF" |
| 6531 |
+"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF" |
| 6532 |
+"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF"; |
| 6533 |
static final int[] nonBMPBlockRanges = { |
| 6534 |
0x10300, 0x1032F, // 84 |
| 6535 |
0x10330, 0x1034F, |
| 6536 |
0x10400, 0x1044F, |
| 6537 |
0x1D000, 0x1D0FF, |
| 6538 |
0x1D100, 0x1D1FF, |
| 6539 |
0x1D400, 0x1D7FF, |
| 6540 |
0x20000, 0x2A6D6, |
| 6541 |
0x2F800, 0x2FA1F, |
| 6542 |
0xE0000, 0xE007F |
| 6543 |
}; |
| 6544 |
private static final int NONBMP_BLOCK_START = 84; |
| 6545 |
|
| 6546 |
static protected RangeToken getRange(String name, boolean positive) { |
| 6547 |
if (Token.categories.size() == 0) { |
| 6548 |
synchronized (Token.categories) { |
| 6549 |
Token[] ranges = new Token[Token.categoryNames.length]; |
| 6550 |
for (int i = 0; i < ranges.length; i ++) { |
| 6551 |
ranges[i] = Token.createRange(); |
| 6552 |
} |
| 6553 |
int type; |
| 6554 |
for (int i = 0; i < 0x10000; i ++) { |
| 6555 |
type = Character.getType((char)i); |
| 6556 |
if (type == Character.START_PUNCTUATION || |
| 6557 |
type == Character.END_PUNCTUATION) { |
| 6558 |
//build table of Pi values |
| 6559 |
if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C || |
| 6560 |
i == 0x201F || i == 0x2039) { |
| 6561 |
type = CHAR_INIT_QUOTE; |
| 6562 |
} |
| 6563 |
//build table of Pf values |
| 6564 |
if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) { |
| 6565 |
type = CHAR_FINAL_QUOTE; |
| 6566 |
} |
| 6567 |
} |
| 6568 |
ranges[type].addRange(i, i); |
| 6569 |
switch (type) { |
| 6570 |
case Character.UPPERCASE_LETTER: |
| 6571 |
case Character.LOWERCASE_LETTER: |
| 6572 |
case Character.TITLECASE_LETTER: |
| 6573 |
case Character.MODIFIER_LETTER: |
| 6574 |
case Character.OTHER_LETTER: |
| 6575 |
type = CHAR_LETTER; |
| 6576 |
break; |
| 6577 |
case Character.NON_SPACING_MARK: |
| 6578 |
case Character.COMBINING_SPACING_MARK: |
| 6579 |
case Character.ENCLOSING_MARK: |
| 6580 |
type = CHAR_MARK; |
| 6581 |
break; |
| 6582 |
case Character.DECIMAL_DIGIT_NUMBER: |
| 6583 |
case Character.LETTER_NUMBER: |
| 6584 |
case Character.OTHER_NUMBER: |
| 6585 |
type = CHAR_NUMBER; |
| 6586 |
break; |
| 6587 |
case Character.SPACE_SEPARATOR: |
| 6588 |
case Character.LINE_SEPARATOR: |
| 6589 |
case Character.PARAGRAPH_SEPARATOR: |
| 6590 |
type = CHAR_SEPARATOR; |
| 6591 |
break; |
| 6592 |
case Character.CONTROL: |
| 6593 |
case Character.FORMAT: |
| 6594 |
case Character.SURROGATE: |
| 6595 |
case Character.PRIVATE_USE: |
| 6596 |
case Character.UNASSIGNED: |
| 6597 |
type = CHAR_OTHER; |
| 6598 |
break; |
| 6599 |
case Character.CONNECTOR_PUNCTUATION: |
| 6600 |
case Character.DASH_PUNCTUATION: |
| 6601 |
case Character.START_PUNCTUATION: |
| 6602 |
case Character.END_PUNCTUATION: |
| 6603 |
case CHAR_INIT_QUOTE: |
| 6604 |
case CHAR_FINAL_QUOTE: |
| 6605 |
case Character.OTHER_PUNCTUATION: |
| 6606 |
type = CHAR_PUNCTUATION; |
| 6607 |
break; |
| 6608 |
case Character.MATH_SYMBOL: |
| 6609 |
case Character.CURRENCY_SYMBOL: |
| 6610 |
case Character.MODIFIER_SYMBOL: |
| 6611 |
case Character.OTHER_SYMBOL: |
| 6612 |
type = CHAR_SYMBOL; |
| 6613 |
break; |
| 6614 |
default: |
| 6615 |
throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type); |
| 6616 |
} |
| 6617 |
ranges[type].addRange(i, i); |
| 6618 |
} // for all characters |
| 6619 |
ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX); |
| 6620 |
|
| 6621 |
for (int i = 0; i < ranges.length; i ++) { |
| 6622 |
if (Token.categoryNames[i] != null) { |
| 6623 |
if (i == Character.UNASSIGNED) { // Unassigned |
| 6624 |
ranges[i].addRange(0x10000, Token.UTF16_MAX); |
| 6625 |
} |
| 6626 |
Token.categories.put(Token.categoryNames[i], ranges[i]); |
| 6627 |
Token.categories2.put(Token.categoryNames[i], |
| 6628 |
Token.complementRanges(ranges[i])); |
| 6629 |
} |
| 6630 |
} |
| 6631 |
//REVISIT: do we really need to support block names as in Unicode 3.1 |
| 6632 |
// or we can just create all the names in IsBLOCKNAME format (XML Schema REC)? |
| 6633 |
// |
| 6634 |
StringBuffer buffer = new StringBuffer(50); |
| 6635 |
for (int i = 0; i < Token.blockNames.length; i ++) { |
| 6636 |
Token r1 = Token.createRange(); |
| 6637 |
int location; |
| 6638 |
if (i < NONBMP_BLOCK_START) { |
| 6639 |
location = i*2; |
| 6640 |
int rstart = Token.blockRanges.charAt(location); |
| 6641 |
int rend = Token.blockRanges.charAt(location+1); |
| 6642 |
//DEBUGING |
| 6643 |
//System.out.println(n+" " +Integer.toHexString(rstart) |
| 6644 |
// +"-"+ Integer.toHexString(rend)); |
| 6645 |
r1.addRange(rstart, rend); |
| 6646 |
} else { |
| 6647 |
location = (i - NONBMP_BLOCK_START) * 2; |
| 6648 |
r1.addRange(Token.nonBMPBlockRanges[location], |
| 6649 |
Token.nonBMPBlockRanges[location + 1]); |
| 6650 |
} |
| 6651 |
String n = Token.blockNames[i]; |
| 6652 |
if (n.equals("Specials")) |
| 6653 |
r1.addRange(0xfff0, 0xfffd); |
| 6654 |
if (n.equals("Private Use")) { |
| 6655 |
r1.addRange(0xF0000,0xFFFFD); |
| 6656 |
r1.addRange(0x100000,0x10FFFD); |
| 6657 |
} |
| 6658 |
Token.categories.put(n, r1); |
| 6659 |
Token.categories2.put(n, Token.complementRanges(r1)); |
| 6660 |
buffer.setLength(0); |
| 6661 |
buffer.append("Is"); |
| 6662 |
if (n.indexOf(' ') >= 0) { |
| 6663 |
for (int ci = 0; ci < n.length(); ci ++) |
| 6664 |
if (n.charAt(ci) != ' ') buffer.append(n.charAt(ci)); |
| 6665 |
} |
| 6666 |
else { |
| 6667 |
buffer.append(n); |
| 6668 |
} |
| 6669 |
Token.setAlias(buffer.toString(), n, true); |
| 6670 |
} |
| 6671 |
|
| 6672 |
// TR#18 1.2 |
| 6673 |
Token.setAlias("ASSIGNED", "Cn", false); |
| 6674 |
Token.setAlias("UNASSIGNED", "Cn", true); |
| 6675 |
Token all = Token.createRange(); |
| 6676 |
all.addRange(0, Token.UTF16_MAX); |
| 6677 |
Token.categories.put("ALL", all); |
| 6678 |
Token.categories2.put("ALL", Token.complementRanges(all)); |
| 6679 |
Token.registerNonXS("ASSIGNED"); |
| 6680 |
Token.registerNonXS("UNASSIGNED"); |
| 6681 |
Token.registerNonXS("ALL"); |
| 6682 |
|
| 6683 |
Token isalpha = Token.createRange(); |
| 6684 |
isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu |
| 6685 |
isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll |
| 6686 |
isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo |
| 6687 |
Token.categories.put("IsAlpha", isalpha); |
| 6688 |
Token.categories2.put("IsAlpha", Token.complementRanges(isalpha)); |
| 6689 |
Token.registerNonXS("IsAlpha"); |
| 6690 |
|
| 6691 |
Token isalnum = Token.createRange(); |
| 6692 |
isalnum.mergeRanges(isalpha); // Lu Ll Lo |
| 6693 |
isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd |
| 6694 |
Token.categories.put("IsAlnum", isalnum); |
| 6695 |
Token.categories2.put("IsAlnum", Token.complementRanges(isalnum)); |
| 6696 |
Token.registerNonXS("IsAlnum"); |
| 6697 |
|
| 6698 |
Token isspace = Token.createRange(); |
| 6699 |
isspace.mergeRanges(Token.token_spaces); |
| 6700 |
isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z |
| 6701 |
Token.categories.put("IsSpace", isspace); |
| 6702 |
Token.categories2.put("IsSpace", Token.complementRanges(isspace)); |
| 6703 |
Token.registerNonXS("IsSpace"); |
| 6704 |
|
| 6705 |
Token isword = Token.createRange(); |
| 6706 |
isword.mergeRanges(isalnum); // Lu Ll Lo Nd |
| 6707 |
isword.addRange('_', '_'); |
| 6708 |
Token.categories.put("IsWord", isword); |
| 6709 |
Token.categories2.put("IsWord", Token.complementRanges(isword)); |
| 6710 |
Token.registerNonXS("IsWord"); |
| 6711 |
|
| 6712 |
Token isascii = Token.createRange(); |
| 6713 |
isascii.addRange(0, 127); |
| 6714 |
Token.categories.put("IsASCII", isascii); |
| 6715 |
Token.categories2.put("IsASCII", Token.complementRanges(isascii)); |
| 6716 |
Token.registerNonXS("IsASCII"); |
| 6717 |
|
| 6718 |
Token isnotgraph = Token.createRange(); |
| 6719 |
isnotgraph.mergeRanges(ranges[CHAR_OTHER]); |
| 6720 |
isnotgraph.addRange(' ', ' '); |
| 6721 |
Token.categories.put("IsGraph", Token.complementRanges(isnotgraph)); |
| 6722 |
Token.categories2.put("IsGraph", isnotgraph); |
| 6723 |
Token.registerNonXS("IsGraph"); |
| 6724 |
|
| 6725 |
Token isxdigit = Token.createRange(); |
| 6726 |
isxdigit.addRange('0', '9'); |
| 6727 |
isxdigit.addRange('A', 'F'); |
| 6728 |
isxdigit.addRange('a', 'f'); |
| 6729 |
Token.categories.put("IsXDigit", Token.complementRanges(isxdigit)); |
| 6730 |
Token.categories2.put("IsXDigit", isxdigit); |
| 6731 |
Token.registerNonXS("IsXDigit"); |
| 6732 |
|
| 6733 |
Token.setAlias("IsDigit", "Nd", true); |
| 6734 |
Token.setAlias("IsUpper", "Lu", true); |
| 6735 |
Token.setAlias("IsLower", "Ll", true); |
| 6736 |
Token.setAlias("IsCntrl", "C", true); |
| 6737 |
Token.setAlias("IsPrint", "C", false); |
| 6738 |
Token.setAlias("IsPunct", "P", true); |
| 6739 |
Token.registerNonXS("IsDigit"); |
| 6740 |
Token.registerNonXS("IsUpper"); |
| 6741 |
Token.registerNonXS("IsLower"); |
| 6742 |
Token.registerNonXS("IsCntrl"); |
| 6743 |
Token.registerNonXS("IsPrint"); |
| 6744 |
Token.registerNonXS("IsPunct"); |
| 6745 |
|
| 6746 |
Token.setAlias("alpha", "IsAlpha", true); |
| 6747 |
Token.setAlias("alnum", "IsAlnum", true); |
| 6748 |
Token.setAlias("ascii", "IsASCII", true); |
| 6749 |
Token.setAlias("cntrl", "IsCntrl", true); |
| 6750 |
Token.setAlias("digit", "IsDigit", true); |
| 6751 |
Token.setAlias("graph", "IsGraph", true); |
| 6752 |
Token.setAlias("lower", "IsLower", true); |
| 6753 |
Token.setAlias("print", "IsPrint", true); |
| 6754 |
Token.setAlias("punct", "IsPunct", true); |
| 6755 |
Token.setAlias("space", "IsSpace", true); |
| 6756 |
Token.setAlias("upper", "IsUpper", true); |
| 6757 |
Token.setAlias("word", "IsWord", true); // Perl extension |
| 6758 |
Token.setAlias("xdigit", "IsXDigit", true); |
| 6759 |
Token.registerNonXS("alpha"); |
| 6760 |
Token.registerNonXS("alnum"); |
| 6761 |
Token.registerNonXS("ascii"); |
| 6762 |
Token.registerNonXS("cntrl"); |
| 6763 |
Token.registerNonXS("digit"); |
| 6764 |
Token.registerNonXS("graph"); |
| 6765 |
Token.registerNonXS("lower"); |
| 6766 |
Token.registerNonXS("print"); |
| 6767 |
Token.registerNonXS("punct"); |
| 6768 |
Token.registerNonXS("space"); |
| 6769 |
Token.registerNonXS("upper"); |
| 6770 |
Token.registerNonXS("word"); |
| 6771 |
Token.registerNonXS("xdigit"); |
| 6772 |
} // synchronized |
| 6773 |
} // if null |
| 6774 |
RangeToken tok = positive ? (RangeToken)Token.categories.get(name) |
| 6775 |
: (RangeToken)Token.categories2.get(name); |
| 6776 |
//if (tok == null) System.out.println(name); |
| 6777 |
return tok; |
| 6778 |
} |
| 6779 |
static protected RangeToken getRange(String name, boolean positive, boolean xs) { |
| 6780 |
RangeToken range = Token.getRange(name, positive); |
| 6781 |
if (xs && range != null && Token.isRegisterNonXS(name)) |
| 6782 |
range = null; |
| 6783 |
return range; |
| 6784 |
} |
| 6785 |
|
| 6786 |
static Hashtable nonxs = null; |
| 6787 |
/** |
| 6788 |
* This method is called by only getRange(). |
| 6789 |
* So this method need not MT-safe. |
| 6790 |
*/ |
| 6791 |
static protected void registerNonXS(String name) { |
| 6792 |
if (Token.nonxs == null) |
| 6793 |
Token.nonxs = new Hashtable(); |
| 6794 |
Token.nonxs.put(name, name); |
| 6795 |
} |
| 6796 |
static protected boolean isRegisterNonXS(String name) { |
| 6797 |
if (Token.nonxs == null) |
| 6798 |
return false; |
| 6799 |
//DEBUG |
| 6800 |
//System.err.println("isRegisterNonXS: "+name); |
| 6801 |
return Token.nonxs.containsKey(name); |
| 6802 |
} |
| 6803 |
|
| 6804 |
private static void setAlias(String newName, String name, boolean positive) { |
| 6805 |
Token t1 = (Token)Token.categories.get(name); |
| 6806 |
Token t2 = (Token)Token.categories2.get(name); |
| 6807 |
if (positive) { |
| 6808 |
Token.categories.put(newName, t1); |
| 6809 |
Token.categories2.put(newName, t2); |
| 6810 |
} else { |
| 6811 |
Token.categories2.put(newName, t1); |
| 6812 |
Token.categories.put(newName, t2); |
| 6813 |
} |
| 6814 |
} |
| 6815 |
|
| 6816 |
// ------------------------------------------------------ |
| 6817 |
|
| 6818 |
static final String viramaString = |
| 6819 |
"\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; |
| 6820 |
+"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; |
| 6821 |
+"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; |
| 6822 |
+"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; |
| 6823 |
+"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; |
| 6824 |
+"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; |
| 6825 |
+"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; |
| 6826 |
+"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; |
| 6827 |
+"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; |
| 6828 |
+"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;; |
| 6829 |
+"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;; |
| 6830 |
|
| 6831 |
static private Token token_grapheme = null; |
| 6832 |
static synchronized Token getGraphemePattern() { |
| 6833 |
if (Token.token_grapheme != null) |
| 6834 |
return Token.token_grapheme; |
| 6835 |
|
| 6836 |
Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}] |
| 6837 |
base_char.mergeRanges(Token.getRange("ASSIGNED", true)); |
| 6838 |
base_char.subtractRanges(Token.getRange("M", true)); |
| 6839 |
base_char.subtractRanges(Token.getRange("C", true)); |
| 6840 |
|
| 6841 |
Token virama = Token.createRange(); |
| 6842 |
for (int i = 0; i < Token.viramaString.length(); i ++) { |
| 6843 |
virama.addRange(i, i); |
| 6844 |
} |
| 6845 |
|
| 6846 |
Token combiner_wo_virama = Token.createRange(); |
| 6847 |
combiner_wo_virama.mergeRanges(Token.getRange("M", true)); |
| 6848 |
combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final |
| 6849 |
combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras |
| 6850 |
|
| 6851 |
Token left = Token.createUnion(); // base_char? |
| 6852 |
left.addChild(base_char); |
| 6853 |
left.addChild(Token.token_empty); |
| 6854 |
|
| 6855 |
Token foo = Token.createUnion(); |
| 6856 |
foo.addChild(Token.createConcat(virama, Token.getRange("L", true))); |
| 6857 |
foo.addChild(combiner_wo_virama); |
| 6858 |
|
| 6859 |
foo = Token.createClosure(foo); |
| 6860 |
|
| 6861 |
foo = Token.createConcat(left, foo); |
| 6862 |
|
| 6863 |
Token.token_grapheme = foo; |
| 6864 |
return Token.token_grapheme; |
| 6865 |
} |
| 6866 |
|
| 6867 |
/** |
| 6868 |
* Combing Character Sequence in Perl 5.6. |
| 6869 |
*/ |
| 6870 |
static private Token token_ccs = null; |
| 6871 |
static synchronized Token getCombiningCharacterSequence() { |
| 6872 |
if (Token.token_ccs != null) |
| 6873 |
return Token.token_ccs; |
| 6874 |
|
| 6875 |
Token foo = Token.createClosure(Token.getRange("M", true)); // \pM* |
| 6876 |
foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM* |
| 6877 |
Token.token_ccs = foo; |
| 6878 |
return Token.token_ccs; |
| 6879 |
} |
| 6880 |
|
| 6881 |
// ------------------------------------------------------ |
| 6882 |
|
| 6883 |
// ------------------------------------------------------ |
| 6884 |
/** |
| 6885 |
* This class represents a node in parse tree. |
| 6886 |
*/ |
| 6887 |
static class StringToken extends Token implements java.io.Serializable { |
| 6888 |
String string; |
| 6889 |
int refNumber; |
| 6890 |
|
| 6891 |
StringToken(int type, String str, int n) { |
| 6892 |
super(type); |
| 6893 |
this.string = str; |
| 6894 |
this.refNumber = n; |
| 6895 |
} |
| 6896 |
|
| 6897 |
int getReferenceNumber() { // for STRING |
| 6898 |
return this.refNumber; |
| 6899 |
} |
| 6900 |
String getString() { // for STRING |
| 6901 |
return this.string; |
| 6902 |
} |
| 6903 |
|
| 6904 |
public String toString(int options) { |
| 6905 |
if (this.type == BACKREFERENCE) |
| 6906 |
return "\\"+this.refNumber; |
| 6907 |
else |
| 6908 |
return REUtil.quoteMeta(this.string); |
| 6909 |
} |
| 6910 |
} |
| 6911 |
|
| 6912 |
/** |
| 6913 |
* This class represents a node in parse tree. |
| 6914 |
*/ |
| 6915 |
static class ConcatToken extends Token implements java.io.Serializable { |
| 6916 |
Token child; |
| 6917 |
Token child2; |
| 6918 |
|
| 6919 |
ConcatToken(Token t1, Token t2) { |
| 6920 |
super(Token.CONCAT); |
| 6921 |
this.child = t1; |
| 6922 |
this.child2 = t2; |
| 6923 |
} |
| 6924 |
|
| 6925 |
int size() { |
| 6926 |
return 2; |
| 6927 |
} |
| 6928 |
Token getChild(int index) { |
| 6929 |
return index == 0 ? this.child : this.child2; |
| 6930 |
} |
| 6931 |
|
| 6932 |
public String toString(int options) { |
| 6933 |
String ret; |
| 6934 |
if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) { |
| 6935 |
ret = this.child.toString(options)+"+"; |
| 6936 |
} else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) { |
| 6937 |
ret = this.child.toString(options)+"+?"; |
| 6938 |
} else |
| 6939 |
ret = this.child.toString(options)+this.child2.toString(options); |
| 6940 |
return ret; |
| 6941 |
} |
| 6942 |
} |
| 6943 |
|
| 6944 |
/** |
| 6945 |
* This class represents a node in parse tree. |
| 6946 |
*/ |
| 6947 |
static class CharToken extends Token implements java.io.Serializable { |
| 6948 |
int chardata; |
| 6949 |
|
| 6950 |
CharToken(int type, int ch) { |
| 6951 |
super(type); |
| 6952 |
this.chardata = ch; |
| 6953 |
} |
| 6954 |
|
| 6955 |
int getChar() { |
| 6956 |
return this.chardata; |
| 6957 |
} |
| 6958 |
|
| 6959 |
public String toString(int options) { |
| 6960 |
String ret; |
| 6961 |
switch (this.type) { |
| 6962 |
case CHAR: |
| 6963 |
switch (this.chardata) { |
| 6964 |
case '|': case '*': case '+': case '?': |
| 6965 |
case '(': case ')': case '.': case '[': |
| 6966 |
case '{': case '\\': |
| 6967 |
ret = "\\"+(char)this.chardata; |
| 6968 |
break; |
| 6969 |
case '\f': ret = "\\f"; break; |
| 6970 |
case '\n': ret = "\\n"; break; |
| 6971 |
case '\r': ret = "\\r"; break; |
| 6972 |
case '\t': ret = "\\t"; break; |
| 6973 |
case 0x1b: ret = "\\e"; break; |
| 6974 |
//case 0x0b: ret = "\\v"; break; |
| 6975 |
default: |
| 6976 |
if (this.chardata >= 0x10000) { |
| 6977 |
String pre = "0"+Integer.toHexString(this.chardata); |
| 6978 |
ret = "\\v"+pre.substring(pre.length()-6, pre.length()); |
| 6979 |
} else |
| 6980 |
ret = ""+(char)this.chardata; |
| 6981 |
} |
| 6982 |
break; |
| 6983 |
|
| 6984 |
case ANCHOR: |
| 6985 |
if (this == Token.token_linebeginning || this == Token.token_lineend) |
| 6986 |
ret = ""+(char)this.chardata; |
| 6987 |
else |
| 6988 |
ret = "\\"+(char)this.chardata; |
| 6989 |
break; |
| 6990 |
|
| 6991 |
default: |
| 6992 |
ret = null; |
| 6993 |
} |
| 6994 |
return ret; |
| 6995 |
} |
| 6996 |
|
| 6997 |
boolean match(int ch) { |
| 6998 |
if (this.type == CHAR) { |
| 6999 |
return ch == this.chardata; |
| 7000 |
} else |
| 7001 |
throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); |
| 7002 |
} |
| 7003 |
} |
| 7004 |
|
| 7005 |
/** |
| 7006 |
* This class represents a node in parse tree. |
| 7007 |
*/ |
| 7008 |
static class ClosureToken extends Token implements java.io.Serializable { |
| 7009 |
int min; |
| 7010 |
int max; |
| 7011 |
Token child; |
| 7012 |
|
| 7013 |
ClosureToken(int type, Token tok) { |
| 7014 |
super(type); |
| 7015 |
this.child = tok; |
| 7016 |
this.setMin(-1); |
| 7017 |
this.setMax(-1); |
| 7018 |
} |
| 7019 |
|
| 7020 |
int size() { |
| 7021 |
return 1; |
| 7022 |
} |
| 7023 |
Token getChild(int index) { |
| 7024 |
return this.child; |
| 7025 |
} |
| 7026 |
|
| 7027 |
final void setMin(int min) { |
| 7028 |
this.min = min; |
| 7029 |
} |
| 7030 |
final void setMax(int max) { |
| 7031 |
this.max = max; |
| 7032 |
} |
| 7033 |
final int getMin() { |
| 7034 |
return this.min; |
| 7035 |
} |
| 7036 |
final int getMax() { |
| 7037 |
return this.max; |
| 7038 |
} |
| 7039 |
|
| 7040 |
public String toString(int options) { |
| 7041 |
String ret; |
| 7042 |
if (this.type == CLOSURE) { |
| 7043 |
if (this.getMin() < 0 && this.getMax() < 0) { |
| 7044 |
ret = this.child.toString(options)+"*"; |
| 7045 |
} else if (this.getMin() == this.getMax()) { |
| 7046 |
ret = this.child.toString(options)+"{"+this.getMin()+"}"; |
| 7047 |
} else if (this.getMin() >= 0 && this.getMax() >= 0) { |
| 7048 |
ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}"; |
| 7049 |
} else if (this.getMin() >= 0 && this.getMax() < 0) { |
| 7050 |
ret = this.child.toString(options)+"{"+this.getMin()+",}"; |
| 7051 |
} else |
| 7052 |
throw new RuntimeException("Token#toString(): CLOSURE " |
| 7053 |
+this.getMin()+", "+this.getMax()); |
| 7054 |
} else { |
| 7055 |
if (this.getMin() < 0 && this.getMax() < 0) { |
| 7056 |
ret = this.child.toString(options)+"*?"; |
| 7057 |
} else if (this.getMin() == this.getMax()) { |
| 7058 |
ret = this.child.toString(options)+"{"+this.getMin()+"}?"; |
| 7059 |
} else if (this.getMin() >= 0 && this.getMax() >= 0) { |
| 7060 |
ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?"; |
| 7061 |
} else if (this.getMin() >= 0 && this.getMax() < 0) { |
| 7062 |
ret = this.child.toString(options)+"{"+this.getMin()+",}?"; |
| 7063 |
} else |
| 7064 |
throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE " |
| 7065 |
+ this.getMin() + ", " + this.getMax()); |
| 7066 |
} |
| 7067 |
return ret; |
| 7068 |
} |
| 7069 |
} |
| 7070 |
|
| 7071 |
/** |
| 7072 |
* This class represents a node in parse tree. |
| 7073 |
*/ |
| 7074 |
static class ParenToken extends Token implements java.io.Serializable |
| 7075 |
{ |
| 7076 |
Token child; |
| 7077 |
|
| 7078 |
int parennumber; |
| 7079 |
|
| 7080 |
ParenToken(int type, Token tok, int paren) |
| 7081 |
{ |
| 7082 |
super(type); |
| 7083 |
this.child = tok; |
| 7084 |
this.parennumber = paren; |
| 7085 |
} |
| 7086 |
|
| 7087 |
int size() |
| 7088 |
{ |
| 7089 |
return 1; |
| 7090 |
} |
| 7091 |
|
| 7092 |
Token getChild(int index) |
| 7093 |
{ |
| 7094 |
return this.child; |
| 7095 |
} |
| 7096 |
|
| 7097 |
int getParenNumber() |
| 7098 |
{ |
| 7099 |
return this.parennumber; |
| 7100 |
} |
| 7101 |
|
| 7102 |
public String toString(int options) |
| 7103 |
{ |
| 7104 |
String ret = null; |
| 7105 |
switch (this.type) |
| 7106 |
{ |
| 7107 |
case PAREN: |
| 7108 |
if (this.parennumber == 0) |
| 7109 |
{ |
| 7110 |
ret = "(?:" + this.child.toString(options) + ")"; |
| 7111 |
} |
| 7112 |
else |
| 7113 |
{ |
| 7114 |
ret = "(" + this.child.toString(options) + ")"; |
| 7115 |
} |
| 7116 |
break; |
| 7117 |
|
| 7118 |
case LOOKAHEAD: |
| 7119 |
ret = "(?=" + this.child.toString(options) + ")"; |
| 7120 |
break; |
| 7121 |
case NEGATIVELOOKAHEAD: |
| 7122 |
ret = "(?!" + this.child.toString(options) + ")"; |
| 7123 |
break; |
| 7124 |
case LOOKBEHIND: |
| 7125 |
ret = "(?<=" + this.child.toString(options) + ")"; |
| 7126 |
break; |
| 7127 |
case NEGATIVELOOKBEHIND: |
| 7128 |
ret = "(?<!" + this.child.toString(options) + ")"; |
| 7129 |
break; |
| 7130 |
case INDEPENDENT: |
| 7131 |
ret = "(?>" + this.child.toString(options) + ")"; |
| 7132 |
break; |
| 7133 |
} |
| 7134 |
return ret; |
| 7135 |
} |
| 7136 |
} |
| 7137 |
|
| 7138 |
/** |
| 7139 |
* (?(condition)yes-pattern|no-pattern) |
| 7140 |
*/ |
| 7141 |
static class ConditionToken extends Token implements java.io.Serializable |
| 7142 |
{ |
| 7143 |
int refNumber; |
| 7144 |
|
| 7145 |
Token condition; |
| 7146 |
|
| 7147 |
Token yes; |
| 7148 |
|
| 7149 |
Token no; |
| 7150 |
|
| 7151 |
ConditionToken(int refno, Token cond, Token yespat, Token nopat) |
| 7152 |
{ |
| 7153 |
super(Token.CONDITION); |
| 7154 |
this.refNumber = refno; |
| 7155 |
this.condition = cond; |
| 7156 |
this.yes = yespat; |
| 7157 |
this.no = nopat; |
| 7158 |
} |
| 7159 |
|
| 7160 |
int size() |
| 7161 |
{ |
| 7162 |
return this.no == null ? 1 : 2; |
| 7163 |
} |
| 7164 |
|
| 7165 |
Token getChild(int index) |
| 7166 |
{ |
| 7167 |
if (index == 0) |
| 7168 |
return this.yes; |
| 7169 |
if (index == 1) |
| 7170 |
return this.no; |
| 7171 |
throw new RuntimeException("Internal Error: " + index); |
| 7172 |
} |
| 7173 |
|
| 7174 |
public String toString(int options) |
| 7175 |
{ |
| 7176 |
String ret; |
| 7177 |
if (refNumber > 0) |
| 7178 |
{ |
| 7179 |
ret = "(?(" + refNumber + ")"; |
| 7180 |
} |
| 7181 |
else if (this.condition.type == Token.ANCHOR) |
| 7182 |
{ |
| 7183 |
ret = "(?(" + this.condition + ")"; |
| 7184 |
} |
| 7185 |
else |
| 7186 |
{ |
| 7187 |
ret = "(?" + this.condition; |
| 7188 |
} |
| 7189 |
|
| 7190 |
if (this.no == null) |
| 7191 |
{ |
| 7192 |
ret += this.yes + ")"; |
| 7193 |
} |
| 7194 |
else |
| 7195 |
{ |
| 7196 |
ret += this.yes + "|" + this.no + ")"; |
| 7197 |
} |
| 7198 |
return ret; |
| 7199 |
} |
| 7200 |
} |
| 7201 |
|
| 7202 |
/** |
| 7203 |
* (ims-ims: .... ) |
| 7204 |
*/ |
| 7205 |
static class ModifierToken extends Token implements java.io.Serializable |
| 7206 |
{ |
| 7207 |
Token child; |
| 7208 |
|
| 7209 |
int add; |
| 7210 |
|
| 7211 |
int mask; |
| 7212 |
|
| 7213 |
ModifierToken(Token tok, int add, int mask) |
| 7214 |
{ |
| 7215 |
super(Token.MODIFIERGROUP); |
| 7216 |
this.child = tok; |
| 7217 |
this.add = add; |
| 7218 |
this.mask = mask; |
| 7219 |
} |
| 7220 |
|
| 7221 |
int size() |
| 7222 |
{ |
| 7223 |
return 1; |
| 7224 |
} |
| 7225 |
|
| 7226 |
Token getChild(int index) |
| 7227 |
{ |
| 7228 |
return this.child; |
| 7229 |
} |
| 7230 |
|
| 7231 |
int getOptions() |
| 7232 |
{ |
| 7233 |
return this.add; |
| 7234 |
} |
| 7235 |
|
| 7236 |
int getOptionsMask() |
| 7237 |
{ |
| 7238 |
return this.mask; |
| 7239 |
} |
| 7240 |
|
| 7241 |
public String toString(int options) |
| 7242 |
{ |
| 7243 |
return "(?" + (this.add == 0 ? "" : REUtil.createOptionString(this.add)) |
| 7244 |
+ (this.mask == 0 ? "" : REUtil.createOptionString(this.mask)) + ":" + this.child.toString(options) + ")"; |
| 7245 |
} |
| 7246 |
} |
| 7247 |
|
| 7248 |
/** |
| 7249 |
* This class represents a node in parse tree. |
| 7250 |
* for UNION or CONCAT. |
| 7251 |
*/ |
| 7252 |
static class UnionToken extends Token implements java.io.Serializable |
| 7253 |
{ |
| 7254 |
Vector children; |
| 7255 |
|
| 7256 |
UnionToken(int type) |
| 7257 |
{ |
| 7258 |
super(type); |
| 7259 |
} |
| 7260 |
|
| 7261 |
void addChild(Token tok) |
| 7262 |
{ |
| 7263 |
if (tok == null) |
| 7264 |
return; |
| 7265 |
if (this.children == null) |
| 7266 |
this.children = new Vector(); |
| 7267 |
if (this.type == UNION) |
| 7268 |
{ |
| 7269 |
this.children.addElement(tok); |
| 7270 |
return; |
| 7271 |
} |
| 7272 |
// This is CONCAT, and new child is CONCAT. |
| 7273 |
if (tok.type == CONCAT) |
| 7274 |
{ |
| 7275 |
for (int i = 0; i < tok.size(); i++) |
| 7276 |
this.addChild(tok.getChild(i)); // Recursion |
| 7277 |
return; |
| 7278 |
} |
| 7279 |
int size = this.children.size(); |
| 7280 |
if (size == 0) |
| 7281 |
{ |
| 7282 |
this.children.addElement(tok); |
| 7283 |
return; |
| 7284 |
} |
| 7285 |
Token previous = (Token)this.children.elementAt(size - 1); |
| 7286 |
if (!((previous.type == CHAR || previous.type == STRING) && (tok.type == CHAR || tok.type == STRING))) |
| 7287 |
{ |
| 7288 |
this.children.addElement(tok); |
| 7289 |
return; |
| 7290 |
} |
| 7291 |
|
| 7292 |
//System.err.println("Merge '"+previous+"' and '"+tok+"'."); |
| 7293 |
|
| 7294 |
StringBuffer buffer; |
| 7295 |
int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length()); |
| 7296 |
if (previous.type == CHAR) |
| 7297 |
{ // Replace previous token by STRING |
| 7298 |
buffer = new StringBuffer(2 + nextMaxLength); |
| 7299 |
int ch = previous.getChar(); |
| 7300 |
if (ch >= 0x10000) |
| 7301 |
buffer.append(REUtil.decomposeToSurrogates(ch)); |
| 7302 |
else |
| 7303 |
buffer.append((char)ch); |
| 7304 |
previous = Token.createString(null); |
| 7305 |
this.children.setElementAt(previous, size - 1); |
| 7306 |
} |
| 7307 |
else |
| 7308 |
{ // STRING |
| 7309 |
buffer = new StringBuffer(previous.getString().length() + nextMaxLength); |
| 7310 |
buffer.append(previous.getString()); |
| 7311 |
} |
| 7312 |
|
| 7313 |
if (tok.type == CHAR) |
| 7314 |
{ |
| 7315 |
int ch = tok.getChar(); |
| 7316 |
if (ch >= 0x10000) |
| 7317 |
buffer.append(REUtil.decomposeToSurrogates(ch)); |
| 7318 |
else |
| 7319 |
buffer.append((char)ch); |
| 7320 |
} |
| 7321 |
else |
| 7322 |
{ |
| 7323 |
buffer.append(tok.getString()); |
| 7324 |
} |
| 7325 |
|
| 7326 |
((StringToken)previous).string = new String(buffer); |
| 7327 |
} |
| 7328 |
|
| 7329 |
int size() |
| 7330 |
{ |
| 7331 |
return this.children == null ? 0 : this.children.size(); |
| 7332 |
} |
| 7333 |
|
| 7334 |
Token getChild(int index) |
| 7335 |
{ |
| 7336 |
return (Token)this.children.elementAt(index); |
| 7337 |
} |
| 7338 |
|
| 7339 |
public String toString(int options) |
| 7340 |
{ |
| 7341 |
String ret; |
| 7342 |
if (this.type == CONCAT) |
| 7343 |
{ |
| 7344 |
if (this.children.size() == 2) |
| 7345 |
{ |
| 7346 |
Token ch = this.getChild(0); |
| 7347 |
Token ch2 = this.getChild(1); |
| 7348 |
if (ch2.type == CLOSURE && ch2.getChild(0) == ch) |
| 7349 |
{ |
| 7350 |
ret = ch.toString(options) + "+"; |
| 7351 |
} |
| 7352 |
else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) |
| 7353 |
{ |
| 7354 |
ret = ch.toString(options) + "+?"; |
| 7355 |
} |
| 7356 |
else |
| 7357 |
ret = ch.toString(options) + ch2.toString(options); |
| 7358 |
} |
| 7359 |
else |
| 7360 |
{ |
| 7361 |
StringBuffer sb = new StringBuffer(); |
| 7362 |
for (int i = 0; i < this.children.size(); i++) |
| 7363 |
{ |
| 7364 |
sb.append(((Token)this.children.elementAt(i)).toString(options)); |
| 7365 |
} |
| 7366 |
ret = new String(sb); |
| 7367 |
} |
| 7368 |
return ret; |
| 7369 |
} |
| 7370 |
if (this.children.size() == 2 && this.getChild(1).type == EMPTY) |
| 7371 |
{ |
| 7372 |
ret = this.getChild(0).toString(options) + "?"; |
| 7373 |
} |
| 7374 |
else if (this.children.size() == 2 && this.getChild(0).type == EMPTY) |
| 7375 |
{ |
| 7376 |
ret = this.getChild(1).toString(options) + "??"; |
| 7377 |
} |
| 7378 |
else |
| 7379 |
{ |
| 7380 |
StringBuffer sb = new StringBuffer(); |
| 7381 |
sb.append(((Token)this.children.elementAt(0)).toString(options)); |
| 7382 |
for (int i = 1; i < this.children.size(); i++) |
| 7383 |
{ |
| 7384 |
sb.append('|'); |
| 7385 |
sb.append(((Token)this.children.elementAt(i)).toString(options)); |
| 7386 |
} |
| 7387 |
ret = new String(sb); |
| 7388 |
} |
| 7389 |
return ret; |
| 7390 |
} |
| 7391 |
} |
| 7392 |
} |
| 7393 |
|
| 7394 |
/** |
| 7395 |
* A regular expression parser for the XML Schema. |
| 7396 |
* |
| 7397 |
* @author TAMURA Kent <kent@trl.ibm.co.jp> |
| 7398 |
* @version $Id: RegEx.java,v 1.8.4.1 2007/08/14 18:18:44 emerks Exp $ |
| 7399 |
*/ |
| 7400 |
static class ParserForXMLSchema extends RegexParser |
| 7401 |
{ |
| 7402 |
|
| 7403 |
/**
 * Creates a parser; the constructor body performs no initialisation of its own.
 */
public ParserForXMLSchema() {
    // Intentionally empty.
}
| 7407 |
|
| 7408 |
/**
 * Creates a parser; the locale argument is accepted but not used by this
 * constructor body.
 *
 * @param locale currently ignored
 */
public ParserForXMLSchema(Locale locale) {
    // Intentionally empty: the locale is not applied here.
}
| 7412 |
|
| 7413 |
Token processCaret() throws ParseException |
| 7414 |
{ |
| 7415 |
this.next(); |
| 7416 |
return Token.createChar('^'); |
| 7417 |
} |
| 7418 |
|
| 7419 |
Token processDollar() throws ParseException |
| 7420 |
{ |
| 7421 |
this.next(); |
| 7422 |
return Token.createChar('$'); |
| 7423 |
} |
| 7424 |
|
| 7425 |
Token processLookahead() throws ParseException |
| 7426 |
{ |
| 7427 |
throw ex("parser.process.1", this.offset); |
| 7428 |
} |
| 7429 |
|
| 7430 |
Token processNegativelookahead() throws ParseException |
| 7431 |
{ |
| 7432 |
throw ex("parser.process.1", this.offset); |
| 7433 |
} |
| 7434 |
|
| 7435 |
Token processLookbehind() throws ParseException |
| 7436 |
{ |
| 7437 |
throw ex("parser.process.1", this.offset); |
| 7438 |
} |
| 7439 |
|
| 7440 |
Token processNegativelookbehind() throws ParseException |
| 7441 |
{ |
| 7442 |
throw ex("parser.process.1", this.offset); |
| 7443 |
} |
| 7444 |
|
| 7445 |
Token processBacksolidus_A() throws ParseException |
| 7446 |
{ |
| 7447 |
throw ex("parser.process.1", this.offset); |
| 7448 |
} |
| 7449 |
|
| 7450 |
Token processBacksolidus_Z() throws ParseException |
| 7451 |
{ |
| 7452 |
throw ex("parser.process.1", this.offset); |
| 7453 |
} |
| 7454 |
|
| 7455 |
Token processBacksolidus_z() throws ParseException |
| 7456 |
{ |
| 7457 |
throw ex("parser.process.1", this.offset); |
| 7458 |
} |
| 7459 |
|
| 7460 |
Token processBacksolidus_b() throws ParseException |
| 7461 |
{ |
| 7462 |
throw ex("parser.process.1", this.offset); |
| 7463 |
} |
| 7464 |
|
| 7465 |
Token processBacksolidus_B() throws ParseException |
| 7466 |
{ |
| 7467 |
throw ex("parser.process.1", this.offset); |
| 7468 |
} |
| 7469 |
|
| 7470 |
Token processBacksolidus_lt() throws ParseException |
| 7471 |
{ |
| 7472 |
throw ex("parser.process.1", this.offset); |
| 7473 |
} |
| 7474 |
|
| 7475 |
Token processBacksolidus_gt() throws ParseException |
| 7476 |
{ |
| 7477 |
throw ex("parser.process.1", this.offset); |
| 7478 |
} |
| 7479 |
|
| 7480 |
Token processStar(Token tok) throws ParseException |
| 7481 |
{ |
| 7482 |
this.next(); |
| 7483 |
return Token.createClosure(tok); |
| 7484 |
} |
| 7485 |
|
| 7486 |
Token processPlus(Token tok) throws ParseException |
| 7487 |
{ |
| 7488 |
// X+ -> XX* |
| 7489 |
this.next(); |
| 7490 |
return Token.createConcat(tok, Token.createClosure(tok)); |
| 7491 |
} |
| 7492 |
|
| 7493 |
Token processQuestion(Token tok) throws ParseException |
| 7494 |
{ |
| 7495 |
// X? -> X| |
| 7496 |
this.next(); |
| 7497 |
Token par = Token.createUnion(); |
| 7498 |
par.addChild(tok); |
| 7499 |
par.addChild(Token.createEmpty()); |
| 7500 |
return par; |
| 7501 |
} |
| 7502 |
|
| 7503 |
boolean checkQuestion(int off) |
| 7504 |
{ |
| 7505 |
return false; |
| 7506 |
} |
| 7507 |
|
| 7508 |
Token processParen() throws ParseException |
| 7509 |
{ |
| 7510 |
this.next(); |
| 7511 |
Token tok = Token.createParen(this.parseRegex(), 0); |
| 7512 |
if (this.read() != T_RPAREN) |
| 7513 |
throw ex("parser.factor.1", this.offset - 1); |
| 7514 |
this.next(); // Skips ')' |
| 7515 |
return tok; |
| 7516 |
} |
| 7517 |
|
| 7518 |
Token processParen2() throws ParseException |
| 7519 |
{ |
| 7520 |
throw ex("parser.process.1", this.offset); |
| 7521 |
} |
| 7522 |
|
| 7523 |
Token processCondition() throws ParseException |
| 7524 |
{ |
| 7525 |
throw ex("parser.process.1", this.offset); |
| 7526 |
} |
| 7527 |
|
| 7528 |
Token processModifiers() throws ParseException |
| 7529 |
{ |
| 7530 |
throw ex("parser.process.1", this.offset); |
| 7531 |
} |
| 7532 |
|
| 7533 |
Token processIndependent() throws ParseException |
| 7534 |
{ |
| 7535 |
throw ex("parser.process.1", this.offset); |
| 7536 |
} |
| 7537 |
|
| 7538 |
Token processBacksolidus_c() throws ParseException |
| 7539 |
{ |
| 7540 |
this.next(); |
| 7541 |
return this.getTokenForShorthand('c'); |
| 7542 |
} |
| 7543 |
|
| 7544 |
Token processBacksolidus_C() throws ParseException |
| 7545 |
{ |
| 7546 |
this.next(); |
| 7547 |
return this.getTokenForShorthand('C'); |
| 7548 |
} |
| 7549 |
|
| 7550 |
Token processBacksolidus_i() throws ParseException |
| 7551 |
{ |
| 7552 |
this.next(); |
| 7553 |
return this.getTokenForShorthand('i'); |
| 7554 |
} |
| 7555 |
|
| 7556 |
Token processBacksolidus_I() throws ParseException |
| 7557 |
{ |
| 7558 |
this.next(); |
| 7559 |
return this.getTokenForShorthand('I'); |
| 7560 |
} |
| 7561 |
|
| 7562 |
Token processBacksolidus_g() throws ParseException |
| 7563 |
{ |
| 7564 |
throw this.ex("parser.process.1", this.offset - 2); |
| 7565 |
} |
| 7566 |
|
| 7567 |
Token processBacksolidus_X() throws ParseException |
| 7568 |
{ |
| 7569 |
throw ex("parser.process.1", this.offset - 2); |
| 7570 |
} |
| 7571 |
|
| 7572 |
Token processBackreference() throws ParseException |
| 7573 |
{ |
| 7574 |
throw ex("parser.process.1", this.offset - 4); |
| 7575 |
} |
| 7576 |
|
| 7577 |
int processCIinCharacterClass(RangeToken tok, int c) |
| 7578 |
{ |
| 7579 |
tok.mergeRanges(this.getTokenForShorthand(c)); |
| 7580 |
return -1; |
| 7581 |
} |
| 7582 |
|
| 7583 |
/** |
| 7584 |
* Parses a character-class-expression, not a character-class-escape. |
| 7585 |
* |
| 7586 |
* c-c-expression ::= '[' c-group ']' |
| 7587 |
* c-group ::= positive-c-group | negative-c-group | c-c-subtraction |
| 7588 |
* positive-c-group ::= (c-range | c-c-escape)+ |
| 7589 |
* negative-c-group ::= '^' positive-c-group |
| 7590 |
* c-c-subtraction ::= (positive-c-group | negative-c-group) subtraction |
| 7591 |
* subtraction ::= '-' c-c-expression |
| 7592 |
* c-range ::= single-range | from-to-range |
| 7593 |
* single-range ::= multi-c-escape | category-c-escape | block-c-escape | <any XML char> |
| 7594 |
* cc-normal-c ::= <any character except [, ], \> |
| 7595 |
* from-to-range ::= cc-normal-c '-' cc-normal-c |
| 7596 |
* |
| 7597 |
* @param useNrange Ignored. |
| 7598 |
* @return This never returns an NrangeToken. |
| 7599 |
*/ |
| 7600 |
protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException |
| 7601 |
{ |
| 7602 |
this.setContext(S_INBRACKETS); |
| 7603 |
this.next(); // '[' |
| 7604 |
boolean nrange = false; |
| 7605 |
RangeToken base = null; |
| 7606 |
RangeToken tok; |
| 7607 |
if (this.read() == T_CHAR && this.chardata == '^') |
| 7608 |
{ |
| 7609 |
nrange = true; |
| 7610 |
this.next(); // '^' |
| 7611 |
base = Token.createRange(); |
| 7612 |
base.addRange(0, Token.UTF16_MAX); |
| 7613 |
tok = Token.createRange(); |
| 7614 |
} |
| 7615 |
else |
| 7616 |
{ |
| 7617 |
tok = Token.createRange(); |
| 7618 |
} |
| 7619 |
int type; |
| 7620 |
boolean firstloop = true; |
| 7621 |
while ((type = this.read()) != T_EOF) |
| 7622 |
{ // Don't use 'cotinue' for this loop. |
| 7623 |
// single-range | from-to-range | subtraction |
| 7624 |
if (type == T_CHAR && this.chardata == ']' && !firstloop) |
| 7625 |
{ |
| 7626 |
if (nrange) |
| 7627 |
{ |
| 7628 |
base.subtractRanges(tok); |
| 7629 |
tok = base; |
| 7630 |
} |
| 7631 |
break; |
| 7632 |
} |
| 7633 |
int c = this.chardata; |
| 7634 |
boolean end = false; |
| 7635 |
if (type == T_BACKSOLIDUS) |
| 7636 |
{ |
| 7637 |
switch (c) |
| 7638 |
{ |
| 7639 |
case 'd': |
| 7640 |
case 'D': |
| 7641 |
case 'w': |
| 7642 |
case 'W': |
| 7643 |
case 's': |
| 7644 |
case 'S': |
| 7645 |
tok.mergeRanges(this.getTokenForShorthand(c)); |
| 7646 |
end = true; |
| 7647 |
break; |
| 7648 |
|
| 7649 |
case 'i': |
| 7650 |
case 'I': |
| 7651 |
case 'c': |
| 7652 |
case 'C': |
| 7653 |
c = this.processCIinCharacterClass(tok, c); |
| 7654 |
if (c < 0) |
| 7655 |
end = true; |
| 7656 |
break; |
| 7657 |
|
| 7658 |
case 'p': |
| 7659 |
case 'P': |
| 7660 |
int pstart = this.offset; |
| 7661 |
RangeToken tok2 = this.processBacksolidus_pP(c); |
| 7662 |
if (tok2 == null) |
| 7663 |
throw this.ex("parser.atom.5", pstart); |
| 7664 |
tok.mergeRanges(tok2); |
| 7665 |
end = true; |
| 7666 |
break; |
| 7667 |
|
| 7668 |
default: |
| 7669 |
c = this.decodeEscaped(); |
| 7670 |
} // \ + c |
| 7671 |
} // backsolidus |
| 7672 |
else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) |
| 7673 |
{ |
| 7674 |
// Subraction |
| 7675 |
if (nrange) |
| 7676 |
{ |
| 7677 |
base.subtractRanges(tok); |
| 7678 |
tok = base; |
| 7679 |
} |
| 7680 |
RangeToken range2 = this.parseCharacterClass(false); |
| 7681 |
tok.subtractRanges(range2); |
| 7682 |
if (this.read() != T_CHAR || this.chardata != ']') |
| 7683 |
throw this.ex("parser.cc.5", this.offset); |
| 7684 |
break; // Exit this loop |
| 7685 |
} |
| 7686 |
this.next(); |
| 7687 |
if (!end) |
| 7688 |
{ // if not shorthands... |
| 7689 |
if (type == T_CHAR) |
| 7690 |
{ |
| 7691 |
if (c == '[') |
| 7692 |
throw this.ex("parser.cc.6", this.offset - 2); |
| 7693 |
if (c == ']') |
| 7694 |
throw this.ex("parser.cc.7", this.offset - 2); |
| 7695 |
if (c == '-' && !firstloop && chardata != ']') |
| 7696 |
throw this.ex("parser.cc.8", this.offset - 2); |
| 7697 |
} |
| 7698 |
if (this.read() != T_CHAR || this.chardata != '-' || c == '-' && firstloop) |
| 7699 |
{ // Here is no '-'. |
| 7700 |
tok.addRange(c, c); |
| 7701 |
} |
| 7702 |
else |
| 7703 |
{ // Found '-' |
| 7704 |
// Is this '-' is a from-to token?? |
| 7705 |
this.next(); // Skips '-' |
| 7706 |
if ((type = this.read()) == T_EOF) |
| 7707 |
throw this.ex("parser.cc.2", this.offset); |
| 7708 |
// c '-' ']' -> '-' is a single-range. |
| 7709 |
if (type == T_CHAR && this.chardata == ']') |
| 7710 |
{ // if - is at the last position of the group |
| 7711 |
tok.addRange(c, c); |
| 7712 |
tok.addRange('-', '-'); |
| 7713 |
} |
| 7714 |
else if ((type == T_CHAR && this.chardata == ']') || type == T_XMLSCHEMA_CC_SUBTRACTION) |
| 7715 |
{ |
| 7716 |
throw this.ex("parser.cc.8", this.offset - 1); |
| 7717 |
} |
| 7718 |
else |
| 7719 |
{ |
| 7720 |
int rangeend = this.chardata; |
| 7721 |
if (type == T_CHAR) |
| 7722 |
{ |
| 7723 |
if (rangeend == '[') |
| 7724 |
throw this.ex("parser.cc.6", this.offset - 1); |
| 7725 |
if (rangeend == ']') |
| 7726 |
throw this.ex("parser.cc.7", this.offset - 1); |
| 7727 |
if (rangeend == '-') |
| 7728 |
throw this.ex("parser.cc.8", this.offset - 1); |
| 7729 |
} |
| 7730 |
else if (type == T_BACKSOLIDUS) |
| 7731 |
rangeend = this.decodeEscaped(); |
| 7732 |
this.next(); |
| 7733 |
|
| 7734 |
if (c > rangeend) |
| 7735 |
throw this.ex("parser.ope.3", this.offset - 1); |
| 7736 |
tok.addRange(c, rangeend); |
| 7737 |
} |
| 7738 |
} |
| 7739 |
} |
| 7740 |
firstloop = false; |
| 7741 |
} |
| 7742 |
if (this.read() == T_EOF) |
| 7743 |
throw this.ex("parser.cc.2", this.offset); |
| 7744 |
tok.sortRanges(); |
| 7745 |
tok.compactRanges(); |
| 7746 |
//tok.dumpRanges(); |
| 7747 |
this.setContext(S_NORMAL); |
| 7748 |
this.next(); // Skips ']' |
| 7749 |
|
| 7750 |
return tok; |
| 7751 |
} |
| 7752 |
|
| 7753 |
protected RangeToken parseSetOperations() throws ParseException |
| 7754 |
{ |
| 7755 |
throw this.ex("parser.process.1", this.offset); |
| 7756 |
} |
| 7757 |
|
| 7758 |
Token getTokenForShorthand(int ch) |
| 7759 |
{ |
| 7760 |
switch (ch) |
| 7761 |
{ |
| 7762 |
case 'd': |
| 7763 |
return ParserForXMLSchema.getRange("xml:isDigit", true); |
| 7764 |
case 'D': |
| 7765 |
return ParserForXMLSchema.getRange("xml:isDigit", false); |
| 7766 |
case 'w': |
| 7767 |
return ParserForXMLSchema.getRange("xml:isWord", true); |
| 7768 |
case 'W': |
| 7769 |
return ParserForXMLSchema.getRange("xml:isWord", false); |
| 7770 |
case 's': |
| 7771 |
return ParserForXMLSchema.getRange("xml:isSpace", true); |
| 7772 |
case 'S': |
| 7773 |
return ParserForXMLSchema.getRange("xml:isSpace", false); |
| 7774 |
case 'c': |
| 7775 |
return ParserForXMLSchema.getRange("xml:isNameChar", true); |
| 7776 |
case 'C': |
| 7777 |
return ParserForXMLSchema.getRange("xml:isNameChar", false); |
| 7778 |
case 'i': |
| 7779 |
return ParserForXMLSchema.getRange("xml:isInitialNameChar", true); |
| 7780 |
case 'I': |
| 7781 |
return ParserForXMLSchema.getRange("xml:isInitialNameChar", false); |
| 7782 |
default: |
| 7783 |
throw new RuntimeException("Internal Error: shorthands: \\u" + Integer.toString(ch, 16)); |
| 7784 |
} |
| 7785 |
} |
| 7786 |
|
| 7787 |
int decodeEscaped() throws ParseException |
| 7788 |
{ |
| 7789 |
if (this.read() != T_BACKSOLIDUS) |
| 7790 |
throw ex("parser.next.1", this.offset - 1); |
| 7791 |
int c = this.chardata; |
| 7792 |
switch (c) |
| 7793 |
{ |
| 7794 |
case 'n': |
| 7795 |
c = '\n'; |
| 7796 |
break; // LINE FEED U+000A |
| 7797 |
case 'r': |
| 7798 |
c = '\r'; |
| 7799 |
break; // CRRIAGE RETURN U+000D |
| 7800 |
case 't': |
| 7801 |
c = '\t'; |
| 7802 |
break; // HORIZONTAL TABULATION U+0009 |
| 7803 |
case '\\': |
| 7804 |
case '|': |
| 7805 |
case '.': |
| 7806 |
case '^': |
| 7807 |
case '-': |
| 7808 |
case '?': |
| 7809 |
case '*': |
| 7810 |
case '+': |
| 7811 |
case '{': |
| 7812 |
case '}': |
| 7813 |
case '(': |
| 7814 |
case ')': |
| 7815 |
case '[': |
| 7816 |
case ']': |
| 7817 |
break; // return actucal char |
| 7818 |
default: |
| 7819 |
throw ex("parser.process.1", this.offset - 2); |
| 7820 |
} |
| 7821 |
return c; |
| 7822 |
} |
| 7823 |
|
| 7824 |
static private Hashtable ranges = null; |
| 7825 |
|
| 7826 |
static private Hashtable ranges2 = null; |
| 7827 |
|
| 7828 |
static synchronized protected RangeToken getRange(String name, boolean positive) |
| 7829 |
{ |
| 7830 |
if (ranges == null) |
| 7831 |
{ |
| 7832 |
ranges = new Hashtable(); |
| 7833 |
ranges2 = new Hashtable(); |
| 7834 |
|
| 7835 |
Token tok = Token.createRange(); |
| 7836 |
setupRange(tok, SPACES); |
| 7837 |
ranges.put("xml:isSpace", tok); |
| 7838 |
ranges2.put("xml:isSpace", Token.complementRanges(tok)); |
| 7839 |
|
| 7840 |
tok = Token.createRange(); |
| 7841 |
setupRange(tok, DIGITS); |
| 7842 |
ranges.put("xml:isDigit", tok); |
| 7843 |
ranges2.put("xml:isDigit", Token.complementRanges(tok)); |
| 7844 |
|
| 7845 |
tok = Token.createRange(); |
| 7846 |
setupRange(tok, DIGITS); |
| 7847 |
ranges.put("xml:isDigit", tok); |
| 7848 |
ranges2.put("xml:isDigit", Token.complementRanges(tok)); |
| 7849 |
|
| 7850 |
tok = Token.createRange(); |
| 7851 |
setupRange(tok, LETTERS); |
| 7852 |
tok.mergeRanges((Token)ranges.get("xml:isDigit")); |
| 7853 |
ranges.put("xml:isWord", tok); |
| 7854 |
ranges2.put("xml:isWord", Token.complementRanges(tok)); |
| 7855 |
|
| 7856 |
tok = Token.createRange(); |
| 7857 |
setupRange(tok, NAMECHARS); |
| 7858 |
ranges.put("xml:isNameChar", tok); |
| 7859 |
ranges2.put("xml:isNameChar", Token.complementRanges(tok)); |
| 7860 |
|
| 7861 |
tok = Token.createRange(); |
| 7862 |
setupRange(tok, LETTERS); |
| 7863 |
tok.addRange('_', '_'); |
| 7864 |
tok.addRange(':', ':'); |
| 7865 |
ranges.put("xml:isInitialNameChar", tok); |
| 7866 |
ranges2.put("xml:isInitialNameChar", Token.complementRanges(tok)); |
| 7867 |
} |
| 7868 |
RangeToken tok = positive ? (RangeToken)ranges.get(name) : (RangeToken)ranges2.get(name); |
| 7869 |
return tok; |
| 7870 |
} |
| 7871 |
|
| 7872 |
static void setupRange(Token range, String src) |
| 7873 |
{ |
| 7874 |
int len = src.length(); |
| 7875 |
for (int i = 0; i < len; i += 2) |
| 7876 |
range.addRange(src.charAt(i), src.charAt(i + 1)); |
| 7877 |
} |
| 7878 |
|
| 7879 |
private static final String SPACES = "\t\n\r\r "; |
| 7880 |
|
| 7881 |
/**
 * Start/end character pairs for the "xml:isNameChar" range.
 * getRange passes this string to setupRange, which reads it two characters
 * at a time: the first of each pair starts an interval, the second ends it.
 * The length must therefore stay even. The trailing empty-string
 * concatenation contributes no characters.
 */
private static final String NAMECHARS = "\u002d\u002e\u0030\u003a\u0041\u005a\u005f\u005f\u0061\u007a\u00b7\u00b7\u00c0\u00d6"
| 7882 |
+ "\u00d8\u00f6\u00f8\u0131\u0134\u013e\u0141\u0148\u014a\u017e\u0180\u01c3\u01cd\u01f0"
| 7883 |
+ "\u01f4\u01f5\u01fa\u0217\u0250\u02a8\u02bb\u02c1\u02d0\u02d1\u0300\u0345\u0360\u0361"
| 7884 |
+ "\u0386\u038a\u038c\u038c\u038e\u03a1\u03a3\u03ce\u03d0\u03d6\u03da\u03da\u03dc\u03dc"
| 7885 |
+ "\u03de\u03de\u03e0\u03e0\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c\u045e\u0481"
| 7886 |
+ "\u0483\u0486\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9"
| 7887 |
+ "\u0531\u0556\u0559\u0559\u0561\u0586\u0591\u05a1\u05a3\u05b9\u05bb\u05bd\u05bf\u05bf"
| 7888 |
+ "\u05c1\u05c2\u05c4\u05c4\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0640\u0652\u0660\u0669"
| 7889 |
+ "\u0670\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06d5\u06e8\u06ea\u06ed\u06f0\u06f9"
| 7890 |
+ "\u0901\u0903\u0905\u0939\u093c\u094d\u0951\u0954\u0958\u0963\u0966\u096f\u0981\u0983"
| 7891 |
+ "\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b2\u09b2\u09b6\u09b9\u09bc\u09bc"
| 7892 |
+ "\u09be\u09c4\u09c7\u09c8\u09cb\u09cd\u09d7\u09d7\u09dc\u09dd\u09df\u09e3\u09e6\u09f1"
| 7893 |
+ "\u0a02\u0a02\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36"
| 7894 |
+ "\u0a38\u0a39\u0a3c\u0a3c\u0a3e\u0a42\u0a47\u0a48\u0a4b\u0a4d\u0a59\u0a5c\u0a5e\u0a5e"
| 7895 |
+ "\u0a66\u0a74\u0a81\u0a83\u0a85\u0a8b\u0a8d\u0a8d\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0"
| 7896 |
+ "\u0ab2\u0ab3\u0ab5\u0ab9\u0abc\u0ac5\u0ac7\u0ac9\u0acb\u0acd\u0ae0\u0ae0\u0ae6\u0aef"
| 7897 |
+ "\u0b01\u0b03\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33\u0b36\u0b39"
| 7898 |
+ "\u0b3c\u0b43\u0b47\u0b48\u0b4b\u0b4d\u0b56\u0b57\u0b5c\u0b5d\u0b5f\u0b61\u0b66\u0b6f"
| 7899 |
+ "\u0b82\u0b83\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95\u0b99\u0b9a\u0b9c\u0b9c\u0b9e\u0b9f"
| 7900 |
+ "\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9\u0bbe\u0bc2\u0bc6\u0bc8\u0bca\u0bcd"
| 7901 |
+ "\u0bd7\u0bd7\u0be7\u0bef\u0c01\u0c03\u0c05\u0c0c\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33"
| 7902 |
+ "\u0c35\u0c39\u0c3e\u0c44\u0c46\u0c48\u0c4a\u0c4d\u0c55\u0c56\u0c60\u0c61\u0c66\u0c6f"
| 7903 |
+ "\u0c82\u0c83\u0c85\u0c8c\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0cbe\u0cc4"
| 7904 |
+ "\u0cc6\u0cc8\u0cca\u0ccd\u0cd5\u0cd6\u0cde\u0cde\u0ce0\u0ce1\u0ce6\u0cef\u0d02\u0d03"
| 7905 |
+ "\u0d05\u0d0c\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d3e\u0d43\u0d46\u0d48\u0d4a\u0d4d"
| 7906 |
+ "\u0d57\u0d57\u0d60\u0d61\u0d66\u0d6f\u0e01\u0e2e\u0e30\u0e3a\u0e40\u0e4e\u0e50\u0e59"
| 7907 |
+ "\u0e81\u0e82\u0e84\u0e84\u0e87\u0e88\u0e8a\u0e8a\u0e8d\u0e8d\u0e94\u0e97\u0e99\u0e9f"
| 7908 |
+ "\u0ea1\u0ea3\u0ea5\u0ea5\u0ea7\u0ea7\u0eaa\u0eab\u0ead\u0eae\u0eb0\u0eb9\u0ebb\u0ebd"
| 7909 |
+ "\u0ec0\u0ec4\u0ec6\u0ec6\u0ec8\u0ecd\u0ed0\u0ed9\u0f18\u0f19\u0f20\u0f29\u0f35\u0f35"
| 7910 |
+ "\u0f37\u0f37\u0f39\u0f39\u0f3e\u0f47\u0f49\u0f69\u0f71\u0f84\u0f86\u0f8b\u0f90\u0f95"
| 7911 |
+ "\u0f97\u0f97\u0f99\u0fad\u0fb1\u0fb7\u0fb9\u0fb9\u10a0\u10c5\u10d0\u10f6\u1100\u1100"
| 7912 |
+ "\u1102\u1103\u1105\u1107\u1109\u1109\u110b\u110c\u110e\u1112\u113c\u113c\u113e\u113e"
| 7913 |
+ "\u1140\u1140\u114c\u114c\u114e\u114e\u1150\u1150\u1154\u1155\u1159\u1159\u115f\u1161"
| 7914 |
+ "\u1163\u1163\u1165\u1165\u1167\u1167\u1169\u1169\u116d\u116e\u1172\u1173\u1175\u1175"
| 7915 |
+ "\u119e\u119e\u11a8\u11a8\u11ab\u11ab\u11ae\u11af\u11b7\u11b8\u11ba\u11ba\u11bc\u11c2"
| 7916 |
+ "\u11eb\u11eb\u11f0\u11f0\u11f9\u11f9\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15\u1f18\u1f1d"
| 7917 |
+ "\u1f20\u1f45\u1f48\u1f4d\u1f50\u1f57\u1f59\u1f59\u1f5b\u1f5b\u1f5d\u1f5d\u1f5f\u1f7d"
| 7918 |
+ "\u1f80\u1fb4\u1fb6\u1fbc\u1fbe\u1fbe\u1fc2\u1fc4\u1fc6\u1fcc\u1fd0\u1fd3\u1fd6\u1fdb"
| 7919 |
+ "\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc\u20d0\u20dc\u20e1\u20e1\u2126\u2126\u212a\u212b"
| 7920 |
+ "\u212e\u212e\u2180\u2182\u3005\u3005\u3007\u3007\u3021\u302f\u3031\u3035\u3041\u3094"
| 7921 |
+ "\u3099\u309a\u309d\u309e\u30a1\u30fa\u30fc\u30fe\u3105\u312c\u4e00\u9fa5\uac00\ud7a3" + "";
| 7922 |
|
| 7923 |
/**
 * Start/end character pairs for the letter intervals.
 * getRange passes this string to setupRange (two characters per interval)
 * when building "xml:isWord", where the digit range is merged in, and
 * "xml:isInitialNameChar", where '_' and ':' are added. The length must stay
 * even.
 */
private static final String LETTERS = "\u0041\u005a\u0061\u007a\u00c0\u00d6\u00d8\u00f6\u00f8\u0131\u0134\u013e\u0141\u0148"
| 7924 |
+ "\u014a\u017e\u0180\u01c3\u01cd\u01f0\u01f4\u01f5\u01fa\u0217\u0250\u02a8\u02bb\u02c1"
| 7925 |
+ "\u0386\u0386\u0388\u038a\u038c\u038c\u038e\u03a1\u03a3\u03ce\u03d0\u03d6\u03da\u03da"
| 7926 |
+ "\u03dc\u03dc\u03de\u03de\u03e0\u03e0\u03e2\u03f3\u0401\u040c\u040e\u044f\u0451\u045c"
| 7927 |
+ "\u045e\u0481\u0490\u04c4\u04c7\u04c8\u04cb\u04cc\u04d0\u04eb\u04ee\u04f5\u04f8\u04f9"
| 7928 |
+ "\u0531\u0556\u0559\u0559\u0561\u0586\u05d0\u05ea\u05f0\u05f2\u0621\u063a\u0641\u064a"
| 7929 |
+ "\u0671\u06b7\u06ba\u06be\u06c0\u06ce\u06d0\u06d3\u06d5\u06d5\u06e5\u06e6\u0905\u0939"
| 7930 |
+ "\u093d\u093d\u0958\u0961\u0985\u098c\u098f\u0990\u0993\u09a8\u09aa\u09b0\u09b2\u09b2"
| 7931 |
+ "\u09b6\u09b9\u09dc\u09dd\u09df\u09e1\u09f0\u09f1\u0a05\u0a0a\u0a0f\u0a10\u0a13\u0a28"
| 7932 |
+ "\u0a2a\u0a30\u0a32\u0a33\u0a35\u0a36\u0a38\u0a39\u0a59\u0a5c\u0a5e\u0a5e\u0a72\u0a74"
| 7933 |
+ "\u0a85\u0a8b\u0a8d\u0a8d\u0a8f\u0a91\u0a93\u0aa8\u0aaa\u0ab0\u0ab2\u0ab3\u0ab5\u0ab9"
| 7934 |
+ "\u0abd\u0abd\u0ae0\u0ae0\u0b05\u0b0c\u0b0f\u0b10\u0b13\u0b28\u0b2a\u0b30\u0b32\u0b33"
| 7935 |
+ "\u0b36\u0b39\u0b3d\u0b3d\u0b5c\u0b5d\u0b5f\u0b61\u0b85\u0b8a\u0b8e\u0b90\u0b92\u0b95"
| 7936 |
+ "\u0b99\u0b9a\u0b9c\u0b9c\u0b9e\u0b9f\u0ba3\u0ba4\u0ba8\u0baa\u0bae\u0bb5\u0bb7\u0bb9"
| 7937 |
+ "\u0c05\u0c0c\u0c0e\u0c10\u0c12\u0c28\u0c2a\u0c33\u0c35\u0c39\u0c60\u0c61\u0c85\u0c8c"
| 7938 |
+ "\u0c8e\u0c90\u0c92\u0ca8\u0caa\u0cb3\u0cb5\u0cb9\u0cde\u0cde\u0ce0\u0ce1\u0d05\u0d0c"
| 7939 |
+ "\u0d0e\u0d10\u0d12\u0d28\u0d2a\u0d39\u0d60\u0d61\u0e01\u0e2e\u0e30\u0e30\u0e32\u0e33"
| 7940 |
+ "\u0e40\u0e45\u0e81\u0e82\u0e84\u0e84\u0e87\u0e88\u0e8a\u0e8a\u0e8d\u0e8d\u0e94\u0e97"
| 7941 |
+ "\u0e99\u0e9f\u0ea1\u0ea3\u0ea5\u0ea5\u0ea7\u0ea7\u0eaa\u0eab\u0ead\u0eae\u0eb0\u0eb0"
| 7942 |
+ "\u0eb2\u0eb3\u0ebd\u0ebd\u0ec0\u0ec4\u0f40\u0f47\u0f49\u0f69\u10a0\u10c5\u10d0\u10f6"
| 7943 |
+ "\u1100\u1100\u1102\u1103\u1105\u1107\u1109\u1109\u110b\u110c\u110e\u1112\u113c\u113c"
| 7944 |
+ "\u113e\u113e\u1140\u1140\u114c\u114c\u114e\u114e\u1150\u1150\u1154\u1155\u1159\u1159"
| 7945 |
+ "\u115f\u1161\u1163\u1163\u1165\u1165\u1167\u1167\u1169\u1169\u116d\u116e\u1172\u1173"
| 7946 |
+ "\u1175\u1175\u119e\u119e\u11a8\u11a8\u11ab\u11ab\u11ae\u11af\u11b7\u11b8\u11ba\u11ba"
| 7947 |
+ "\u11bc\u11c2\u11eb\u11eb\u11f0\u11f0\u11f9\u11f9\u1e00\u1e9b\u1ea0\u1ef9\u1f00\u1f15"
| 7948 |
+ "\u1f18\u1f1d\u1f20\u1f45\u1f48\u1f4d\u1f50\u1f57\u1f59\u1f59\u1f5b\u1f5b\u1f5d\u1f5d"
| 7949 |
+ "\u1f5f\u1f7d\u1f80\u1fb4\u1fb6\u1fbc\u1fbe\u1fbe\u1fc2\u1fc4\u1fc6\u1fcc\u1fd0\u1fd3"
| 7950 |
+ "\u1fd6\u1fdb\u1fe0\u1fec\u1ff2\u1ff4\u1ff6\u1ffc\u2126\u2126\u212a\u212b\u212e\u212e"
| 7951 |
+ "\u2180\u2182\u3007\u3007\u3021\u3029\u3041\u3094\u30a1\u30fa\u3105\u312c\u4e00\u9fa5" + "\uac00\ud7a3";
| 7952 |
|
| 7953 |
/**
 * Start/end character pairs for the "xml:isDigit" range.
 * getRange passes this string to setupRange, which reads it two characters
 * at a time (interval start, interval end); the same digit range is also
 * merged into "xml:isWord". The length must stay even.
 */
private static final String DIGITS = "\u0030\u0039\u0660\u0669\u06F0\u06F9\u0966\u096F\u09E6\u09EF\u0A66\u0A6F\u0AE6\u0AEF"
| 7954 |
+ "\u0B66\u0B6F\u0BE7\u0BEF\u0C66\u0C6F\u0CE6\u0CEF\u0D66\u0D6F\u0E50\u0E59\u0ED0\u0ED9" + "\u0F20\u0F29";
| 7955 |
} |
| 7956 |
|
| 7957 |
} |