Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 30 additions & 26 deletions src/java.base/share/classes/java/util/regex/Pattern.java
Original file line number Diff line number Diff line change
Expand Up @@ -5572,16 +5572,12 @@ int check(Matcher matcher, int i, CharSequence seq) {
}
if (i > startIndex) {
ch = Character.codePointBefore(seq, i);
left = (isWord(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i-1, seq)));
left = isWord(ch) || isNonSpacingMark(matcher, seq, ch, i - Character.charCount(ch));
}
boolean right = false;
if (i < endIndex) {
ch = Character.codePointAt(seq, i);
right = (isWord(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i, seq)));
right = isWord(ch) || isNonSpacingMark(matcher, seq, ch, i);
} else {
// Tried to access char past the end
matcher.hitEnd = true;
Expand All @@ -5590,30 +5586,38 @@ int check(Matcher matcher, int i, CharSequence seq) {
}
return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
}
boolean match(Matcher matcher, int i, CharSequence seq) {
return (check(matcher, i, seq) & type) > 0
&& next.match(matcher, i, seq);

private boolean isNonSpacingMark(Matcher matcher, CharSequence seq, int ch, int i) {
return (Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i, seq);
}
}

/**
* Non spacing marks only count as word characters in bounds calculations
* if they have a base character.
*/
private static boolean hasBaseCharacter(Matcher matcher, int i,
CharSequence seq)
{
int start = (!matcher.transparentBounds) ?
matcher.from : 0;
for (int x=i; x >= start; x--) {
int ch = Character.codePointAt(seq, x);
if (Character.isLetterOrDigit(ch))
return true;
if (Character.getType(ch) == Character.NON_SPACING_MARK)
continue;
/**
* Non spacing marks only count as word characters in bounds calculations
* if they have a base character.
*/
private boolean hasBaseCharacter(Matcher matcher, int i,
CharSequence seq)
{
int start = (!matcher.transparentBounds) ?
matcher.from : 0;
for (int x=i; x > start; ) {
int ch = Character.codePointBefore(seq, x);
if (isWord(ch))
return true;
if (Character.getType(ch) == Character.NON_SPACING_MARK) {
x -= Character.charCount(ch);
continue;
}
return false;
}
return false;
}
return false;

boolean match(Matcher matcher, int i, CharSequence seq) {
return (check(matcher, i, seq) & type) > 0
&& next.match(matcher, i, seq);
}
}

/**
Expand Down
125 changes: 124 additions & 1 deletion test/jdk/java/util/regex/RegExTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
* 8216332 8214245 8237599 8241055 8247546 8258259 8037397 8269753 8276694
* 8280403 8264160 8281315 8305107
* 8280403 8264160 8281315 8305107 8384082
* @library /test/lib
* @library /lib/testlibrary/java/lang
* @build jdk.test.lib.RandomFactory
Expand Down Expand Up @@ -444,6 +444,43 @@ public static void unicodeWordBoundsTest() {
twoFindIndexes(input, matcher, 3, 6);
}

// Checks where the word boundary construct matches around non spacing
// marks when the pattern is compiled with UNICODE_CHARACTER_CLASS.
@Test
public static void unicodeWordBoundsTestUnicodeCharacterClass() {
    // Two padding characters on each side: every layout comment below
    // starts and ends with "SS" and the first expected boundary is
    // always index 2, so a single space would shift all indexes by one.
    String spaces = "  ";
    String wordChar = "a";
    // U+030A, a combining mark; verified below to be a non spacing mark
    String nsm = "\u030a";

    assert (Character.getType('\u030a') == Character.NON_SPACING_MARK);

    Pattern pattern = Pattern.compile("\\b", Pattern.UNICODE_CHARACTER_CLASS);
    Matcher matcher = pattern.matcher("");
    // S=other B=word character N=non spacing mark .=word boundary
    // The last two arguments of twoFindIndexes are the expected indexes
    // of the two boundaries marked with '.' in each layout.

    // SS.BB.SS
    String input = spaces + wordChar + wordChar + spaces;
    twoFindIndexes(input, matcher, 2, 4);
    // SS.BBN.SS
    input = spaces + wordChar + wordChar + nsm + spaces;
    twoFindIndexes(input, matcher, 2, 5);
    // SS.BN.SS
    input = spaces + wordChar + nsm + spaces;
    twoFindIndexes(input, matcher, 2, 4);
    // SS.BNN.SS
    input = spaces + wordChar + nsm + nsm + spaces;
    twoFindIndexes(input, matcher, 2, 5);
    // SS.NBB.SS
    input = spaces + nsm + wordChar + wordChar + spaces;
    twoFindIndexes(input, matcher, 2, 5);
    // SS.BNB.SS
    input = spaces + wordChar + nsm + wordChar + spaces;
    twoFindIndexes(input, matcher, 2, 5);
    // SS.NN.SS
    input = spaces + nsm + nsm + spaces;
    twoFindIndexes(input, matcher, 2, 4);
    // SS.NBBN.SS
    input = spaces + nsm + wordChar + wordChar + nsm + spaces;
    twoFindIndexes(input, matcher, 2, 6);
}

private static void twoFindIndexes(String input, Matcher matcher, int a,
int b)
{
Expand All @@ -454,6 +491,92 @@ private static void twoFindIndexes(String input, Matcher matcher, int a,
assertEquals(matcher.start(), b);
}

// This test is for 8384082
// Check to see if word boundary construct properly handles unicode
// non spacing marks after surrogate pairs
@Test
public static void unicodeWordBoundsTestSurrogatePairUnicodeCharacterClass() {
    // Two padding characters on each side: every layout comment below
    // starts and ends with "SS" and the first expected boundary is
    // always index 2, so a single space would shift all indexes by one.
    String spaces = "  ";
    // U+1D400 written as a surrogate pair; it occupies two chars, which
    // is why one base character appears as "BB" in the layouts below.
    String baseChar = "\uD835\uDC00";
    // U+030A, a combining mark; verified below to be a non spacing mark
    String nsm = "\u030a";

    assert (Character.getType('\u030a') == Character.NON_SPACING_MARK);

    Pattern pattern = Pattern.compile("\\b", Pattern.UNICODE_CHARACTER_CLASS);
    Matcher matcher = pattern.matcher("");
    // S=other B=character N=non spacing mark .=word boundary
    // Each expected list holds the char indexes of the boundaries
    // marked with '.' in the layout.

    // SS.BBBB.SS
    String input = spaces + baseChar + baseChar + spaces;
    findIndices(input, matcher, List.of(2, 6));
    // SS.BBBBN.SS
    input = spaces + baseChar + baseChar + nsm + spaces;
    findIndices(input, matcher, List.of(2, 7));
    // SS.BBN.SS
    input = spaces + baseChar + nsm + spaces;
    findIndices(input, matcher, List.of(2, 5));
    // SS.BBNN.SS
    input = spaces + baseChar + nsm + nsm + spaces;
    findIndices(input, matcher, List.of(2, 6));
    // SS.NBBBB.SS
    input = spaces + nsm + baseChar + baseChar + spaces;
    findIndices(input, matcher, List.of(2, 7));
    // SS.BBNBB.SS
    input = spaces + baseChar + nsm + baseChar + spaces;
    findIndices(input, matcher, List.of(2, 7));
    // SS.NN.SS
    input = spaces + nsm + nsm + spaces;
    findIndices(input, matcher, List.of(2, 4));
    // SS.NBBBBN.SS
    input = spaces + nsm + baseChar + baseChar + nsm + spaces;
    findIndices(input, matcher, List.of(2, 8));
}

// Same inputs as the UNICODE_CHARACTER_CLASS surrogate pair test, but with
// the pattern compiled without flags. Every case expects no boundary at all.
// NOTE(review): presumably because neither the supplementary character nor
// the mark counts as a word character in the default mode -- confirm
// against the Pattern documentation.
@Test
public static void unicodeWordBoundsTestSurrogatePair() {
    // Two padding characters on each side, matching the "SS" prefix and
    // suffix of every layout comment below.
    String spaces = "  ";
    // U+1D400 written as a surrogate pair; it occupies two chars, which
    // is why one base character appears as "BB" in the layouts below.
    String baseChar = "\uD835\uDC00";
    // U+030A, a combining mark; verified below to be a non spacing mark
    String nsm = "\u030a";

    assert (Character.getType('\u030a') == Character.NON_SPACING_MARK);

    Pattern pattern = Pattern.compile("\\b");
    Matcher matcher = pattern.matcher("");
    // S=other B=character N=non spacing mark .=word boundary
    // No layout below contains a '.', so each expected list is empty.

    // SSBBBBSS
    String input = spaces + baseChar + baseChar + spaces;
    findIndices(input, matcher, List.of());
    // SSBBBBNSS
    input = spaces + baseChar + baseChar + nsm + spaces;
    findIndices(input, matcher, List.of());
    // SSBBNSS
    input = spaces + baseChar + nsm + spaces;
    findIndices(input, matcher, List.of());
    // SSBBNNSS
    input = spaces + baseChar + nsm + nsm + spaces;
    findIndices(input, matcher, List.of());
    // SSNBBBBSS
    input = spaces + nsm + baseChar + baseChar + spaces;
    findIndices(input, matcher, List.of());
    // SSBBNBBSS
    input = spaces + baseChar + nsm + baseChar + spaces;
    findIndices(input, matcher, List.of());
    // SSNNSS
    input = spaces + nsm + nsm + spaces;
    findIndices(input, matcher, List.of());
    // SSNBBBBNSS
    input = spaces + nsm + baseChar + baseChar + nsm + spaces;
    findIndices(input, matcher, List.of());
}

// Resets the matcher onto the given input, collects the start index of
// every successive match, and asserts that the collected indexes equal
// the expected list (same order, same length).
private static void findIndices(String input, Matcher matcher, List<Integer> expected) {
    List<Integer> starts = new ArrayList<>();
    // reset() returns the same matcher, now positioned at the start of input
    for (Matcher m = matcher.reset(input); m.find(); ) {
        starts.add(m.start());
    }
    assertEquals(starts, expected);
}

// This test is for 6284152
private static void check(String regex, String input, String[] expected) {
List<String> result = new ArrayList<>();
Expand Down