Patch: make the regex word-boundary check (Pattern.Bound) step by code point
when looking for the base character of a non spacing mark, so that marks
following a surrogate pair are handled; adds regression tests to RegExTest.

NOTE(review): this patch was re-flowed from a whitespace-flattened copy.
Leading indentation and the two-space "spaces" literals were re-derived from
brace structure and from the expected match indices; confirm against the
target files before applying. Blob ids on the "index" lines are carried over
unchanged.

diff --git a/src/java.base/share/classes/java/util/regex/Pattern.java b/src/java.base/share/classes/java/util/regex/Pattern.java
index 58c9186924b5a..fc4f1a60f81a3 100644
--- a/src/java.base/share/classes/java/util/regex/Pattern.java
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java
@@ -5572,16 +5572,12 @@ int check(Matcher matcher, int i, CharSequence seq) {
             }
             if (i > startIndex) {
                 ch = Character.codePointBefore(seq, i);
-                left = (isWord(ch) ||
-                    ((Character.getType(ch) == Character.NON_SPACING_MARK)
-                     && hasBaseCharacter(matcher, i-1, seq)));
+                left = isWord(ch) || isNonSpacingMark(matcher, seq, ch, i - Character.charCount(ch));
             }
             boolean right = false;
             if (i < endIndex) {
                 ch = Character.codePointAt(seq, i);
-                right = (isWord(ch) ||
-                    ((Character.getType(ch) == Character.NON_SPACING_MARK)
-                     && hasBaseCharacter(matcher, i, seq)));
+                right = isWord(ch) || isNonSpacingMark(matcher, seq, ch, i);
             } else {
                 // Tried to access char past the end
                 matcher.hitEnd = true;
@@ -5590,30 +5586,51 @@ int check(Matcher matcher, int i, CharSequence seq) {
             }
             return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
         }
-        boolean match(Matcher matcher, int i, CharSequence seq) {
-            return (check(matcher, i, seq) & type) > 0
-                && next.match(matcher, i, seq);
+
+        /**
+         * Tests whether {@code ch} is a non spacing mark that has a base
+         * character, in which case it counts as a word character.
+         *
+         * @param i index in {@code seq} at which {@code ch} starts; the code
+         *          points before it are searched for the base character
+         */
+        private boolean isNonSpacingMark(Matcher matcher, CharSequence seq, int ch, int i) {
+            return (Character.getType(ch) == Character.NON_SPACING_MARK)
+                    && hasBaseCharacter(matcher, i, seq);
         }
-    }
 
-    /**
-     * Non spacing marks only count as word characters in bounds calculations
-     * if they have a base character.
-     */
-    private static boolean hasBaseCharacter(Matcher matcher, int i,
-                                            CharSequence seq)
-    {
-        int start = (!matcher.transparentBounds) ?
-            matcher.from : 0;
-        for (int x=i; x >= start; x--) {
-            int ch = Character.codePointAt(seq, x);
-            if (Character.isLetterOrDigit(ch))
-                return true;
-            if (Character.getType(ch) == Character.NON_SPACING_MARK)
-                continue;
+        /**
+         * Non spacing marks only count as word characters in bounds calculations
+         * if they have a base character.
+         *
+         * Scans backwards by code point from index {@code i}, skipping non
+         * spacing marks, and returns true if the first other code point is a
+         * word character. The scan does not go below {@code matcher.from}
+         * unless transparent bounds are in use, in which case it may reach 0.
+         */
+        private boolean hasBaseCharacter(Matcher matcher, int i,
+                                         CharSequence seq)
+        {
+            int start = (!matcher.transparentBounds) ?
+                matcher.from : 0;
+            for (int x=i; x > start; ) {
+                int ch = Character.codePointBefore(seq, x);
+                if (isWord(ch))
+                    return true;
+                if (Character.getType(ch) == Character.NON_SPACING_MARK) {
+                    // step back over the whole code point (one or two chars)
+                    x -= Character.charCount(ch);
+                    continue;
+                }
+                return false;
+            }
             return false;
         }
-        return false;
+
+        boolean match(Matcher matcher, int i, CharSequence seq) {
+            return (check(matcher, i, seq) & type) > 0
+                && next.match(matcher, i, seq);
+        }
     }
 
     /**
diff --git a/test/jdk/java/util/regex/RegExTest.java b/test/jdk/java/util/regex/RegExTest.java
index 2bd2dc3e38d2f..1112096d58c39 100644
--- a/test/jdk/java/util/regex/RegExTest.java
+++ b/test/jdk/java/util/regex/RegExTest.java
@@ -36,7 +36,7 @@
  * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
  * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
  * 8216332 8214245 8237599 8241055 8247546 8258259 8037397 8269753 8276694
- * 8280403 8264160 8281315 8305107
+ * 8280403 8264160 8281315 8305107 8384082
  * @library /test/lib
  * @library /lib/testlibrary/java/lang
  * @build jdk.test.lib.RandomFactory
@@ -444,6 +444,43 @@ public static void unicodeWordBoundsTest() {
         twoFindIndexes(input, matcher, 3, 6);
     }
 
+    @Test
+    public static void unicodeWordBoundsTestUnicodeCharacterClass() {
+        String spaces = "  ";
+        String wordChar = "a";
+        String nsm = "\u030a";
+
+        assert (Character.getType('\u030a') == Character.NON_SPACING_MARK);
+
+        Pattern pattern = Pattern.compile("\\b", Pattern.UNICODE_CHARACTER_CLASS);
+        Matcher matcher = pattern.matcher("");
+        // S=other B=word character N=non spacing mark .=word boundary
+        // SS.BB.SS
+        String input = spaces + wordChar + wordChar + spaces;
+        twoFindIndexes(input, matcher, 2, 4);
+        // SS.BBN.SS
+        input = spaces + wordChar + wordChar + nsm + spaces;
+        twoFindIndexes(input, matcher, 2, 5);
+        // SS.BN.SS
+        input = spaces + wordChar + nsm + spaces;
+        twoFindIndexes(input, matcher, 2, 4);
+        // SS.BNN.SS
+        input = spaces + wordChar + nsm + nsm + spaces;
+        twoFindIndexes(input, matcher, 2, 5);
+        // SS.NBB.SS
+        input = spaces + nsm + wordChar + wordChar + spaces;
+        twoFindIndexes(input, matcher, 2, 5);
+        // SS.BNB.SS
+        input = spaces + wordChar + nsm + wordChar + spaces;
+        twoFindIndexes(input, matcher, 2, 5);
+        // SS.NN.SS
+        input = spaces + nsm + nsm + spaces;
+        twoFindIndexes(input, matcher, 2, 4);
+        // SS.NBBN.SS
+        input = spaces + nsm + wordChar + wordChar + nsm + spaces;
+        twoFindIndexes(input, matcher, 2, 6);
+    }
+
     private static void twoFindIndexes(String input, Matcher matcher, int a,
                                        int b)
     {
@@ -454,6 +491,94 @@ private static void twoFindIndexes(String input, Matcher matcher, int a,
         assertEquals(matcher.start(), b);
     }
 
+    // This test is for 8384082
+    // Check to see if word boundary construct properly handles unicode
+    // non spacing marks after surrogate pairs
+    @Test
+    public static void unicodeWordBoundsTestSurrogatePairUnicodeCharacterClass() {
+        String spaces = "  ";
+        String baseChar = "\uD835\uDC00";
+        String nsm = "\u030a";
+
+        assert (Character.getType('\u030a') == Character.NON_SPACING_MARK);
+
+        Pattern pattern = Pattern.compile("\\b", Pattern.UNICODE_CHARACTER_CLASS);
+        Matcher matcher = pattern.matcher("");
+        // S=other B=character N=non spacing mark .=word boundary
+        // SS.BBBB.SS
+        String input = spaces + baseChar + baseChar + spaces;
+        findIndices(input, matcher, List.of(2, 6));
+        // SS.BBBBN.SS
+        input = spaces + baseChar + baseChar + nsm + spaces;
+        findIndices(input, matcher, List.of(2, 7));
+        // SS.BBN.SS
+        input = spaces + baseChar + nsm + spaces;
+        findIndices(input, matcher, List.of(2, 5));
+        // SS.BBNN.SS
+        input = spaces + baseChar + nsm + nsm + spaces;
+        findIndices(input, matcher, List.of(2, 6));
+        // SS.NBBBB.SS
+        input = spaces + nsm + baseChar + baseChar + spaces;
+        findIndices(input, matcher, List.of(2, 7));
+        // SS.BBNBB.SS
+        input = spaces + baseChar + nsm + baseChar + spaces;
+        findIndices(input, matcher, List.of(2, 7));
+        // SS.NN.SS
+        input = spaces + nsm + nsm + spaces;
+        findIndices(input, matcher, List.of(2, 4));
+        // SS.NBBBBN.SS
+        input = spaces + nsm + baseChar + baseChar + nsm + spaces;
+        findIndices(input, matcher, List.of(2, 8));
+    }
+
+    @Test
+    public static void unicodeWordBoundsTestSurrogatePair() {
+        String spaces = "  ";
+        String baseChar = "\uD835\uDC00";
+        String nsm = "\u030a";
+
+        assert (Character.getType('\u030a') == Character.NON_SPACING_MARK);
+
+        Pattern pattern = Pattern.compile("\\b");
+        Matcher matcher = pattern.matcher("");
+        // S=other B=character N=non spacing mark .=word boundary
+        // SSBBBBSS
+        String input = spaces + baseChar + baseChar + spaces;
+        findIndices(input, matcher, List.of());
+        // SSBBBBNSS
+        input = spaces + baseChar + baseChar + nsm + spaces;
+        findIndices(input, matcher, List.of());
+        // SSBBNSS
+        input = spaces + baseChar + nsm + spaces;
+        findIndices(input, matcher, List.of());
+        // SSBBNNSS
+        input = spaces + baseChar + nsm + nsm + spaces;
+        findIndices(input, matcher, List.of());
+        // SSNBBBBSS
+        input = spaces + nsm + baseChar + baseChar + spaces;
+        findIndices(input, matcher, List.of());
+        // SSBBNBBSS
+        input = spaces + baseChar + nsm + baseChar + spaces;
+        findIndices(input, matcher, List.of());
+        // SSNNSS
+        input = spaces + nsm + nsm + spaces;
+        findIndices(input, matcher, List.of());
+        // SSNBBBBNSS
+        input = spaces + nsm + baseChar + baseChar + nsm + spaces;
+        findIndices(input, matcher, List.of());
+    }
+
+    // Resets the matcher to the input, collects the start index of every
+    // successive find(), and checks the collected list against expected.
+    private static void findIndices(String input, Matcher matcher, List<Integer> expected) {
+        matcher.reset(input);
+        List<Integer> indices = new ArrayList<>();
+        while (matcher.find()) {
+            indices.add(matcher.start());
+        }
+        assertEquals(indices, expected);
+    }
+
     // This test is for 6284152
     private static void check(String regex, String input, String[] expected) {
         List result = new ArrayList<>();