From 18a96f160a952514213e928d96bfa67b69b79b4e Mon Sep 17 00:00:00 2001 From: aizu-m Date: Sat, 20 Jun 2026 20:24:23 +0530 Subject: [PATCH 1/3] reject out-of-range quantifier counts in regex parser --- .../xmlbeans/impl/regex/RegexParser.java | 8 ++++---- .../misc/checkin/RegularExpressionTest.java | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java b/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java index 0ef108a97..9054f3226 100644 --- a/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java +++ b/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java @@ -604,9 +604,9 @@ Token parseFactor() throws ParseException { min = ch -'0'; while (off < this.regexlen && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { - min = min*10 +ch-'0'; - if (min < 0) + if (min > (Integer.MAX_VALUE - (ch-'0')) / 10) throw ex("parser.quantifier.5", this.offset); + min = min*10 +ch-'0'; } } else { @@ -625,9 +625,9 @@ else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { while (off < this.regexlen && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { - max = max*10 +ch-'0'; - if (max < 0) + if (max > (Integer.MAX_VALUE - (ch-'0')) / 10) throw ex("parser.quantifier.5", this.offset); + max = max*10 +ch-'0'; } if (min > max) diff --git a/src/test/java/misc/checkin/RegularExpressionTest.java b/src/test/java/misc/checkin/RegularExpressionTest.java index c6f54b9de..38861c585 100644 --- a/src/test/java/misc/checkin/RegularExpressionTest.java +++ b/src/test/java/misc/checkin/RegularExpressionTest.java @@ -15,12 +15,14 @@ package misc.checkin; +import org.apache.xmlbeans.impl.regex.ParseException; import org.apache.xmlbeans.impl.regex.RegularExpression; import org.junit.jupiter.api.Test; import java.util.Random; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; import static 
org.junit.jupiter.api.Assertions.assertTrue; public class RegularExpressionTest { @@ -44,6 +46,23 @@ void testLookbehindRangeAtInputEnd() { assertFalse(new RegularExpression("x(?<=[a-c])").matches("xc")); } + @Test + void testQuantifierOverflow() { + // a {min,max} count larger than Integer.MAX_VALUE overflowed the int + // accumulator. the only guard was a post-multiply min<0/max<0 check, so + // counts that wrapped to a non-negative value slipped through: "a{4294967296}" + // parsed as "a{0}" (matched the empty string) and "a{1,4294967298}" as "a{1,2}", + // while bigger ones such as "a{99999999999}" blew the heap at match time. + assertThrows(ParseException.class, () -> new RegularExpression("a{4294967296}")); + assertThrows(ParseException.class, () -> new RegularExpression("a{4294967297}")); + assertThrows(ParseException.class, () -> new RegularExpression("a{99999999999}")); + assertThrows(ParseException.class, () -> new RegularExpression("a{1,4294967298}")); + // counts up to Integer.MAX_VALUE are representable and must still parse + new RegularExpression("a{2147483647}"); + new RegularExpression("a{0,2147483647}"); + assertTrue(new RegularExpression("a{2,4}").matches("aaa")); + } + private static final String AB = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; private static final Random rnd = new Random(); From 0230cea9c7f5e255c18a20165cdc38541d6f56a8 Mon Sep 17 00:00:00 2001 From: PJ Fanning Date: Sun, 21 Jun 2026 13:21:28 +0100 Subject: [PATCH 2/3] refactor --- .../java/org/apache/xmlbeans/impl/regex/RegexParser.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java b/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java index 9054f3226..5a8e813e8 100644 --- a/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java +++ b/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java @@ -73,7 +73,7 @@ static class ReferencePosition { int context = S_NORMAL; int 
parennumber = 1; boolean hasBackReferences; - Vector references = null; + Vector<ReferencePosition> references = null; public RegexParser() { this.setLocale(Locale.getDefault()); @@ -112,7 +112,7 @@ synchronized Token parse(String regex, int options) throws ParseException { throw ex("parser.parse.1", this.offset); if (this.references != null) { for (int i = 0; i < this.references.size(); i ++) { - ReferencePosition position = (ReferencePosition)this.references.elementAt(i); + ReferencePosition position = this.references.elementAt(i); if (this.parennumber <= position.refNumber) throw ex("parser.parse.2", position.position); } @@ -431,7 +431,7 @@ Token processCondition() throws ParseException { if ('1' <= ch && ch <= '9') { refno = ch-'0'; this.hasBackReferences = true; - if (this.references == null) this.references = new Vector(); + if (this.references == null) this.references = new Vector<>(); this.references.addElement(new ReferencePosition(refno, this.offset)); this.offset ++; if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset); @@ -543,7 +543,7 @@ Token processBackreference() throws ParseException { int refnum = this.chardata-'0'; Token tok = Token.createBackReference(refnum); this.hasBackReferences = true; - if (this.references == null) this.references = new Vector(); + if (this.references == null) this.references = new Vector<>(); this.references.addElement(new ReferencePosition(refnum, this.offset-2)); this.next(); return tok; From a27d2d7c98a1e4ee8deff100a8216cfb3e24f60e Mon Sep 17 00:00:00 2001 From: PJ Fanning Date: Sun, 21 Jun 2026 13:41:42 +0100 Subject: [PATCH 3/3] refactor --- .../xmlbeans/impl/regex/RegexParser.java | 163 +++++++++--------- 1 file changed, 83 insertions(+), 80 deletions(-) diff --git a/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java b/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java index 5a8e813e8..5202de2c9 100644 --- a/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java +++ 
b/src/main/java/org/apache/xmlbeans/impl/regex/RegexParser.java @@ -976,89 +976,92 @@ int decodeEscaped() throws ParseException { case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009 //case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B - case 'x': - this.next(); - if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); - if (this.chardata == '{') { - int v1 = 0; - int uv = 0; - do { - this.next(); - if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); - if ((v1 = hexChar(this.chardata)) < 0) - break; - if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); - uv = uv*16+v1; - } while (true); - if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1); - if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); - c = uv; - } else { - int v1 = 0; - if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - int uv = v1; - this.next(); - if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - uv = uv*16+v1; - c = uv; - } - break; - - case 'u': - int v1 = 0; - this.next(); - if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - int uv = v1; - this.next(); - if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - uv = uv*16+v1; - this.next(); - if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - uv = uv*16+v1; - this.next(); - if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - uv = uv*16+v1; - c = uv; - break; - - case 'v': - this.next(); - if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - uv = v1; - this.next(); - if (this.read() != 
T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - uv = uv*16+v1; - this.next(); - if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - uv = uv*16+v1; - this.next(); - if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - uv = uv*16+v1; - this.next(); - if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - uv = uv*16+v1; - this.next(); - if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) - throw ex("parser.descape.1", this.offset-1); - uv = uv*16+v1; - if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1); - c = uv; - break; + case 'x': { + this.next(); + if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset - 1); + if (this.chardata == '{') { + int v1 = 0; + int uv = 0; + do { + this.next(); + if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset - 1); + if ((v1 = hexChar(this.chardata)) < 0) + break; + if (uv > uv * 16) throw ex("parser.descape.2", this.offset - 1); + uv = uv * 16 + v1; + } while (true); + if (this.chardata != '}') throw ex("parser.descape.3", this.offset - 1); + if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset - 1); + c = uv; + } else { + int v1 = 0; + if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + int uv = v1; + this.next(); + if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + uv = uv * 16 + v1; + c = uv; + } + break; + } + case 'u': { + int v1 = 0; + this.next(); + if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + int uv1 = v1; + this.next(); + if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + 
uv1 = uv1 * 16 + v1; + this.next(); + if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + uv1 = uv1 * 16 + v1; + this.next(); + if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + uv1 = uv1 * 16 + v1; + c = uv1; + break; + } + case 'v': { + int v2 = 0; + int uv2 = 0; + this.next(); + if (this.read() != T_CHAR || (v2 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + uv2 = v2; + this.next(); + if (this.read() != T_CHAR || (v2 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + uv2 = uv2 * 16 + v2; + this.next(); + if (this.read() != T_CHAR || (v2 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + uv2 = uv2 * 16 + v2; + this.next(); + if (this.read() != T_CHAR || (v2 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + uv2 = uv2 * 16 + v2; + this.next(); + if (this.read() != T_CHAR || (v2 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + uv2 = uv2 * 16 + v2; + this.next(); + if (this.read() != T_CHAR || (v2 = hexChar(this.chardata)) < 0) + throw ex("parser.descape.1", this.offset - 1); + uv2 = uv2 * 16 + v2; + if (uv2 > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset - 1); + c = uv2; + break; + } case 'A': case 'Z': case 'z': - throw ex("parser.descape.5", this.offset-2); + throw ex("parser.descape.5", this.offset-2); default: } return c;