Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/core/regex/include/sourcemeta/core/regex.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,22 @@ SOURCEMETA_CORE_REGEX_EXPORT
auto matches_if_valid(const std::string_view pattern,
const std::string_view value) -> bool;

/// @ingroup regex
///
/// Check whether the given string is a valid ECMA-262 regular expression. For
Comment thread
jviotti marked this conversation as resolved.
/// example:
///
/// ```cpp
/// #include <sourcemeta/core/regex.h>
/// #include <cassert>
///
/// assert(sourcemeta::core::is_regex_ecma("([abc])+\\s+$"));
/// assert(!sourcemeta::core::is_regex_ecma("^(abc]"));
/// assert(!sourcemeta::core::is_regex_ecma("\\a"));
/// ```
///
/// @param pattern The candidate regular expression to validate
/// @return `true` if the pattern is accepted, `false` otherwise
SOURCEMETA_CORE_REGEX_EXPORT
auto is_regex_ecma(const std::string_view pattern) -> bool;

} // namespace sourcemeta::core

#endif
172 changes: 154 additions & 18 deletions src/core/regex/preprocess.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <optional> // std::optional
#include <string> // std::string
#include <string_view> // std::string_view
#include <type_traits> // std::conditional_t
#include <utility> // std::pair

namespace sourcemeta::core {
Expand Down Expand Up @@ -636,14 +637,63 @@ inline auto find_shorthand(char escape) -> const ShorthandExpansion * {

} // namespace

template <bool CheckECMA>
inline auto preprocess_regex(const std::string &pattern)
-> std::optional<std::string> {
-> std::conditional_t<CheckECMA, bool, std::optional<std::string>> {
std::string result;
result.reserve(pattern.size() * 2);
if constexpr (!CheckECMA) {
result.reserve(pattern.size() * 2);
}
bool in_class = false;

for (std::size_t position = 0; position < pattern.size(); ++position) {
const char current = pattern[position];

// Reject group constructs that are not part of ECMA-262
if (current == '(' && !is_escaped(pattern, position) && !in_class &&
position + 1 < pattern.size()) {
// (*...) backtracking control verbs (PCRE-only)
if (pattern[position + 1] == '*') {
if constexpr (CheckECMA) {
return false;
} else {
return std::nullopt;
}
}
// (?X...): ECMA only allows (?: (?= (?! (?<= (?<! (?<name>
if (pattern[position + 1] == '?' && position + 2 < pattern.size()) {
const char third = pattern[position + 2];
bool valid_extension{third == ':' || third == '=' || third == '!'};
if (!valid_extension && third == '<' && position + 3 < pattern.size()) {
const char fourth = pattern[position + 3];
valid_extension = fourth == '=' || fourth == '!' ||
(fourth >= 'a' && fourth <= 'z') ||
(fourth >= 'A' && fourth <= 'Z') || fourth == '_' ||
fourth == '$';
}
if (!valid_extension) {
if constexpr (CheckECMA) {
return false;
} else {
return std::nullopt;
}
}
}
}

// Reject possessive quantifiers (PCRE-only): *+ ++ ?+
if (current == '+' && position > 0 && !in_class) {
const char prev = pattern[position - 1];
if ((prev == '*' || prev == '+' || prev == '?') &&
!is_escaped(pattern, position - 1)) {
if constexpr (CheckECMA) {
return false;
} else {
return std::nullopt;
}
}
}

if (current == '[' && !is_escaped(pattern, position) && !in_class) {
// Find end both ways and check which applies
const auto simple_end = find_bracket_end(pattern, position + 1, false);
Expand All @@ -652,6 +702,16 @@ inline auto preprocess_regex(const std::string &pattern)
const auto nested_content =
pattern.substr(position + 1, nested_end - position - 2);

// Reject POSIX-style character classes like [[:alpha:]] (PCRE-only)
if (nested_content.size() >= 4 && nested_content[0] == '[' &&
nested_content[1] == ':' && nested_content.ends_with(":]")) {
if constexpr (CheckECMA) {
return false;
} else {
return std::nullopt;
}
}

// Check for v-flag operators in nested content
const bool nested_has_ops =
nested_content.contains("--") || nested_content.contains("&&");
Expand Down Expand Up @@ -680,10 +740,16 @@ inline auto preprocess_regex(const std::string &pattern)
if (use_v_flag) {
const auto expanded = expand_char_class(nested_content);
if (!expanded) {
return std::nullopt;
if constexpr (CheckECMA) {
return false;
} else {
return std::nullopt;
}
}

result += *expanded;
if constexpr (!CheckECMA) {
result += *expanded;
}
position = nested_end - 1;
continue;
}
Expand All @@ -696,41 +762,81 @@ inline auto preprocess_regex(const std::string &pattern)
}

if (current != '\\' || position + 1 >= pattern.size()) {
result += current;
if constexpr (!CheckECMA) {
result += current;
}
continue;
}

const char next = pattern[position + 1];
if (std::string_view{"\\[]^$"}.contains(next)) {
result += current;
result += next;
if constexpr (!CheckECMA) {
result += current;
result += next;
}
++position;
continue;
}

if (next == 'u' && position + 2 < pattern.size()) {
if (pattern[position + 2] == '{') {
result += "\\x{";
if constexpr (!CheckECMA) {
result += "\\x{";
}
for (position += 3;
position < pattern.size() && pattern[position] != '}';
++position) {
result += pattern[position];
if constexpr (!CheckECMA) {
result += pattern[position];
}
}

if (position < pattern.size()) {
result += '}';
if constexpr (!CheckECMA) {
result += '}';
}
}

continue;
}

if (position + 5 < pattern.size() && all_hex(pattern, position + 2, 4)) {
result += "\\x{" + pattern.substr(position + 2, 4) + '}';
if constexpr (!CheckECMA) {
result += "\\x{" + pattern.substr(position + 2, 4) + '}';
}
position += 5;
continue;
}
}

// Named backreference \k<name> (ECMA-262 since 2018)
if (next == 'k' && position + 2 < pattern.size() &&
pattern[position + 2] == '<') {
if constexpr (!CheckECMA) {
result += "\\k<";
}
const auto name_start = position + 3;
position = name_start;
while (position < pattern.size() && pattern[position] != '>') {
if constexpr (!CheckECMA) {
result += pattern[position];
}
++position;
}
// Reject unterminated \k<name or empty name \k<>
if (position >= pattern.size() || position == name_start) {
if constexpr (CheckECMA) {
return false;
} else {
return std::nullopt;
}
}
if constexpr (!CheckECMA) {
result += '>';
}
continue;
}

if ((next == 'p' || next == 'P') && position + 2 < pattern.size() &&
pattern[position + 2] == '{') {
const auto start = position;
Expand All @@ -742,32 +848,62 @@ inline auto preprocess_regex(const std::string &pattern)

if (position < pattern.size()) {
if (auto translated = translate_property(name, next == 'P')) {
result += *translated;
if constexpr (!CheckECMA) {
result += *translated;
}
} else {
result += pattern.substr(start, position - start + 1);
if constexpr (!CheckECMA) {
result += pattern.substr(start, position - start + 1);
}
}
} else {
position = start;
result += current;
if constexpr (!CheckECMA) {
result += current;
}
}

continue;
}

if (const auto *expansion = find_shorthand(next)) {
if (in_class && expansion->inside_class.empty()) {
result += std::string{current} + next;
if constexpr (!CheckECMA) {
result += std::string{current} + next;
}
} else {
result += in_class ? expansion->inside_class : expansion->outside_class;
if constexpr (!CheckECMA) {
result +=
in_class ? expansion->inside_class : expansion->outside_class;
}
}

++position;
} else {
result += current;
// Reject escape sequences that are not valid in ECMA-262 strict mode
constexpr std::string_view ecma_remaining_escapes{"tnrfvcx0"};
const bool is_ecma_escape{ecma_remaining_escapes.contains(next) ||
v_flag_syntax.contains(next) ||
(next >= '1' && next <= '9')};
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
if (!is_ecma_escape) {
if constexpr (CheckECMA) {
return false;
} else {
return std::nullopt;
}
}

if constexpr (!CheckECMA) {
result += current;
}
}
}

return result;
if constexpr (CheckECMA) {
return true;
} else {
return result;
}
}

} // namespace sourcemeta::core
Expand Down
25 changes: 24 additions & 1 deletion src/core/regex/regex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ auto to_regex(const std::string_view pattern) -> std::optional<Regex> {
return RegexTypeRange{minimum, maximum};
}

const auto pcre2_pattern{preprocess_regex(std::string{pattern})};
const auto pcre2_pattern{preprocess_regex<false>(std::string{pattern})};
if (!pcre2_pattern.has_value()) {
return std::nullopt;
}
Expand Down Expand Up @@ -115,4 +115,27 @@ auto matches_if_valid(const std::string_view pattern,
return regex.has_value() && matches(regex.value(), value);
}

// Report whether `pattern` survives both the preprocessing step and a PCRE2
// compilation. Nothing is matched here: the compiled pattern is discarded.
auto is_regex_ecma(const std::string_view pattern) -> bool {
// Convert to the PCRE2 form; no value means the preprocessor rejected it
const auto pcre2_pattern{preprocess_regex<false>(std::string{pattern})};
if (!pcre2_pattern.has_value()) {
return false;
}

// Passed to pcre2_compile as out-parameters, but not inspected afterwards
int pcre2_error_code{0};
PCRE2_SIZE pcre2_error_offset{0};
pcre2_code *pcre2_regex_raw{pcre2_compile(
reinterpret_cast<PCRE2_SPTR>(pcre2_pattern.value().c_str()),
pcre2_pattern.value().size(),
PCRE2_UTF | PCRE2_UCP | PCRE2_NO_AUTO_CAPTURE | PCRE2_DOTALL |
Copy link
Copy Markdown

@augmentcode augmentcode Bot May 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

src/core/regex/regex.cc:129 — is_regex_ecma() currently delegates validity to PCRE2 compilation, which can accept PCRE-specific constructs that aren’t part of ECMA-262 (e.g., inline option groups / atomic groups). If this API is intended as an ECMA-262 validator, this may lead to false positives (returning true for non-ECMA patterns).

Severity: medium

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

PCRE2_DOLLAR_ENDONLY | PCRE2_NEVER_BACKSLASH_C | PCRE2_NO_UTF_CHECK,
&pcre2_error_code, &pcre2_error_offset, nullptr)};

// A null result is treated as an invalid pattern
if (pcre2_regex_raw == nullptr) {
return false;
}

// Only the success of compilation matters, so release the result right away
pcre2_code_free(pcre2_regex_raw);
return true;
}

} // namespace sourcemeta::core
1 change: 1 addition & 0 deletions test/regex/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME regex
regex_matches_if_valid_test.cc
regex_matches_ecma262_test.cc
regex_matches_rfc9485_test.cc
regex_is_ecma_test.cc
regex_to_regex_test.cc
regex_test.cc)

Expand Down
Loading
Loading