Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/sre_engine/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ rustpython-wtf8 = { workspace = true }
num_enum = { workspace = true }
bitflags = { workspace = true }
optional = { workspace = true }
icu_properties = { workspace = true }

[dev-dependencies]
criterion = { workspace = true }
Expand Down
6 changes: 5 additions & 1 deletion crates/sre_engine/src/string.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use icu_properties::props::{CanonicalCombiningClass, EnumeratedProperty};
use rustpython_wtf8::Wtf8;

#[derive(Debug, Clone, Copy)]
Expand Down Expand Up @@ -443,7 +444,10 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool {
pub(crate) fn is_uni_alnum(ch: u32) -> bool {
// TODO: check with cpython
char::try_from(ch)
.map(|x| x.is_alphanumeric())
.map(|x| {
x.is_alphanumeric()
&& CanonicalCombiningClass::for_char(x) == CanonicalCombiningClass::NotReordered
})
.unwrap_or(false)
}
#[inline]
Expand Down
9 changes: 7 additions & 2 deletions crates/vm/src/builtins/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ use rustpython_common::{
};

use icu_properties::props::{
BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart,
BidiClass, BinaryProperty, CanonicalCombiningClass, EnumeratedProperty, GeneralCategory,
XidContinue, XidStart,
};
use unicode_casing::CharExt;

Expand Down Expand Up @@ -946,7 +947,11 @@ impl PyStr {

#[pymethod]
fn isalnum(&self) -> bool {
!self.data.is_empty() && self.char_all(char::is_alphanumeric)
!self.data.is_empty()
&& self.char_all(|c| {
c.is_alphanumeric()
&& CanonicalCombiningClass::for_char(c) == CanonicalCombiningClass::NotReordered
})
Comment on lines 949 to +954
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, examine the actual code in str.rs around the isalnum implementation
cd /tmp && find . -name "str.rs" -path "*/vm/src/builtins/*" 2>/dev/null | head -5

Repository: RustPython/RustPython

Length of output: 47


🏁 Script executed:

find . -type f -name "str.rs" | grep -E "(builtins|vm)" | head -10

Repository: RustPython/RustPython

Length of output: 96


🏁 Script executed:

# List the codebase structure
git ls-files | grep -E "(str\.rs|string\.rs)" | head -20

Repository: RustPython/RustPython

Length of output: 271


🏁 Script executed:

# Check the actual implementation at the mentioned lines
git show HEAD:crates/vm/src/builtins/str.rs | sed -n '940,960p'

Repository: RustPython/RustPython

Length of output: 675


🏁 Script executed:

# Check the sre_engine code mentioned in the review
git show HEAD:crates/sre_engine/src/string.rs | sed -n '440,460p'

Repository: RustPython/RustPython

Length of output: 668


🏁 Script executed:

# Check the test file mentioned
git show HEAD:extra_tests/snippets/builtin_str.py | sed -n '70,90p'

Repository: RustPython/RustPython

Length of output: 598


🏁 Script executed:

# Check imports in str.rs to see what's available
git show HEAD:crates/vm/src/builtins/str.rs | sed -n '1,60p'

Repository: RustPython/RustPython

Length of output: 2247


🌐 Web query:

U+093F DEVANAGARI VOWEL SIGN I Unicode canonical combining class properties

💡 Result:

The Unicode character U+093F DEVANAGARI VOWEL SIGN I has Canonical Combining Class 0 (Not Reordered).

Citations:


🌐 Web query:

U+0303 combining tilde canonical combining class general category

💡 Result:

The Unicode character U+0303 (Combining Tilde) has: - Canonical Combining Class: Above (230) - General Category: Nonspacing Mark (Mn)

Citations:


🌐 Web query:

CPython str.isalnum() implementation definition Unicode categories

💡 Result:

The CPython implementation of str.isalnum determines if all characters in the string are alphanumeric (letters or numbers) and the string is non-empty. It uses Unicode general categories: a character is alphanumeric if it is alphabetic (general categories L: Letter, specifically "Lm", "Lt", "Lu", "Ll", or "Lo") OR numeric (specifically Decimal_Digit "Nd", though isalnum combines alpha or digit properties). From official documentation: Alphabetic characters are those with general category property being one of "Lm", "Lt", "Lu", "Ll", or "Lo". For isdigit, numeric characters are those with Numeric_Type=Digit (Nd), Decimal (Nd), or Numeric (No, Nl). isalnum is effectively isalpha or isdigit. The C implementation is in Objects/unicodectype.c using _PyUnicode_TypeRecord flags: ALPHA_MASK (0x01) for categories Ll/Lu/Lt/Lo/Lm, and DIGIT_MASK or DECIMAL_MASK for numeric. The function likely checks (ctype->flags & (ALPHA_MASK | DIGIT_MASK)) != 0, though exact isalnum function not directly shown, but follows from isalpha and isdigit patterns and Py_UNICODE_ISALNUM macro. For bytes.isalnum, it uses Py_ISALNUM from pyctype.h table for ASCII. Sources confirm via unicodedata lookups and generated tables from Unicode data.

Citations:


🏁 Script executed:

# Check the actual combining classes of test characters
python3 << 'EOF'
import unicodedata

test_chars = [
    ('\u0303', 'U+0303 COMBINING TILDE'),
    ('\u0345', 'U+0345 COMBINING GREEK YPOGEGRAMMENI'),
    ('\u0363', 'U+0363 COMBINING LATIN SMALL LETTER A'),
    ('\u093F', 'U+093F DEVANAGARI VOWEL SIGN I'),
]

for char, name in test_chars:
    cat = unicodedata.category(char)
    comb_class = unicodedata.combining(char)
    print(f"{name}: category={cat}, combining_class={comb_class}")
EOF

Repository: RustPython/RustPython

Length of output: 327


🏁 Script executed:

# Check if Rust's is_alphanumeric matches Unicode Alphabetic property
# First check what tests actually verify
git show HEAD:extra_tests/snippets/builtin_str.py | sed -n '76,95p'

Repository: RustPython/RustPython

Length of output: 601


🏁 Script executed:

# Check if there are any other tests or comments about combining marks
git show HEAD:crates/vm/src/builtins/str.rs | grep -A 5 -B 5 "combining\|Combining"

Repository: RustPython/RustPython

Length of output: 708


CanonicalCombiningClass::NotReordered filtering is incomplete for CPython parity.

CPython defines str.isalnum() as isalpha() || isdecimal() || isdigit() || isnumeric(), where isalpha() accepts only Unicode letter categories Lm/Lt/Lu/Ll/Lo. Rust's char::is_alphanumeric() is broader, including Alphabetic property which covers combining marks. Unicode class-zero (NotReordered) marks include spacing combining marks like U+093F (DEVANAGARI VOWEL SIGN I), which has category Mc and combining_class=0. The current filter admits such marks despite CPython rejecting them. Tests cover only non-class-zero combining marks (U+0303/U+0345/U+0363..U+036F, all with combining_class=230 or 240), missing this gap. Switch to a CPython-compatible category check (is_letter_category(L*) || is_digit_category(N*)) and add a regression test for U+093F. The mirror predicate in crates/sre_engine/src/string.rs:444-452 (which has a TODO comment) should be updated in lockstep to keep str.isalnum() and \w aligned.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@crates/vm/src/builtins/str.rs` around lines 949 - 954, Replace the current
isalnum implementation (which uses char::is_alphanumeric plus
CanonicalCombiningClass check) with a CPython-compatible predicate that accepts
only Unicode letter categories (Lu/Ll/Lt/Lm/Lo) or number categories (Nd/Nl/Np)
— i.e., check the character's general category starts with 'L' or 'N' — so
spacing/combing marks like U+093F (Mc) are rejected; update the isalnum method
(and the char_all predicate it uses) to use this category test, update the
mirror predicate used for \w in the sre_engine string predicate to the same
logic so they stay aligned, and add a regression test asserting str.isalnum()
returns false for U+093F.

}

#[pymethod]
Expand Down
9 changes: 9 additions & 0 deletions extra_tests/snippets/builtin_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,15 @@
# assert "DZ".title() == "Dz"
assert a.isalpha()

# Combining characters differ slightly between Rust and Python
assert "\u006e".isalnum()
assert not "\u0303".isalnum()
assert not "\u006e\u0303".isalnum()
assert "\u00f1".isalnum()
Copy link
Copy Markdown
Contributor

@ShaharNaveh ShaharNaveh Apr 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it may be worth opening a PR for CPython to test those changes to ensure there's no regression on their end as well (it will get to us as well)

assert not "\u0345".isalnum()
for raw in range(0x0363, 0x036F):
assert not chr(raw).isalnum()

s = "1 2 3"
assert s.split(" ", 1) == ["1", "2 3"]
assert s.rsplit(" ", 1) == ["1 2", "3"]
Expand Down
3 changes: 3 additions & 0 deletions extra_tests/snippets/stdlib_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,6 @@
# Test of fix re.fullmatch POSSESSIVE_REPEAT, issue #7183
assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38")
assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38").group(0) == "1.25.38"

# Combining characters; issue #7518
assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"
Loading