Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion library/core/src/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ pub(crate) use unicode_data::white_space::lookup as White_Space;

pub(crate) mod printable;

mod rt;
#[allow(unreachable_pub)]
mod unicode_data;
pub mod unicode_data;

/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
/// `char` and `str` methods are based on.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! Runtime support for `unicode_data`.

#[inline(always)]
const fn bitset_search<
pub(super) const fn bitset_search<
const N: usize,
const CHUNK_SIZE: usize,
const N1: usize,
Expand Down Expand Up @@ -46,23 +48,23 @@ const fn bitset_search<
}

#[repr(transparent)]
struct ShortOffsetRunHeader(u32);
pub(super) struct ShortOffsetRunHeader(pub(super) u32);

impl ShortOffsetRunHeader {
const fn new(start_index: usize, prefix_sum: u32) -> Self {
pub(super) const fn new(start_index: usize, prefix_sum: u32) -> Self {
assert!(start_index < (1 << 11));
assert!(prefix_sum < (1 << 21));

Self((start_index as u32) << 21 | prefix_sum)
}

#[inline]
const fn start_index(&self) -> usize {
pub(super) const fn start_index(&self) -> usize {
(self.0 >> 21) as usize
}

#[inline]
const fn prefix_sum(&self) -> u32 {
pub(super) const fn prefix_sum(&self) -> u32 {
self.0 & ((1 << 21) - 1)
}
}
Expand All @@ -72,7 +74,7 @@ impl ShortOffsetRunHeader {
/// - The last element of `short_offset_runs` must be greater than `std::char::MAX`.
/// - The start indices of all elements in `short_offset_runs` must be less than `OFFSETS`.
#[inline(always)]
unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
pub(super) unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
needle: char,
short_offset_runs: &[ShortOffsetRunHeader; SOR],
offsets: &[u8; OFFSETS],
Expand Down Expand Up @@ -126,3 +128,30 @@ unsafe fn skip_search<const SOR: usize, const OFFSETS: usize>(
}
offset_idx % 2 == 1
}

/// Shared lookup routine for case-conversion tables.
///
/// Returns the mapping of `c` as up to three `char`s; unused trailing slots are `'\0'`.
///
/// - ASCII input is handed to `ascii_fn` and never consults the tables.
/// - Otherwise `table` is binary-searched on its `char` key (so it must be sorted by key);
///   a miss returns `c` unchanged.
/// - On a hit, a stored `u32` that is a valid Unicode scalar value is the single-`char`
///   result. Any other value has its low 22 bits used as an index into `multi`.
///
/// NOTE(review): the `multi` access is unchecked even though this function is safe, so
/// soundness depends on every caller passing tables whose encoded indices are in bounds
/// for `multi` — confirm all call sites use the generated tables.
#[inline(always)]
pub(super) fn case_conversion(
    c: char,
    ascii_fn: fn(char) -> char,
    table: &[(char, u32)],
    multi: &[[char; 3]],
) -> [char; 3] {
    // Low 22 bits of a non-`char` table value select the entry in `multi`.
    const MULTI_INDEX_BITS: u32 = (1 << 22) - 1;

    if c.is_ascii() {
        return [ascii_fn(c), '\0', '\0'];
    }

    let encoded = match table.binary_search_by_key(&c, |&(key, _)| key) {
        Ok(pos) => table[pos].1,
        Err(_) => return [c, '\0', '\0'],
    };

    if let Some(single) = char::from_u32(encoded) {
        return [single, '\0', '\0'];
    }

    let multi_idx = (encoded & MULTI_INDEX_BITS) as usize;
    // SAFETY: Index comes from statically generated table
    unsafe { *multi.get_unchecked(multi_idx) }
}
2,505 changes: 1,289 additions & 1,216 deletions library/core/src/unicode/unicode_data.rs

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions library/coretests/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@
#![feature(try_find)]
#![feature(try_trait_v2)]
#![feature(uint_bit_width)]
#![feature(unicode_internals)]
#![feature(unsize)]
#![feature(unwrap_infallible)]
// tidy-alphabetical-end
Expand Down
96 changes: 96 additions & 0 deletions library/coretests/tests/unicode.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,101 @@
use core::unicode::unicode_data;
use std::ops::RangeInclusive;

mod test_data;

/// Sanity check: the major component of `core::char::UNICODE_VERSION` is at least 10.
#[test]
pub fn version() {
    let major = core::char::UNICODE_VERSION.0;
    assert!(major >= 10);
}

/// Exhaustively checks a boolean property `lookup` against the expected `ranges`.
///
/// `ranges` must be sorted and non-overlapping. Every `char` from U+0080 upwards is
/// visited: `lookup` must return `true` for each `char` inside a range and `false` for
/// each `char` in the gaps between ranges and after the last one. ASCII is not checked.
///
/// # Panics
///
/// Panics, reporting the offending `char`, on the first mismatch.
#[track_caller]
fn test_boolean_property(ranges: &[RangeInclusive<char>], lookup: fn(char) -> bool) {
    // First `char` that still has to be checked as "outside every range".
    // `None` means a range already reached `char::MAX`, so nothing is left to scan.
    let mut next_unchecked = Some('\u{80}');

    for range in ranges {
        if let Some(gap_start) = next_unchecked {
            for c in gap_start..*range.start() {
                assert!(!lookup(c), "{c:?}");
            }
        }
        for c in range.clone() {
            assert!(lookup(c), "{c:?}");
        }
        // Successor of the range end via `char` stepping. Unlike `from_u32(end + 1)`,
        // this skips the surrogate gap after U+D7FF and yields `None` at `char::MAX`
        // instead of panicking.
        next_unchecked = (*range.end()..=char::MAX).nth(1);
    }

    if let Some(tail_start) = next_unchecked {
        for c in tail_start..=char::MAX {
            assert!(!lookup(c), "{c:?}");
        }
    }
}

/// Exhaustively checks a case-mapping `lookup` against the expected `ranges`.
///
/// `ranges` lists `(key, mapping)` pairs sorted by `key`. Every `char` from U+0080
/// upwards is visited: `lookup(key)` must equal the listed mapping, and every `char`
/// that is not a key must map to itself, i.e. `[c, '\0', '\0']`. ASCII is not checked.
///
/// # Panics
///
/// Panics, reporting the offending `char`, on the first mismatch.
#[track_caller]
fn test_case_mapping(ranges: &[(char, [char; 3])], lookup: fn(char) -> [char; 3]) {
    // First `char` that still has to be checked as "unmapped".
    // `None` means a key already reached `char::MAX`, so nothing is left to scan.
    let mut next_unchecked = Some('\u{80}');

    for &(key, val) in ranges {
        if let Some(gap_start) = next_unchecked {
            for c in gap_start..key {
                assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
            }
        }
        assert_eq!(lookup(key), val, "{key:?}");
        // Successor of `key` via `char` stepping. Unlike `from_u32(key + 1)`, this skips
        // the surrogate gap after U+D7FF and yields `None` at `char::MAX` instead of
        // panicking.
        next_unchecked = (key..=char::MAX).nth(1);
    }

    if let Some(tail_start) = next_unchecked {
        for c in tail_start..=char::MAX {
            assert_eq!(lookup(c), [c, '\0', '\0'], "{c:?}");
        }
    }
}

/// Verifies `unicode_data::alphabetic::lookup` against the expected ranges in
/// `test_data::ALPHABETIC`.
///
/// Skipped under Miri (presumably for speed: the helper visits every non-ASCII `char`).
#[test]
#[cfg_attr(miri, ignore)]
fn alphabetic() {
    let expected = test_data::ALPHABETIC;
    test_boolean_property(expected, unicode_data::alphabetic::lookup);
}

/// Verifies `unicode_data::case_ignorable::lookup` against the expected ranges in
/// `test_data::CASE_IGNORABLE`.
///
/// Skipped under Miri (presumably for speed: the helper visits every non-ASCII `char`).
#[test]
#[cfg_attr(miri, ignore)]
fn case_ignorable() {
    let expected = test_data::CASE_IGNORABLE;
    test_boolean_property(expected, unicode_data::case_ignorable::lookup);
}

/// Verifies `unicode_data::cased::lookup` against the expected ranges in
/// `test_data::CASED`.
///
/// Skipped under Miri (presumably for speed: the helper visits every non-ASCII `char`).
#[test]
#[cfg_attr(miri, ignore)]
fn cased() {
    let expected = test_data::CASED;
    test_boolean_property(expected, unicode_data::cased::lookup);
}

/// Verifies `unicode_data::grapheme_extend::lookup` against the expected ranges in
/// `test_data::GRAPHEME_EXTEND`.
///
/// Skipped under Miri (presumably for speed: the helper visits every non-ASCII `char`).
#[test]
#[cfg_attr(miri, ignore)]
fn grapheme_extend() {
    let expected = test_data::GRAPHEME_EXTEND;
    test_boolean_property(expected, unicode_data::grapheme_extend::lookup);
}

/// Verifies `unicode_data::lowercase::lookup` against the expected ranges in
/// `test_data::LOWERCASE`.
///
/// Skipped under Miri (presumably for speed: the helper visits every non-ASCII `char`).
#[test]
#[cfg_attr(miri, ignore)]
fn lowercase() {
    let expected = test_data::LOWERCASE;
    test_boolean_property(expected, unicode_data::lowercase::lookup);
}

/// Verifies `unicode_data::n::lookup` against the expected ranges in `test_data::N`.
#[test]
// Ignored under Miri like every other exhaustive property test in this file; this
// test runs the same full scan of non-ASCII `char`s through `test_boolean_property`.
#[cfg_attr(miri, ignore)]
fn n() {
    test_boolean_property(test_data::N, unicode_data::n::lookup);
}

/// Verifies `unicode_data::uppercase::lookup` against the expected ranges in
/// `test_data::UPPERCASE`.
///
/// Skipped under Miri (presumably for speed: the helper visits every non-ASCII `char`).
#[test]
#[cfg_attr(miri, ignore)]
fn uppercase() {
    let expected = test_data::UPPERCASE;
    test_boolean_property(expected, unicode_data::uppercase::lookup);
}

/// Verifies `unicode_data::white_space::lookup` against the expected ranges in
/// `test_data::WHITE_SPACE`.
///
/// Skipped under Miri (presumably for speed: the helper visits every non-ASCII `char`).
#[test]
#[cfg_attr(miri, ignore)]
fn white_space() {
    let expected = test_data::WHITE_SPACE;
    test_boolean_property(expected, unicode_data::white_space::lookup);
}

/// Verifies `unicode_data::conversions::to_lower` against the expected mappings in
/// `test_data::TO_LOWER`.
///
/// Skipped under Miri (presumably for speed: the helper visits every non-ASCII `char`).
#[test]
#[cfg_attr(miri, ignore)]
fn to_lowercase() {
    let expected = test_data::TO_LOWER;
    test_case_mapping(expected, unicode_data::conversions::to_lower);
}

/// Verifies `unicode_data::conversions::to_upper` against the expected mappings in
/// `test_data::TO_UPPER`.
///
/// Skipped under Miri (presumably for speed: the helper visits every non-ASCII `char`).
#[test]
#[cfg_attr(miri, ignore)]
fn to_uppercase() {
    let expected = test_data::TO_UPPER;
    test_case_mapping(expected, unicode_data::conversions::to_upper);
}
Loading
Loading