tensor4all_core/
smallstring.rs

/// Abstraction over the element type a SmallString stores for each character.
///
/// Two representations are supported:
/// - `u16`: one UTF-16 code unit (2 bytes), compatible with ITensors.jl's SmallString
/// - `char`: a full Unicode code point (4 bytes), able to hold any Unicode character
///
/// `u16` is the default, chosen for memory efficiency and ITensors.jl compatibility.
pub trait SmallChar: Copy + Default + Eq + Ord + std::hash::Hash + std::fmt::Debug {
    /// Null/zero value of this character type.
    const ZERO: Self;

    /// Converts a Rust `char` into this type.
    ///
    /// Returns `None` when `c` has no representation in this type.
    fn from_char(c: char) -> Option<Self>;

    /// Converts this value back into a Rust `char`.
    fn to_char(self) -> char;
}
19
20impl SmallChar for u16 {
21    const ZERO: Self = 0;
22
23    fn from_char(c: char) -> Option<Self> {
24        // u16 can represent BMP characters (U+0000 to U+FFFF)
25        let code = c as u32;
26        if code <= 0xFFFF {
27            Some(code as u16)
28        } else {
29            None // Surrogate pairs not supported
30        }
31    }
32
33    fn to_char(self) -> char {
34        // Safe because we only store valid BMP characters
35        char::from_u32(self as u32).unwrap_or('\u{FFFD}')
36    }
37}
38
39impl SmallChar for char {
40    const ZERO: Self = '\0';
41
42    fn from_char(c: char) -> Option<Self> {
43        Some(c)
44    }
45
46    fn to_char(self) -> char {
47        self
48    }
49}
50
51/// A stack-allocated fixed-capacity string with explicit length.
52///
53/// This type stores characters in a fixed-size array and maintains
54/// an explicit length field, similar to ITensors.jl's `SmallString`.
55///
56/// # Type Parameters
57/// - `MAX_LEN`: Maximum number of characters (default: 16, matching ITensors.jl)
58/// - `C`: Character type (default: `u16` for ITensors.jl compatibility)
59///
60/// # Character Type Options
61/// - `u16` (default): 2 bytes per character, supports BMP (Basic Multilingual Plane)
62///   - Covers ASCII, Japanese, Chinese, Korean, and most practical characters
63///   - Does NOT support emoji or rare characters outside BMP
64/// - `char`: 4 bytes per character, full Unicode support
65///
66/// # Example
67/// ```
68/// use tensor4all_core::smallstring::SmallString;
69///
70/// // Default: u16 characters (ITensors.jl compatible)
71/// let s1 = SmallString::<16>::from_str("hello").unwrap();
72///
73/// // Explicit char type for full Unicode support
74/// let s2 = SmallString::<16, char>::from_str("hello 😀").unwrap();
75/// ```
76#[derive(Debug, Clone, Copy)]
77pub struct SmallString<const MAX_LEN: usize, C: SmallChar = u16> {
78    data: [C; MAX_LEN],
79    len: usize, // Explicit length (0 ≤ len ≤ MAX_LEN)
80}
81
82/// Error type for SmallString operations.
83#[derive(thiserror::Error, Debug, Clone, Copy, PartialEq, Eq)]
84pub enum SmallStringError {
85    /// The string exceeds the maximum length.
86    #[error("String too long ({actual} > {max})")]
87    TooLong {
88        /// The actual length of the string.
89        actual: usize,
90        /// The maximum allowed length.
91        max: usize,
92    },
93    /// A character cannot be represented in the target character type.
94    #[error("Invalid character: {char_value:?}")]
95    InvalidChar {
96        /// The character that could not be converted.
97        char_value: char,
98    },
99}
100
101impl<const MAX_LEN: usize, C: SmallChar> SmallString<MAX_LEN, C> {
102    /// Create an empty SmallString.
103    pub fn new() -> Self {
104        Self {
105            data: [C::ZERO; MAX_LEN],
106            len: 0,
107        }
108    }
109
110    /// Create a SmallString from a string slice.
111    ///
112    /// Returns an error if:
113    /// - The string is longer than MAX_LEN characters
114    /// - Any character cannot be represented in the character type C
115    ///
116    /// This function is allocation-free (no heap allocation).
117    #[allow(clippy::should_implement_trait)]
118    pub fn from_str(s: &str) -> Result<Self, SmallStringError> {
119        let mut data = [C::ZERO; MAX_LEN];
120        let mut len = 0;
121
122        for ch in s.chars() {
123            if len >= MAX_LEN {
124                // Count total characters for error message
125                let actual = len + 1 + s.chars().skip(len + 1).count();
126                return Err(SmallStringError::TooLong {
127                    actual,
128                    max: MAX_LEN,
129                });
130            }
131            data[len] = C::from_char(ch).ok_or(SmallStringError::InvalidChar { char_value: ch })?;
132            len += 1;
133        }
134
135        Ok(Self { data, len })
136    }
137
138    /// Convert to a String.
139    pub fn as_str(&self) -> String {
140        self.data[..self.len].iter().map(|c| c.to_char()).collect()
141    }
142
143    /// Check if the string is empty.
144    pub fn is_empty(&self) -> bool {
145        self.len == 0
146    }
147
148    /// Get the length of the string.
149    pub fn len(&self) -> usize {
150        self.len
151    }
152
153    /// Get the maximum capacity.
154    pub fn capacity(&self) -> usize {
155        MAX_LEN
156    }
157
158    /// Get a character at the given index.
159    pub fn get(&self, index: usize) -> Option<char> {
160        if index < self.len {
161            Some(self.data[index].to_char())
162        } else {
163            None
164        }
165    }
166
167    /// Get a reference to the internal data slice.
168    pub fn as_slice(&self) -> &[C] {
169        &self.data[..self.len]
170    }
171}
172
173impl<const MAX_LEN: usize, C: SmallChar> Default for SmallString<MAX_LEN, C> {
174    fn default() -> Self {
175        Self::new()
176    }
177}
178
179impl<const MAX_LEN: usize, C: SmallChar> PartialEq for SmallString<MAX_LEN, C> {
180    fn eq(&self, other: &Self) -> bool {
181        self.len == other.len && self.data[..self.len] == other.data[..other.len]
182    }
183}
184
185impl<const MAX_LEN: usize, C: SmallChar> Eq for SmallString<MAX_LEN, C> {}
186
187impl<const MAX_LEN: usize, C: SmallChar> std::hash::Hash for SmallString<MAX_LEN, C> {
188    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
189        self.data[..self.len].hash(state);
190    }
191}
192
193impl<const MAX_LEN: usize, C: SmallChar> PartialOrd for SmallString<MAX_LEN, C> {
194    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
195        Some(self.cmp(other))
196    }
197}
198
199impl<const MAX_LEN: usize, C: SmallChar> Ord for SmallString<MAX_LEN, C> {
200    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
201        self.data[..self.len].cmp(&other.data[..other.len])
202    }
203}
204
205impl<const MAX_LEN: usize, C: SmallChar> std::fmt::Display for SmallString<MAX_LEN, C> {
206    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
207        write!(f, "{}", self.as_str())
208    }
209}
210
211#[cfg(test)]
212mod tests;