From 91d058f290696cd6f0b52526751b427ac4c0e7f8 Mon Sep 17 00:00:00 2001 From: mxsm Date: Sat, 20 Jun 2026 09:52:02 +0800 Subject: [PATCH 1/8] Feat: harden string invariants and release automation --- .github/workflows/release.yml | 110 ++++++++++++++++++++++++++ Cargo.toml | 8 +- README.md | 4 +- benches/comprehensive.rs | 8 +- src/cheetah_string.rs | 144 ++++++++++++++++++++++++++-------- src/lib.rs | 2 +- src/serde.rs | 34 +++----- tests/basic.rs | 29 ++++++- 8 files changed, 269 insertions(+), 70 deletions(-) create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..d3f5beb --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,110 @@ +name: Release + +on: + push: + tags: + - "v*.*.*" + workflow_dispatch: + inputs: + version: + description: "Version to release, for example 1.1.0" + required: true + type: string + +permissions: + contents: write + +env: + CARGO_TERM_COLOR: always + RUST_BACKTRACE: full + +jobs: + publish: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Rust + uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + + - name: Resolve release version + id: version + shell: bash + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + VERSION="${{ inputs.version }}" + else + VERSION="${GITHUB_REF_NAME#v}" + fi + + if [[ ! 
"$VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+([+-][0-9A-Za-z.-]+)?$ ]]; then + echo "Invalid release version: $VERSION" >&2 + exit 1 + fi + + MANIFEST_VERSION="$(cargo metadata --no-deps --format-version 1 | python3 -c 'import json, sys; print(json.load(sys.stdin)["packages"][0]["version"])')" + if [[ "$MANIFEST_VERSION" != "$VERSION" ]]; then + echo "Cargo.toml version $MANIFEST_VERSION does not match release version $VERSION" >&2 + exit 1 + fi + + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "tag=v$VERSION" >> "$GITHUB_OUTPUT" + + - name: Create tag for manual release + if: github.event_name == 'workflow_dispatch' + shell: bash + run: | + TAG="${{ steps.version.outputs.tag }}" + CURRENT_SHA="$(git rev-parse HEAD)" + + if git rev-parse "$TAG" >/dev/null 2>&1; then + TAG_SHA="$(git rev-list -n 1 "$TAG")" + if [[ "$TAG_SHA" != "$CURRENT_SHA" ]]; then + echo "Tag $TAG already exists at $TAG_SHA, not current HEAD $CURRENT_SHA" >&2 + exit 1 + fi + else + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git tag "$TAG" + git push origin "$TAG" + fi + + - name: Check formatting + run: cargo fmt -- --check + + - name: Lint + run: cargo clippy --all-features -- -D warnings + + - name: Test all features + run: cargo test --all-features + + - name: Test no-default feature matrix + run: cargo test --no-default-features --features serde,bytes,simd + + - name: Package crate + run: cargo package + + - name: Publish crate to crates.io + run: cargo publish --token "$CARGO_REGISTRY_TOKEN" + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + + - name: Create GitHub release + env: + GH_TOKEN: ${{ github.token }} + TAG: ${{ steps.version.outputs.tag }} + VERSION: ${{ steps.version.outputs.version }} + run: | + gh release create "$TAG" \ + "target/package/cheetah-string-$VERSION.crate#cheetah-string-$VERSION.crate" \ + --verify-tag \ + --title "cheetah-string $TAG" \ + --generate-notes diff --git 
a/Cargo.toml b/Cargo.toml index a258edb..2d54803 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cheetah-string" -version = "1.0.1" +version = "1.1.0" authors = ["mxsm "] edition = "2021" homepage = "https://github.com/mxsm/cheetah-string" @@ -15,14 +15,14 @@ A lightweight, high-performance string manipulation library optimized for speed- """ [dependencies] -bytes = "1.10.0" +bytes = { version = "1.10.0", optional = true, default-features = false } serde = { version = "1.0", optional = true, default-features = false, features = ["alloc"] } [features] default = ["std"] std = [] -serde = ["serde/alloc"] -bytes = [] +serde = ["dep:serde", "serde/alloc"] +bytes = ["dep:bytes"] simd = [] [dev-dependencies] diff --git a/README.md b/README.md index 9156ce1..db5583c 100644 --- a/README.md +++ b/README.md @@ -45,14 +45,14 @@ Add this to your `Cargo.toml`: ```toml [dependencies] -cheetah-string = "1.0.0" +cheetah-string = "1.1.0" ``` ### Optional Features ```toml [dependencies] -cheetah-string = { version = "1.0.0", features = ["bytes", "serde", "simd"] } +cheetah-string = { version = "1.1.0", features = ["bytes", "serde", "simd"] } ``` Available features: diff --git a/benches/comprehensive.rs b/benches/comprehensive.rs index 14a405f..174ce1a 100644 --- a/benches/comprehensive.rs +++ b/benches/comprehensive.rs @@ -362,13 +362,13 @@ fn bench_internal_hot_paths(c: &mut Criterion) { }); let long_bytes = vec![b'a'; 256]; - group.bench_function("CheetahString::from(Vec 256B)", |b| { - b.iter(|| black_box(CheetahString::from(long_bytes.clone()))) + group.bench_function("CheetahString::try_from_vec(256B)", |b| { + b.iter(|| black_box(CheetahString::try_from_vec(long_bytes.clone()).unwrap())) }); - group.bench_function("String::from(CheetahString::from(Vec 256B))", |b| { + group.bench_function("String::from(CheetahString::try_from_vec(256B))", |b| { b.iter(|| { - let value = CheetahString::from(long_bytes.clone()); + let value = 
CheetahString::try_from_vec(long_bytes.clone()).unwrap(); black_box(String::from(value)) }) }); diff --git a/src/cheetah_string.rs b/src/cheetah_string.rs index 9052254..89df165 100644 --- a/src/cheetah_string.rs +++ b/src/cheetah_string.rs @@ -47,19 +47,12 @@ impl<'a> From<&'a str> for CheetahString { } } -/// # Safety Warning -/// -/// This implementation uses `unsafe` code and may cause undefined behavior -/// if the bytes are not valid UTF-8. Consider using `CheetahString::try_from_bytes()` -/// for safe UTF-8 validation. -/// -/// This implementation will be deprecated in a future version. -impl From<&[u8]> for CheetahString { +impl<'a> TryFrom<&'a [u8]> for CheetahString { + type Error = Utf8Error; + #[inline] - fn from(b: &[u8]) -> Self { - // SAFETY: This is unsafe and may cause UB if bytes are not valid UTF-8. - // This will be deprecated in favor of try_from_bytes in the next version. - CheetahString::from_slice(unsafe { str::from_utf8_unchecked(b) }) + fn try_from(b: &'a [u8]) -> Result { + CheetahString::try_from_bytes(b) } } @@ -71,19 +64,12 @@ impl FromStr for CheetahString { } } -/// # Safety Warning -/// -/// This implementation uses `unsafe` code and may cause undefined behavior -/// if the bytes are not valid UTF-8. Consider using `CheetahString::try_from_vec()` -/// for safe UTF-8 validation. -/// -/// This implementation will be deprecated in a future version. -impl From> for CheetahString { +impl TryFrom> for CheetahString { + type Error = Utf8Error; + #[inline] - fn from(v: Vec) -> Self { - // SAFETY: This constructor does not validate UTF-8 and may cause UB - // if the bytes are later observed as a string. 
- CheetahString::from_vec(v) + fn try_from(v: Vec) -> Result { + CheetahString::try_from_vec(v) } } @@ -159,10 +145,12 @@ impl<'a> FromIterator<&'a String> for CheetahString { } #[cfg(feature = "bytes")] -impl From for CheetahString { +impl TryFrom for CheetahString { + type Error = Utf8Error; + #[inline] - fn from(b: bytes::Bytes) -> Self { - CheetahString::from_bytes(b) + fn try_from(b: bytes::Bytes) -> Result { + CheetahString::try_from_bytes_buf(b) } } @@ -277,8 +265,29 @@ impl CheetahString { } } - #[inline] + #[deprecated( + since = "1.1.0", + note = "use try_from_vec for checked construction or from_utf8_unchecked_vec for an explicit unsafe constructor" + )] pub fn from_vec(s: Vec) -> Self { + CheetahString::try_from_vec(s).expect( + "CheetahString::from_vec requires valid UTF-8; use try_from_vec for fallible construction", + ) + } + + /// Creates a `CheetahString` from a byte vector without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `s` contains valid UTF-8 for the entire + /// lifetime of the returned `CheetahString`. + #[inline] + pub unsafe fn from_utf8_unchecked_vec(s: Vec) -> Self { + CheetahString::from_validated_vec_unchecked(s) + } + + #[inline] + fn from_validated_vec_unchecked(s: Vec) -> Self { if s.len() <= INLINE_CAPACITY { let mut data = [0u8; INLINE_CAPACITY]; data[..s.len()].copy_from_slice(&s); @@ -314,9 +323,8 @@ impl CheetahString { /// assert!(CheetahString::try_from_vec(invalid).is_err()); /// ``` pub fn try_from_vec(v: Vec) -> Result { - // Validate UTF-8 str::from_utf8(&v)?; - Ok(CheetahString::from_vec(v)) + Ok(CheetahString::from_validated_vec_unchecked(v)) } /// Creates a `CheetahString` from a byte slice with UTF-8 validation. @@ -342,8 +350,51 @@ impl CheetahString { Ok(CheetahString::from_slice(s)) } + /// Creates a `CheetahString` from a byte slice without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `b` contains valid UTF-8. 
+ #[inline] + pub unsafe fn from_utf8_unchecked_bytes(b: &[u8]) -> Self { + // SAFETY: The caller guarantees that `b` contains valid UTF-8. + CheetahString::from_slice(unsafe { str::from_utf8_unchecked(b) }) + } + + /// Creates a `CheetahString` from a shared byte vector with UTF-8 validation. + /// + /// # Errors + /// + /// Returns an error if the bytes are not valid UTF-8. + #[inline] + pub fn try_from_arc_vec(s: Arc>) -> Result { + str::from_utf8(s.as_slice())?; + Ok(CheetahString::from_validated_arc_vec_unchecked(s)) + } + + #[deprecated( + since = "1.1.0", + note = "use try_from_arc_vec for checked construction or from_utf8_unchecked_arc_vec for an explicit unsafe constructor" + )] #[inline] pub fn from_arc_vec(s: Arc>) -> Self { + CheetahString::try_from_arc_vec(s).expect( + "CheetahString::from_arc_vec requires valid UTF-8; use try_from_arc_vec for fallible construction", + ) + } + + /// Creates a `CheetahString` from a shared byte vector without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `s` contains valid UTF-8. 
+ #[inline] + pub unsafe fn from_utf8_unchecked_arc_vec(s: Arc>) -> Self { + CheetahString::from_validated_arc_vec_unchecked(s) + } + + #[inline] + fn from_validated_arc_vec_unchecked(s: Arc>) -> Self { CheetahString { inner: InnerString::ArcVecString(s), } @@ -418,7 +469,37 @@ impl CheetahString { #[inline] #[cfg(feature = "bytes")] + #[deprecated( + since = "1.1.0", + note = "use try_from_bytes_buf for checked construction or from_utf8_unchecked_bytes_buf for an explicit unsafe constructor" + )] pub fn from_bytes(b: bytes::Bytes) -> Self { + CheetahString::try_from_bytes_buf(b).expect( + "CheetahString::from_bytes requires valid UTF-8; use try_from_bytes_buf for fallible construction", + ) + } + + #[inline] + #[cfg(feature = "bytes")] + pub fn try_from_bytes_buf(b: bytes::Bytes) -> Result { + str::from_utf8(b.as_ref())?; + Ok(CheetahString::from_validated_bytes_unchecked(b)) + } + + /// Creates a `CheetahString` from `bytes::Bytes` without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `b` contains valid UTF-8. 
+ #[inline] + #[cfg(feature = "bytes")] + pub unsafe fn from_utf8_unchecked_bytes_buf(b: bytes::Bytes) -> Self { + CheetahString::from_validated_bytes_unchecked(b) + } + + #[inline] + #[cfg(feature = "bytes")] + fn from_validated_bytes_unchecked(b: bytes::Bytes) -> Self { CheetahString { inner: InnerString::Bytes(b), } @@ -1455,6 +1536,7 @@ impl<'a> DoubleEndedIterator for SplitWrapper<'a> { #[cfg(test)] mod tests { use super::*; + use alloc::{format, vec}; #[test] fn with_capacity_above_inline_uses_heap_storage() { @@ -1524,7 +1606,7 @@ mod tests { #[test] fn long_vec_conversion_uses_arc_vec_storage() { let value = "a".repeat(INLINE_CAPACITY + 1).into_bytes(); - let s = CheetahString::from(value); + let s = CheetahString::try_from_vec(value).expect("valid utf-8"); match &s.inner { InnerString::ArcVecString(inner) => { diff --git a/src/lib.rs b/src/lib.rs index 5d030ba..c9aab49 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,7 +21,7 @@ //! To enable SIMD acceleration: //! ```toml //! [dependencies] -//! cheetah-string = { version = "1.0.0", features = ["simd"] } +//! cheetah-string = { version = "1.1.0", features = ["simd"] } //! ``` //! //! 
# Examples diff --git a/src/serde.rs b/src/serde.rs index 0eef167..f0d448d 100644 --- a/src/serde.rs +++ b/src/serde.rs @@ -1,10 +1,9 @@ -use crate::cheetah_string::InnerString; use crate::CheetahString; use alloc::string::String; use alloc::vec::Vec; use core::fmt; use core::str; -use serde::de::{Error, Unexpected, Visitor}; +use serde::de::{Error, Visitor}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; impl Serialize for CheetahString { @@ -12,20 +11,7 @@ impl Serialize for CheetahString { where S: Serializer, { - match &self.inner { - InnerString::Inline { len, data } => { - // Safety: InnerString::Inline guarantees that data[0..len] is valid UTF-8 - let s = unsafe { str::from_utf8_unchecked(&data[..*len as usize]) }; - serializer.serialize_str(s) - } - InnerString::StaticStr(s) => serializer.serialize_str(s), - InnerString::ArcStr(s) => serializer.serialize_str(s.as_ref()), - InnerString::Owned(s) => serializer.serialize_str(s.as_str()), - InnerString::ArcString(s) => serializer.serialize_str(s.as_str()), - InnerString::ArcVecString(s) => serializer.serialize_bytes(s), - #[cfg(feature = "bytes")] - InnerString::Bytes(bytes) => serializer.serialize_bytes(bytes.as_ref()), - } + serializer.serialize_str(self.as_str()) } } @@ -67,27 +53,25 @@ where where E: Error, { - Ok(CheetahString::from(v)) + str::from_utf8(v) + .map(CheetahString::from_slice) + .map_err(Error::custom) } fn visit_borrowed_bytes(self, v: &'a [u8]) -> Result where E: Error, { - Ok(CheetahString::from(v)) + str::from_utf8(v) + .map(CheetahString::from_slice) + .map_err(Error::custom) } fn visit_byte_buf(self, v: Vec) -> Result where E: Error, { - match String::from_utf8(v) { - Ok(s) => Ok(CheetahString::from_string(s)), - Err(e) => Err(Error::invalid_value( - Unexpected::Bytes(&e.into_bytes()), - &self, - )), - } + CheetahString::try_from_vec(v).map_err(Error::custom) } } deserializer.deserialize_str(CheetahStringVisitor) diff --git a/tests/basic.rs b/tests/basic.rs index 
af3cd9c..4c8329a 100644 --- a/tests/basic.rs +++ b/tests/basic.rs @@ -189,7 +189,7 @@ fn test_into_string_reuses_unique_vec_buffer() { let bytes = vec![b'a'; 64]; let original_ptr = bytes.as_ptr(); - let s = CheetahString::from(bytes); + let s = CheetahString::try_from_vec(bytes).unwrap(); let owned: String = s.into(); assert_eq!(owned.as_bytes().as_ptr(), original_ptr); @@ -200,7 +200,7 @@ fn test_into_string_clones_shared_vec_buffer() { let bytes = vec![b'a'; 64]; let original_ptr = bytes.as_ptr(); - let s = CheetahString::from(bytes); + let s = CheetahString::try_from_vec(bytes).unwrap(); let shared = s.clone(); let owned: String = s.into(); @@ -281,6 +281,26 @@ fn test_try_from_vec_method() { assert!(CheetahString::try_from_vec(invalid).is_err()); } +#[test] +fn test_try_from_slice_trait() { + let bytes: &[u8] = b"hello"; + let s = CheetahString::try_from(bytes).unwrap(); + assert_eq!(s, "hello"); + + let invalid: &[u8] = &[0xFF, 0xFE]; + assert!(CheetahString::try_from(invalid).is_err()); +} + +#[test] +fn test_try_from_vec_trait() { + let bytes = vec![104, 101, 108, 108, 111]; + let s = CheetahString::try_from(bytes).unwrap(); + assert_eq!(s, "hello"); + + let invalid = vec![0xFF, 0xFE]; + assert!(CheetahString::try_from(invalid).is_err()); +} + #[test] fn test_unicode() { let s = CheetahString::from("\u{00E9}\u{00E7}\u{00F1}\u{00FC}"); // accented chars @@ -341,6 +361,9 @@ fn test_from_bytes_feature() { use bytes::Bytes; let bytes = Bytes::from("hello"); - let s = CheetahString::from(bytes); + let s = CheetahString::try_from(bytes).unwrap(); assert_eq!(s, "hello"); + + let invalid = Bytes::from_static(&[0xFF, 0xFE]); + assert!(CheetahString::try_from(invalid).is_err()); } From 08189cdbbc1956b8053e7a06bab47a5876d09948 Mon Sep 17 00:00:00 2001 From: mxsm Date: Sat, 20 Jun 2026 10:27:45 +0800 Subject: [PATCH 2/8] Fix: satisfy nightly clippy question-mark lint --- src/simd.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git 
a/src/simd.rs b/src/simd.rs index 175f491..80a8293 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -213,18 +213,15 @@ unsafe fn find_bytes_sse2(haystack: &[u8], needle: &[u8]) -> Option { while pos + needle_len <= haystack_len { // Find the next occurrence of the first byte - if let Some(offset) = find_byte_sse2(&haystack[pos..], first_byte) { - let candidate_pos = pos + offset; - - // Check if the rest matches - if candidate_pos + needle_len <= haystack_len { - if eq_bytes_sse2(&haystack[candidate_pos..candidate_pos + needle_len], needle) { - return Some(candidate_pos); - } - pos = candidate_pos + 1; - } else { - return None; + let offset = find_byte_sse2(&haystack[pos..], first_byte)?; + let candidate_pos = pos + offset; + + // Check if the rest matches + if candidate_pos + needle_len <= haystack_len { + if eq_bytes_sse2(&haystack[candidate_pos..candidate_pos + needle_len], needle) { + return Some(candidate_pos); } + pos = candidate_pos + 1; } else { return None; } From ab3ef5cca68d5d61791d60318e0441dc63b87ed4 Mon Sep 17 00:00:00 2001 From: mxsm Date: Sat, 20 Jun 2026 09:55:09 +0800 Subject: [PATCH 3/8] Feat: add layout and benchmark artifacts --- .github/workflows/ci.yaml | 15 ++++++++-- Cargo.toml | 4 +++ bench-results/README.md | 53 +++++++++++++++++++++++++++++++++ benches/layout.rs | 57 +++++++++++++++++++++++++++++++++++ scripts/bench-all.ps1 | 16 ++++++++++ scripts/bench-all.sh | 12 ++++++++ tests/layout_snapshot.rs | 62 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 bench-results/README.md create mode 100644 benches/layout.rs create mode 100644 scripts/bench-all.ps1 create mode 100755 scripts/bench-all.sh create mode 100644 tests/layout_snapshot.rs diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9aec1ae..f2550bc 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -10,8 +10,8 @@ env: CI: true jobs: - build: - runs-on: ubuntu-latest + build: + 
runs-on: ${{ matrix.os }} strategy: matrix: @@ -50,3 +50,14 @@ jobs: - name: Test run: cargo test --verbose --all-features + + - name: Layout snapshot + run: cargo test layout_snapshot --all-features -- --nocapture + + - name: Upload layout artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: layout-${{ matrix.os }}-${{ matrix.rust }} + path: target/layout-artifacts + if-no-files-found: ignore diff --git a/Cargo.toml b/Cargo.toml index 2d54803..e2dce5a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,10 @@ harness = false name = "comprehensive" harness = false +[[bench]] +name = "layout" +harness = false + [[bench]] name = "simd" harness = false diff --git a/bench-results/README.md b/bench-results/README.md new file mode 100644 index 0000000..1e0fc59 --- /dev/null +++ b/bench-results/README.md @@ -0,0 +1,53 @@ +# Benchmark Artifacts + +This directory defines the artifact layout for performance-sensitive changes. +Generated benchmark output should be committed only when it is intentionally +used as review evidence for a release or PR. 
+ +Recommended layout: + +```text +bench-results/ + layout/ + current.json + v1.1.json + v1.2.json + v2-packed.json + criterion/ + before/ + after/ + mq/ + topic.json + properties.json + remoting-header.json + summaries/ + summary-v1.1-v1.2.md + summary-v1.2-v2-packed.md +``` + +Minimum metadata for generated JSON artifacts: + +```json +{ + "crate": "cheetah-string", + "version": "1.1.0", + "profile": "release", + "target": "x86_64-unknown-linux-gnu", + "rustc": "rustc 1.xx.x", + "os": "linux", + "cpu": "model name", + "bench": "layout" +} +``` + +For local capture, run: + +```bash +scripts/bench-all.sh current +``` + +On Windows PowerShell: + +```powershell +scripts/bench-all.ps1 current +``` diff --git a/benches/layout.rs b/benches/layout.rs new file mode 100644 index 0000000..ad7eb52 --- /dev/null +++ b/benches/layout.rs @@ -0,0 +1,57 @@ +use cheetah_string::CheetahString; +use std::env; +use std::fs; +use std::mem::{align_of, size_of}; +use std::path::PathBuf; + +fn target_dir() -> PathBuf { + env::var_os("CARGO_TARGET_DIR") + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("target")) +} + +fn layout_entry(name: &str) -> String { + format!( + r#"{{"type":"{}","size":{},"align":{}}}"#, + name, + size_of::(), + align_of::() + ) +} + +fn main() { + let layouts = [ + layout_entry::("CheetahString"), + layout_entry::>("Option"), + layout_entry::("String"), + layout_entry::>("Option"), + layout_entry::<&str>("&str"), + layout_entry::>("Option<&str>"), + layout_entry::>("Arc"), + layout_entry::>>("Option>"), + ]; + + let snapshot = format!( + concat!( + "{{\n", + " \"crate\":\"cheetah-string\",\n", + " \"profile\":\"bench\",\n", + " \"target_arch\":\"{}\",\n", + " \"target_os\":\"{}\",\n", + " \"pointer_width\":\"{}\",\n", + " \"layouts\":[\n {}\n ]\n", + "}}\n" + ), + env::consts::ARCH, + env::consts::OS, + std::mem::size_of::() * 8, + layouts.join(",\n ") + ); + + let artifact_dir = target_dir().join("layout-artifacts"); + 
fs::create_dir_all(&artifact_dir).expect("create layout artifact directory"); + fs::write(artifact_dir.join("layout-bench.json"), &snapshot) + .expect("write layout bench artifact"); + + println!("{snapshot}"); +} diff --git a/scripts/bench-all.ps1 b/scripts/bench-all.ps1 new file mode 100644 index 0000000..335e541 --- /dev/null +++ b/scripts/bench-all.ps1 @@ -0,0 +1,16 @@ +$Version = if ($args.Count -gt 0) { $args[0] } else { "current" } +$ResultDir = Join-Path "bench-results" $Version + +New-Item -ItemType Directory -Force -Path $ResultDir | Out-Null + +cargo test layout_snapshot --all-features -- --nocapture | + Tee-Object -FilePath (Join-Path $ResultDir "layout-test.txt") + +cargo bench --bench layout | + Tee-Object -FilePath (Join-Path $ResultDir "layout-bench.txt") + +cargo bench --bench comprehensive | + Tee-Object -FilePath (Join-Path $ResultDir "comprehensive.txt") + +cargo bench --bench simd --features simd | + Tee-Object -FilePath (Join-Path $ResultDir "simd.txt") diff --git a/scripts/bench-all.sh b/scripts/bench-all.sh new file mode 100755 index 0000000..211b65c --- /dev/null +++ b/scripts/bench-all.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env sh +set -eu + +VERSION="${1:-current}" +RESULT_DIR="bench-results/${VERSION}" + +mkdir -p "$RESULT_DIR" + +cargo test layout_snapshot --all-features -- --nocapture | tee "$RESULT_DIR/layout-test.txt" +cargo bench --bench layout | tee "$RESULT_DIR/layout-bench.txt" +cargo bench --bench comprehensive | tee "$RESULT_DIR/comprehensive.txt" +cargo bench --bench simd --features simd | tee "$RESULT_DIR/simd.txt" diff --git a/tests/layout_snapshot.rs b/tests/layout_snapshot.rs new file mode 100644 index 0000000..a1ba971 --- /dev/null +++ b/tests/layout_snapshot.rs @@ -0,0 +1,62 @@ +use cheetah_string::CheetahString; +use std::env; +use std::fs; +use std::mem::{align_of, size_of}; +use std::path::PathBuf; + +fn target_dir() -> PathBuf { + env::var_os("CARGO_TARGET_DIR") + .map(PathBuf::from) + .unwrap_or_else(|| 
PathBuf::from("target")) +} + +fn layout_entry(name: &str) -> String { + format!( + r#"{{"type":"{}","size":{},"align":{}}}"#, + name, + size_of::(), + align_of::() + ) +} + +#[test] +fn layout_snapshot() { + let layouts = [ + layout_entry::("CheetahString"), + layout_entry::>("Option"), + layout_entry::("String"), + layout_entry::>("Option"), + layout_entry::<&str>("&str"), + layout_entry::>("Option<&str>"), + layout_entry::>("Arc"), + layout_entry::>>("Option>"), + ]; + + let snapshot = format!( + concat!( + "{{\n", + " \"crate\":\"cheetah-string\",\n", + " \"profile\":\"test\",\n", + " \"target_arch\":\"{}\",\n", + " \"target_os\":\"{}\",\n", + " \"pointer_width\":\"{}\",\n", + " \"layouts\":[\n {}\n ]\n", + "}}\n" + ), + env::consts::ARCH, + env::consts::OS, + std::mem::size_of::() * 8, + layouts.join(",\n ") + ); + + let artifact_dir = target_dir().join("layout-artifacts"); + fs::create_dir_all(&artifact_dir).expect("create layout artifact directory"); + fs::write(artifact_dir.join("layout-snapshot.json"), &snapshot) + .expect("write layout snapshot artifact"); + + println!("{snapshot}"); + + assert!(size_of::() >= size_of::()); + assert!(align_of::() >= align_of::()); + assert!(size_of::>() >= size_of::()); +} From e5e8c3638969421a5e9bad00b6b514df825d5103 Mon Sep 17 00:00:00 2001 From: mxsm Date: Sat, 20 Jun 2026 09:58:38 +0800 Subject: [PATCH 4/8] Enh: use memchr for substring search --- Cargo.toml | 7 ++++- README.md | 5 ++-- benches/pattern.rs | 53 ++++++++++++++++++++++++++++++++ scripts/bench-all.ps1 | 3 ++ scripts/bench-all.sh | 1 + src/cheetah_string.rs | 20 ++----------- src/lib.rs | 10 ++++--- src/search.rs | 70 +++++++++++++++++++++++++++++++++++++++++++ src/simd.rs | 5 ++++ tests/search.rs | 50 +++++++++++++++++++++++++++++++ 10 files changed, 200 insertions(+), 24 deletions(-) create mode 100644 benches/pattern.rs create mode 100644 src/search.rs create mode 100644 tests/search.rs diff --git a/Cargo.toml b/Cargo.toml index e2dce5a..5dfb885 100644 
--- a/Cargo.toml +++ b/Cargo.toml @@ -16,11 +16,12 @@ A lightweight, high-performance string manipulation library optimized for speed- [dependencies] bytes = { version = "1.10.0", optional = true, default-features = false } +memchr = { version = "2", default-features = false } serde = { version = "1.0", optional = true, default-features = false, features = ["alloc"] } [features] default = ["std"] -std = [] +std = ["memchr/std"] serde = ["dep:serde", "serde/alloc"] bytes = ["dep:bytes"] simd = [] @@ -41,6 +42,10 @@ harness = false name = "layout" harness = false +[[bench]] +name = "pattern" +harness = false + [[bench]] name = "simd" harness = false diff --git a/README.md b/README.md index db5583c..c1d6f91 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,8 @@ CheetahString is a versatile string type that goes beyond the standard library's - **⚡ Performance Focused** - Optimized for common string operations - Reduced memory allocations via intelligent internal representation - - Optional SIMD acceleration for string matching operations (x86_64 SSE2) + - `memchr`/`memmem` substring search by default + - Optional SIMD acceleration for selected byte comparisons (x86_64 SSE2) - Benchmarked against standard library types - **🛡️ Safe & Correct** @@ -106,7 +107,7 @@ CheetahString is designed with performance in mind: - **Efficient Sharing**: Large immutable strings use `Arc` for cheap cloning - **Fast Builders**: Capacity-preserving builder paths use owned heap storage for direct mutation - **Optimized Operations**: Common operations like concatenation have fast-path implementations -- **SIMD Acceleration** (with `simd` feature): String matching operations (`starts_with`, `ends_with`, `contains`, `find`, equality comparisons) are accelerated using SSE2 SIMD instructions on x86_64 platforms. The implementation automatically falls back to scalar code for small inputs or when SIMD is not available. 
+- **Search Acceleration**: Substring search uses `memchr`/`memmem` by default. With the `simd` feature, selected byte comparisons such as prefix, suffix, and equality paths can use SSE2 on x86_64 platforms. Run benchmarks: ```bash diff --git a/benches/pattern.rs b/benches/pattern.rs new file mode 100644 index 0000000..ef8a680 --- /dev/null +++ b/benches/pattern.rs @@ -0,0 +1,53 @@ +use cheetah_string::{CheetahFinder, CheetahString}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; + +fn bench_pathological_find(c: &mut Criterion) { + let mut group = c.benchmark_group("pathological_find"); + + for size in [128, 1024, 16 * 1024, 64 * 1024] { + let haystack = format!("{}b", "a".repeat(size)); + let cheetah = CheetahString::from(haystack.as_str()); + let needle = "aaaab"; + let finder = CheetahFinder::new(needle); + + group.throughput(Throughput::Bytes(haystack.len() as u64)); + + group.bench_with_input(BenchmarkId::new("cheetah_find", size), &size, |b, _| { + b.iter(|| black_box(&cheetah).find(black_box(needle))) + }); + + group.bench_with_input(BenchmarkId::new("finder", size), &size, |b, _| { + b.iter(|| finder.find_in(black_box(&cheetah))) + }); + + group.bench_with_input(BenchmarkId::new("std_find", size), &size, |b, _| { + b.iter(|| black_box(haystack.as_str()).find(black_box(needle))) + }); + } + + group.finish(); +} + +fn bench_single_byte_find(c: &mut Criterion) { + let mut group = c.benchmark_group("single_byte_find"); + + for size in [128, 1024, 16 * 1024, 64 * 1024] { + let haystack = format!("{}z", "a".repeat(size)); + let cheetah = CheetahString::from(haystack.as_str()); + + group.throughput(Throughput::Bytes(haystack.len() as u64)); + + group.bench_with_input(BenchmarkId::new("cheetah_find", size), &size, |b, _| { + b.iter(|| black_box(&cheetah).find(black_box("z"))) + }); + + group.bench_with_input(BenchmarkId::new("std_find", size), &size, |b, _| { + b.iter(|| 
black_box(haystack.as_str()).find(black_box("z"))) + }); + } + + group.finish(); +} + +criterion_group!(benches, bench_pathological_find, bench_single_byte_find); +criterion_main!(benches); diff --git a/scripts/bench-all.ps1 b/scripts/bench-all.ps1 index 335e541..a7d85e1 100644 --- a/scripts/bench-all.ps1 +++ b/scripts/bench-all.ps1 @@ -12,5 +12,8 @@ cargo bench --bench layout | cargo bench --bench comprehensive | Tee-Object -FilePath (Join-Path $ResultDir "comprehensive.txt") +cargo bench --bench pattern | + Tee-Object -FilePath (Join-Path $ResultDir "pattern.txt") + cargo bench --bench simd --features simd | Tee-Object -FilePath (Join-Path $ResultDir "simd.txt") diff --git a/scripts/bench-all.sh b/scripts/bench-all.sh index 211b65c..095a4d7 100755 --- a/scripts/bench-all.sh +++ b/scripts/bench-all.sh @@ -9,4 +9,5 @@ mkdir -p "$RESULT_DIR" cargo test layout_snapshot --all-features -- --nocapture | tee "$RESULT_DIR/layout-test.txt" cargo bench --bench layout | tee "$RESULT_DIR/layout-bench.txt" cargo bench --bench comprehensive | tee "$RESULT_DIR/comprehensive.txt" +cargo bench --bench pattern | tee "$RESULT_DIR/pattern.txt" cargo bench --bench simd --features simd | tee "$RESULT_DIR/simd.txt" diff --git a/src/cheetah_string.rs b/src/cheetah_string.rs index 89df165..d60986d 100644 --- a/src/cheetah_string.rs +++ b/src/cheetah_string.rs @@ -691,14 +691,7 @@ impl CheetahString { match pat.as_str_pattern() { StrPatternImpl::Char(c) => self.as_str().contains(c), StrPatternImpl::Str(s) => { - #[cfg(all(feature = "simd", target_arch = "x86_64"))] - { - crate::simd::find_bytes(self.as_bytes(), s.as_bytes()).is_some() - } - #[cfg(not(all(feature = "simd", target_arch = "x86_64")))] - { - self.as_str().contains(s) - } + crate::search::find_bytes(self.as_bytes(), s.as_bytes()).is_some() } } } @@ -736,14 +729,7 @@ impl CheetahString { #[inline] pub fn find>(&self, pat: P) -> Option { let pat = pat.as_ref(); - #[cfg(all(feature = "simd", target_arch = "x86_64"))] - { - 
crate::simd::find_bytes(self.as_bytes(), pat.as_bytes()) - } - #[cfg(not(all(feature = "simd", target_arch = "x86_64")))] - { - self.as_str().find(pat) - } + crate::search::find_bytes(self.as_bytes(), pat.as_bytes()) } /// Returns the byte index of the last occurrence of the pattern, or `None` if not found. @@ -758,7 +744,7 @@ impl CheetahString { /// ``` #[inline] pub fn rfind>(&self, pat: P) -> Option { - self.as_str().rfind(pat.as_ref()) + crate::search::rfind_bytes(self.as_bytes(), pat.as_ref().as_bytes()) } /// Returns a string slice with leading and trailing whitespace removed. diff --git a/src/lib.rs b/src/lib.rs index c9aab49..76eead9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,15 +4,15 @@ //! It is usable in both `std` and `no_std` environments. Additionally, CheetahString supports serde for serialization and deserialization. //! CheetahString also supports the `bytes` feature, allowing conversion to the `bytes::Bytes` type. //! It minimizes allocations across small, shared, and builder-oriented string workloads. +//! Substring search uses `memchr`/`memmem` by default. //! //! # SIMD Acceleration //! //! When compiled with the `simd` feature flag, CheetahString uses SIMD (Single Instruction, Multiple Data) -//! instructions to accelerate string matching operations on x86_64 platforms with SSE2 support. +//! instructions to accelerate selected byte comparisons on x86_64 platforms with SSE2 support. //! SIMD acceleration is applied to: //! - `starts_with()` - Pattern prefix matching //! - `ends_with()` - Pattern suffix matching -//! - `contains()` / `find()` - Substring search //! - Equality comparisons (`==`, `!=`) //! //! The implementation automatically uses SIMD for strings >= 16 bytes and falls back to scalar operations @@ -40,13 +40,13 @@ //! //! ``` //! -//! Using SIMD-accelerated operations (when `simd` feature is enabled): +//! Using accelerated search operations: //! ```rust //! use cheetah_string::CheetahString; //! //! 
let url = CheetahString::from("https://api.example.com/v1/users"); //! -//! // These operations use SIMD when the pattern is >= 16 bytes +//! // Substring search uses memchr/memmem by default. //! if url.starts_with("https://") { //! println!("Secure connection"); //! } @@ -60,6 +60,7 @@ extern crate alloc; mod cheetah_string; mod error; +mod search; #[cfg(feature = "serde")] mod serde; @@ -69,3 +70,4 @@ mod simd; pub use cheetah_string::{CheetahString, SplitPattern, SplitStr, SplitWrapper, StrPattern}; pub use error::{Error, Result}; +pub use search::CheetahFinder; diff --git a/src/search.rs b/src/search.rs new file mode 100644 index 0000000..b5b52b9 --- /dev/null +++ b/src/search.rs @@ -0,0 +1,70 @@ +pub(crate) fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + return Some(0); + } + + if needle.len() == 1 { + return memchr::memchr(needle[0], haystack); + } + + memchr::memmem::find(haystack, needle) +} + +pub(crate) fn rfind_bytes(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + return Some(haystack.len()); + } + + if needle.len() == 1 { + return memchr::memrchr(needle[0], haystack); + } + + memchr::memmem::rfind(haystack, needle) +} + +/// Reusable substring finder for repeated searches with the same needle. 
+pub struct CheetahFinder<'a> { + needle: &'a str, + finder: Option>, +} + +impl<'a> CheetahFinder<'a> { + #[inline] + pub fn new(needle: &'a str) -> Self { + let finder = (needle.len() > 1).then(|| memchr::memmem::Finder::new(needle.as_bytes())); + Self { needle, finder } + } + + #[inline] + pub fn needle(&self) -> &'a str { + self.needle + } + + #[inline] + pub fn find_in(&self, haystack: &S) -> Option + where + S: AsRef + ?Sized, + { + let haystack = haystack.as_ref().as_bytes(); + + if self.needle.is_empty() { + return Some(0); + } + + if self.needle.len() == 1 { + return memchr::memchr(self.needle.as_bytes()[0], haystack); + } + + self.finder + .as_ref() + .and_then(|finder| finder.find(haystack)) + } + + #[inline] + pub fn is_match(&self, haystack: &S) -> bool + where + S: AsRef + ?Sized, + { + self.find_in(haystack).is_some() + } +} diff --git a/src/simd.rs b/src/simd.rs index 80a8293..9b4d32f 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -88,6 +88,7 @@ pub(crate) fn ends_with_bytes(haystack: &[u8], needle: &[u8]) -> bool { } /// Find the first occurrence of needle in haystack using SIMD when available +#[allow(dead_code)] #[inline] pub(crate) fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option { if needle.is_empty() { @@ -121,6 +122,7 @@ pub(crate) fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option { .position(|window| window == needle) } +#[allow(dead_code)] #[inline] fn find_first_byte(haystack: &[u8], needle: u8) -> Option { #[cfg(all(feature = "simd", target_arch = "x86_64"))] @@ -133,6 +135,7 @@ fn find_first_byte(haystack: &[u8], needle: u8) -> Option { haystack.iter().position(|&b| b == needle) } +#[allow(dead_code)] #[inline] fn find_short_bytes(haystack: &[u8], needle: &[u8]) -> Option { debug_assert!(needle.len() > 1 && needle.len() < SIMD_THRESHOLD); @@ -191,6 +194,7 @@ unsafe fn eq_bytes_sse2(a: &[u8], b: &[u8]) -> bool { } #[cfg(all(feature = "simd", target_arch = "x86_64"))] +#[allow(dead_code)] #[target_feature(enable = "sse2")] 
#[inline] unsafe fn find_bytes_sse2(haystack: &[u8], needle: &[u8]) -> Option { @@ -231,6 +235,7 @@ unsafe fn find_bytes_sse2(haystack: &[u8], needle: &[u8]) -> Option { } #[cfg(all(feature = "simd", target_arch = "x86_64"))] +#[allow(dead_code)] #[target_feature(enable = "sse2")] #[inline] unsafe fn find_byte_sse2(haystack: &[u8], needle: u8) -> Option { diff --git a/tests/search.rs b/tests/search.rs new file mode 100644 index 0000000..08b57ea --- /dev/null +++ b/tests/search.rs @@ -0,0 +1,50 @@ +use cheetah_string::{CheetahFinder, CheetahString}; + +#[test] +fn empty_needle_matches_str_find_semantics() { + let s = CheetahString::from("hello"); + + assert_eq!(s.find(""), "hello".find("")); + assert_eq!(s.rfind(""), "hello".rfind("")); + assert!(s.contains("")); +} + +#[test] +fn memmem_search_reports_byte_indices() { + let s = CheetahString::from("cafe cafe"); + + assert_eq!(s.find("fe"), Some(2)); + assert_eq!(s.rfind("fe"), Some(7)); + assert_eq!(s.find("missing"), None); +} + +#[test] +fn unicode_search_matches_str_indices() { + let s = CheetahString::from("éxé"); + + assert_eq!(s.find("é"), "éxé".find("é")); + assert_eq!(s.rfind("é"), "éxé".rfind("é")); + assert_eq!(s.find("xé"), "éxé".find("xé")); +} + +#[test] +fn reusable_finder_matches_repeated_needle() { + let finder = CheetahFinder::new("route"); + let first = CheetahString::from("topic.route.alpha"); + let second = CheetahString::from("topic.name.beta"); + + assert_eq!(finder.needle(), "route"); + assert_eq!(finder.find_in(&first), Some(6)); + assert_eq!(finder.find_in(&second), None); + assert!(finder.is_match(&first)); + assert!(!finder.is_match(&second)); +} + +#[test] +fn reusable_empty_finder_matches_start() { + let finder = CheetahFinder::new(""); + let s = CheetahString::from("payload"); + + assert_eq!(finder.find_in(&s), Some(0)); + assert!(finder.is_match(&s)); +} From f67656204f4fa19c4ee2188ae2609c22762598f0 Mon Sep 17 00:00:00 2001 From: mxsm Date: Sat, 20 Jun 2026 10:00:47 +0800 Subject: 
[PATCH 5/8] Enh: reuse mutation fast paths --- Cargo.toml | 4 +++ benches/mutation.rs | 83 +++++++++++++++++++++++++++++++++++++++++++ scripts/bench-all.ps1 | 3 ++ scripts/bench-all.sh | 1 + src/cheetah_string.rs | 83 ++++++++++--------------------------------- tests/mutation.rs | 60 +++++++++++++++++++++++++++++++ 6 files changed, 170 insertions(+), 64 deletions(-) create mode 100644 benches/mutation.rs create mode 100644 tests/mutation.rs diff --git a/Cargo.toml b/Cargo.toml index 5dfb885..2a1d9fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,10 @@ harness = false name = "layout" harness = false +[[bench]] +name = "mutation" +harness = false + [[bench]] name = "pattern" harness = false diff --git a/benches/mutation.rs b/benches/mutation.rs new file mode 100644 index 0000000..f70add9 --- /dev/null +++ b/benches/mutation.rs @@ -0,0 +1,83 @@ +use cheetah_string::CheetahString; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; + +fn bench_push_str(c: &mut Criterion) { + let mut group = c.benchmark_group("push_str"); + + group.bench_function("inline_in_place", |b| { + b.iter(|| { + let mut s = CheetahString::from("hello"); + s.push_str(black_box(" world")); + black_box(s) + }) + }); + + group.bench_function("owned_spare_capacity", |b| { + b.iter(|| { + let mut s = CheetahString::with_capacity(128); + s.push_str("hello"); + s.push_str(black_box(" world")); + black_box(s) + }) + }); + + group.bench_function("static_fallback", |b| { + b.iter(|| { + let mut s = CheetahString::from_static_str("hello"); + s.push_str(black_box(" world")); + black_box(s) + }) + }); + + group.finish(); +} + +fn bench_add(c: &mut Criterion) { + let mut group = c.benchmark_group("add"); + + for rhs_len in [1, 8, 32, 128] { + let rhs = "x".repeat(rhs_len); + + group.bench_with_input( + BenchmarkId::new("owned_capacity_str", rhs_len), + &rhs, + |b, rhs| { + b.iter(|| { + let mut s = CheetahString::with_capacity(256); + s.push_str("hello"); + black_box(s 
+ black_box(rhs.as_str())) + }) + }, + ); + + group.bench_with_input(BenchmarkId::new("inline_str", rhs_len), &rhs, |b, rhs| { + b.iter(|| black_box(CheetahString::from("h") + black_box(rhs.as_str()))) + }); + } + + group.finish(); +} + +fn bench_reserve(c: &mut Criterion) { + let mut group = c.benchmark_group("reserve"); + + for additional in [0, 8, 128] { + group.bench_with_input( + BenchmarkId::from_parameter(additional), + &additional, + |b, extra| { + b.iter(|| { + let mut s = CheetahString::with_capacity(64); + s.push_str("hello"); + s.reserve(black_box(*extra)); + black_box(s) + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_push_str, bench_add, bench_reserve); +criterion_main!(benches); diff --git a/scripts/bench-all.ps1 b/scripts/bench-all.ps1 index a7d85e1..2bbc772 100644 --- a/scripts/bench-all.ps1 +++ b/scripts/bench-all.ps1 @@ -12,6 +12,9 @@ cargo bench --bench layout | cargo bench --bench comprehensive | Tee-Object -FilePath (Join-Path $ResultDir "comprehensive.txt") +cargo bench --bench mutation | + Tee-Object -FilePath (Join-Path $ResultDir "mutation.txt") + cargo bench --bench pattern | Tee-Object -FilePath (Join-Path $ResultDir "pattern.txt") diff --git a/scripts/bench-all.sh b/scripts/bench-all.sh index 095a4d7..062efe8 100755 --- a/scripts/bench-all.sh +++ b/scripts/bench-all.sh @@ -9,5 +9,6 @@ mkdir -p "$RESULT_DIR" cargo test layout_snapshot --all-features -- --nocapture | tee "$RESULT_DIR/layout-test.txt" cargo bench --bench layout | tee "$RESULT_DIR/layout-bench.txt" cargo bench --bench comprehensive | tee "$RESULT_DIR/comprehensive.txt" +cargo bench --bench mutation | tee "$RESULT_DIR/mutation.txt" cargo bench --bench pattern | tee "$RESULT_DIR/pattern.txt" cargo bench --bench simd --features simd | tee "$RESULT_DIR/simd.txt" diff --git a/src/cheetah_string.rs b/src/cheetah_string.rs index d60986d..4b7d428 100644 --- a/src/cheetah_string.rs +++ b/src/cheetah_string.rs @@ -972,6 +972,10 @@ impl CheetahString { 
#[inline] fn push_str_internal(&mut self, string: &str) { + if string.is_empty() { + return; + } + match &mut self.inner { InnerString::Inline { len, data } => { let total_len = *len as usize + string.len(); @@ -1039,6 +1043,10 @@ impl CheetahString { /// ``` #[inline] pub fn reserve(&mut self, additional: usize) { + if additional == 0 { + return; + } + match &mut self.inner { InnerString::Inline { len, .. } if *len as usize + additional <= INLINE_CAPACITY => { return; @@ -1202,28 +1210,9 @@ impl Add<&str> for CheetahString { /// assert_eq!(result, "Hello World"); /// ``` #[inline] - fn add(self, rhs: &str) -> Self::Output { - let total_len = self.len() + rhs.len(); - - // Fast path: result fits in inline storage - if total_len <= INLINE_CAPACITY { - let mut data = [0u8; INLINE_CAPACITY]; - let self_bytes = self.as_bytes(); - data[..self_bytes.len()].copy_from_slice(self_bytes); - data[self_bytes.len()..total_len].copy_from_slice(rhs.as_bytes()); - return CheetahString { - inner: InnerString::Inline { - len: total_len as u8, - data, - }, - }; - } - - // Slow path: allocate for long result - let mut result = String::with_capacity(total_len); - result.push_str(self.as_str()); - result.push_str(rhs); - CheetahString::from_string(result) + fn add(mut self, rhs: &str) -> Self::Output { + self.push_str_internal(rhs); + self } } @@ -1243,28 +1232,9 @@ impl Add<&CheetahString> for CheetahString { /// assert_eq!(result, "Hello World"); /// ``` #[inline] - fn add(self, rhs: &CheetahString) -> Self::Output { - let total_len = self.len() + rhs.len(); - - // Fast path: result fits in inline storage - if total_len <= INLINE_CAPACITY { - let mut data = [0u8; INLINE_CAPACITY]; - let self_bytes = self.as_bytes(); - data[..self_bytes.len()].copy_from_slice(self_bytes); - data[self_bytes.len()..total_len].copy_from_slice(rhs.as_bytes()); - return CheetahString { - inner: InnerString::Inline { - len: total_len as u8, - data, - }, - }; - } - - // Slow path: allocate for long result - 
let mut result = String::with_capacity(total_len); - result.push_str(self.as_str()); - result.push_str(rhs.as_str()); - CheetahString::from_string(result) + fn add(mut self, rhs: &CheetahString) -> Self::Output { + self.push_str_internal(rhs.as_str()); + self } } @@ -1283,28 +1253,13 @@ impl Add for CheetahString { /// assert_eq!(result, "Hello World"); /// ``` #[inline] - fn add(self, rhs: String) -> Self::Output { - let total_len = self.len() + rhs.len(); - - // Fast path: result fits in inline storage - if total_len <= INLINE_CAPACITY { - let mut data = [0u8; INLINE_CAPACITY]; - let self_bytes = self.as_bytes(); - data[..self_bytes.len()].copy_from_slice(self_bytes); - data[self_bytes.len()..total_len].copy_from_slice(rhs.as_bytes()); - return CheetahString { - inner: InnerString::Inline { - len: total_len as u8, - data, - }, - }; + fn add(mut self, rhs: String) -> Self::Output { + if self.is_empty() { + return CheetahString::from_string(rhs); } - // Slow path: allocate for long result - let mut result = String::with_capacity(total_len); - result.push_str(self.as_str()); - result.push_str(&rhs); - CheetahString::from_string(result) + self.push_str_internal(&rhs); + self } } diff --git a/tests/mutation.rs b/tests/mutation.rs new file mode 100644 index 0000000..2e3ab77 --- /dev/null +++ b/tests/mutation.rs @@ -0,0 +1,60 @@ +use cheetah_string::CheetahString; + +#[test] +fn inline_push_str_appends_in_place() { + let mut s = CheetahString::from("hello"); + let before = s.as_bytes().as_ptr(); + + s.push_str(" world"); + + assert_eq!(s, "hello world"); + assert_eq!(s.as_bytes().as_ptr(), before); +} + +#[test] +fn owned_push_str_reuses_spare_capacity() { + let mut s = CheetahString::with_capacity(128); + s.push_str("hello"); + let before = s.as_bytes().as_ptr(); + + s.push_str(" world"); + + assert_eq!(s, "hello world"); + assert_eq!(s.as_bytes().as_ptr(), before); +} + +#[test] +fn add_reuses_owned_lhs_capacity() { + let mut s = CheetahString::with_capacity(128); + 
s.push_str("hello"); + let before = s.as_bytes().as_ptr(); + + let s = s + " world"; + + assert_eq!(s, "hello world"); + assert_eq!(s.as_bytes().as_ptr(), before); +} + +#[test] +fn reserve_zero_keeps_existing_buffer() { + let mut s = CheetahString::with_capacity(128); + s.push_str("hello"); + let before = s.as_bytes().as_ptr(); + + s.reserve(0); + + assert_eq!(s, "hello"); + assert_eq!(s.as_bytes().as_ptr(), before); +} + +#[test] +fn add_assign_uses_push_str_path() { + let mut s = CheetahString::with_capacity(128); + s.push_str("hello"); + let before = s.as_bytes().as_ptr(); + + s += " world"; + + assert_eq!(s, "hello world"); + assert_eq!(s.as_bytes().as_ptr(), before); +} From 3eaa760c6b72e1acaa0457852d06c8c7e7c4b7df Mon Sep 17 00:00:00 2001 From: mxsm Date: Sat, 20 Jun 2026 10:06:10 +0800 Subject: [PATCH 6/8] Enh: contract string representation variants --- src/cheetah_string.rs | 150 +++++++++++++----------------------------- tests/basic.rs | 23 +++++-- 2 files changed, 64 insertions(+), 109 deletions(-) diff --git a/src/cheetah_string.rs b/src/cheetah_string.rs index 4b7d428..79d9890 100644 --- a/src/cheetah_string.rs +++ b/src/cheetah_string.rs @@ -172,35 +172,14 @@ impl From for String { unsafe { String::from_utf8_unchecked(data[..len as usize].to_vec()) } } CheetahString { - inner: InnerString::StaticStr(s), + inner: InnerString::Static(s), } => s.to_string(), CheetahString { - inner: InnerString::ArcStr(s), + inner: InnerString::Shared(s), } => s.to_string(), CheetahString { inner: InnerString::Owned(s), } => s, - CheetahString { - inner: InnerString::ArcString(s), - } => match Arc::try_unwrap(s) { - Ok(s) => s, - Err(s) => s.as_ref().clone(), - }, - CheetahString { - inner: InnerString::ArcVecString(s), - } => match Arc::try_unwrap(s) { - // SAFETY: ArcVecString should only be created from valid UTF-8 sources. - Ok(s) => unsafe { String::from_utf8_unchecked(s) }, - // SAFETY: ArcVecString should only be created from valid UTF-8 sources. 
- Err(s) => unsafe { String::from_utf8_unchecked(s.as_ref().clone()) }, - }, - #[cfg(feature = "bytes")] - CheetahString { - inner: InnerString::Bytes(b), - } => { - // SAFETY: Bytes variant should only be created from valid UTF-8 sources - unsafe { String::from_utf8_unchecked(b.to_vec()) } - } } } } @@ -261,7 +240,7 @@ impl CheetahString { #[inline] pub const fn from_static_str(s: &'static str) -> Self { CheetahString { - inner: InnerString::StaticStr(s), + inner: InnerString::Static(s), } } @@ -298,9 +277,8 @@ impl CheetahString { }, } } else { - CheetahString { - inner: InnerString::ArcVecString(Arc::new(s)), - } + // SAFETY: Callers validate UTF-8 before reaching this helper. + CheetahString::from_builder_string(unsafe { String::from_utf8_unchecked(s) }) } } @@ -368,8 +346,13 @@ impl CheetahString { /// Returns an error if the bytes are not valid UTF-8. #[inline] pub fn try_from_arc_vec(s: Arc>) -> Result { - str::from_utf8(s.as_slice())?; - Ok(CheetahString::from_validated_arc_vec_unchecked(s)) + match Arc::try_unwrap(s) { + Ok(v) => CheetahString::try_from_vec(v), + Err(s) => { + let s = str::from_utf8(s.as_slice())?; + Ok(CheetahString::from_slice(s)) + } + } } #[deprecated( @@ -395,8 +378,12 @@ impl CheetahString { #[inline] fn from_validated_arc_vec_unchecked(s: Arc>) -> Self { - CheetahString { - inner: InnerString::ArcVecString(s), + match Arc::try_unwrap(s) { + Ok(v) => CheetahString::from_validated_vec_unchecked(v), + Err(s) => { + // SAFETY: Callers validate UTF-8 before reaching this helper. + unsafe { CheetahString::from_utf8_unchecked_bytes(s.as_slice()) } + } } } @@ -416,7 +403,7 @@ impl CheetahString { // Use Arc for long borrowed strings to avoid the extra String header. 
let arc_str: Arc = Arc::from(s); CheetahString { - inner: InnerString::ArcStr(arc_str), + inner: InnerString::Shared(arc_str), } } } @@ -437,7 +424,7 @@ impl CheetahString { // Use Arc for long strings to avoid double allocation let arc_str: Arc = s.into_boxed_str().into(); CheetahString { - inner: InnerString::ArcStr(arc_str), + inner: InnerString::Shared(arc_str), } } } @@ -462,8 +449,9 @@ impl CheetahString { #[inline] pub fn from_arc_string(s: Arc) -> Self { - CheetahString { - inner: InnerString::ArcString(s), + match Arc::try_unwrap(s) { + Ok(s) => CheetahString::from_builder_string(s), + Err(s) => CheetahString::from_slice(s.as_str()), } } @@ -500,9 +488,8 @@ impl CheetahString { #[inline] #[cfg(feature = "bytes")] fn from_validated_bytes_unchecked(b: bytes::Bytes) -> Self { - CheetahString { - inner: InnerString::Bytes(b), - } + // SAFETY: Callers validate UTF-8 before reaching this helper. + unsafe { CheetahString::from_utf8_unchecked_bytes(b.as_ref()) } } #[inline] @@ -513,21 +500,9 @@ impl CheetahString { // The data is always valid UTF-8 up to len bytes. unsafe { str::from_utf8_unchecked(&data[..*len as usize]) } } - InnerString::StaticStr(s) => s, - InnerString::ArcStr(s) => s.as_ref(), + InnerString::Static(s) => s, + InnerString::Shared(s) => s.as_ref(), InnerString::Owned(s) => s.as_str(), - InnerString::ArcString(s) => s.as_str(), - InnerString::ArcVecString(s) => { - // SAFETY: ArcVecString is only created from validated UTF-8 sources. - // All constructors ensure this invariant is maintained. - unsafe { str::from_utf8_unchecked(s.as_ref()) } - } - #[cfg(feature = "bytes")] - InnerString::Bytes(b) => { - // SAFETY: Bytes variant is only created from validated UTF-8 sources. - // The from_bytes constructor ensures this invariant. 
- unsafe { str::from_utf8_unchecked(b.as_ref()) } - } } } @@ -535,13 +510,9 @@ impl CheetahString { pub fn as_bytes(&self) -> &[u8] { match &self.inner { InnerString::Inline { len, data } => &data[..*len as usize], - InnerString::StaticStr(s) => s.as_bytes(), - InnerString::ArcStr(s) => s.as_bytes(), + InnerString::Static(s) => s.as_bytes(), + InnerString::Shared(s) => s.as_bytes(), InnerString::Owned(s) => s.as_bytes(), - InnerString::ArcString(s) => s.as_bytes(), - InnerString::ArcVecString(s) => s.as_ref(), - #[cfg(feature = "bytes")] - InnerString::Bytes(b) => b.as_ref(), } } @@ -549,13 +520,9 @@ impl CheetahString { pub fn len(&self) -> usize { match &self.inner { InnerString::Inline { len, .. } => *len as usize, - InnerString::StaticStr(s) => s.len(), - InnerString::ArcStr(s) => s.len(), + InnerString::Static(s) => s.len(), + InnerString::Shared(s) => s.len(), InnerString::Owned(s) => s.len(), - InnerString::ArcString(s) => s.len(), - InnerString::ArcVecString(s) => s.len(), - #[cfg(feature = "bytes")] - InnerString::Bytes(b) => b.len(), } } @@ -563,13 +530,9 @@ impl CheetahString { pub fn is_empty(&self) -> bool { match &self.inner { InnerString::Inline { len, .. 
} => *len == 0, - InnerString::StaticStr(s) => s.is_empty(), - InnerString::ArcStr(s) => s.is_empty(), + InnerString::Static(s) => s.is_empty(), + InnerString::Shared(s) => s.is_empty(), InnerString::Owned(s) => s.is_empty(), - InnerString::ArcString(s) => s.is_empty(), - InnerString::ArcVecString(s) => s.is_empty(), - #[cfg(feature = "bytes")] - InnerString::Bytes(b) => b.is_empty(), } } @@ -989,12 +952,6 @@ impl CheetahString { s.push_str(string); return; } - InnerString::ArcString(arc) => { - if let Some(s) = Arc::get_mut(arc) { - s.push_str(string); - return; - } - } _ => {} } @@ -1009,7 +966,7 @@ impl CheetahString { /// /// This method is optimized for incremental building and will: /// - Mutate inline storage when possible - /// - Mutate unique Arc in-place when available + /// - Mutate owned heap storage in-place when capacity allows /// - Only allocate when necessary /// /// # Examples @@ -1056,12 +1013,6 @@ impl CheetahString { s.reserve(additional); return; } - InnerString::ArcString(arc) => { - if let Some(s) = Arc::get_mut(arc) { - s.reserve(additional); - return; - } - } _ => {} } @@ -1310,12 +1261,9 @@ const INLINE_CAPACITY: usize = 23; /// Variants: /// /// * `Inline` - Inline storage for strings <= 23 bytes (zero heap allocations). -/// * `StaticStr(&'static str)` - A static string slice (zero heap allocations). -/// * `ArcStr(Arc)` - A reference-counted string slice (single heap allocation, optimized). +/// * `Static(&'static str)` - A static string slice (zero heap allocations). +/// * `Shared(Arc)` - A reference-counted string slice (single heap allocation, optimized). /// * `Owned(String)` - An owned heap string used for builder-style mutation. -/// * `ArcString(Arc)` - A reference-counted string (for backwards compatibility). -/// * `ArcVecString(Arc>)` - A reference-counted byte vector. -/// * `Bytes(bytes::Bytes)` - A byte buffer (available when the "bytes" feature is enabled). 
#[derive(Clone)] pub(super) enum InnerString { /// Inline storage for short strings (up to 23 bytes). @@ -1325,20 +1273,12 @@ pub(super) enum InnerString { data: [u8; INLINE_CAPACITY], }, /// Static string slice with 'static lifetime. - StaticStr(&'static str), + Static(&'static str), /// Reference-counted string slice (single heap allocation). /// Preferred for long immutable strings created from owned or borrowed data. - ArcStr(Arc), + Shared(Arc), /// Owned heap-allocated string used when exclusive mutability matters. Owned(String), - /// Reference-counted heap-allocated string. - /// Kept for backwards compatibility and when Arc is explicitly provided. - ArcString(Arc), - /// Reference-counted heap-allocated byte vector. - ArcVecString(Arc>), - /// Bytes type integration (requires "bytes" feature). - #[cfg(feature = "bytes")] - Bytes(bytes::Bytes), } // Sealed trait pattern to support both &str and char in starts_with/ends_with/contains @@ -1515,14 +1455,14 @@ mod tests { } #[test] - fn long_borrowed_str_uses_arc_str_storage() { + fn long_borrowed_str_uses_shared_storage() { let value = "a".repeat(INLINE_CAPACITY + 1); let s = CheetahString::from_slice(&value); match &s.inner { - InnerString::ArcStr(inner) => assert_eq!(inner.as_ref(), value.as_str()), + InnerString::Shared(inner) => assert_eq!(inner.as_ref(), value.as_str()), other => panic!( - "expected ArcStr for long borrowed input, got {:?}", + "expected Shared for long borrowed input, got {:?}", core::mem::discriminant(other) ), } @@ -1545,17 +1485,17 @@ mod tests { } #[test] - fn long_vec_conversion_uses_arc_vec_storage() { + fn long_vec_conversion_uses_owned_storage() { let value = "a".repeat(INLINE_CAPACITY + 1).into_bytes(); let s = CheetahString::try_from_vec(value).expect("valid utf-8"); match &s.inner { - InnerString::ArcVecString(inner) => { + InnerString::Owned(inner) => { assert_eq!(inner.len(), INLINE_CAPACITY + 1); - assert_eq!(inner.as_slice(), vec![b'a'; INLINE_CAPACITY + 1].as_slice()); + 
assert_eq!(inner.as_bytes(), vec![b'a'; INLINE_CAPACITY + 1].as_slice()); } other => panic!( - "expected ArcVecString for long Vec conversion, got {:?}", + "expected Owned for long Vec conversion, got {:?}", core::mem::discriminant(other) ), } diff --git a/tests/basic.rs b/tests/basic.rs index 4c8329a..7f610b9 100644 --- a/tests/basic.rs +++ b/tests/basic.rs @@ -171,7 +171,7 @@ fn test_into_string_reuses_unique_arc_string_buffer() { } #[test] -fn test_into_string_clones_shared_arc_string_buffer() { +fn test_into_string_reuses_owned_arc_string_buffer_after_clone() { let value = "a".repeat(64); let arc = Arc::new(value); let original_ptr = arc.as_bytes().as_ptr(); @@ -180,10 +180,24 @@ fn test_into_string_clones_shared_arc_string_buffer() { let shared = s.clone(); let owned: String = s.into(); - assert_ne!(owned.as_bytes().as_ptr(), original_ptr); + assert_eq!(owned.as_bytes().as_ptr(), original_ptr); + assert_ne!(shared.as_bytes().as_ptr(), original_ptr); assert_eq!(shared.as_str(), "a".repeat(64)); } +#[test] +fn test_into_string_copies_shared_arc_string_input() { + let value = "a".repeat(64); + let arc = Arc::new(value); + let original_ptr = arc.as_bytes().as_ptr(); + let _held = Arc::clone(&arc); + + let s = CheetahString::from(arc); + let owned: String = s.into(); + + assert_ne!(owned.as_bytes().as_ptr(), original_ptr); +} + #[test] fn test_into_string_reuses_unique_vec_buffer() { let bytes = vec![b'a'; 64]; @@ -196,7 +210,7 @@ fn test_into_string_reuses_unique_vec_buffer() { } #[test] -fn test_into_string_clones_shared_vec_buffer() { +fn test_into_string_reuses_owned_vec_buffer_after_clone() { let bytes = vec![b'a'; 64]; let original_ptr = bytes.as_ptr(); @@ -204,7 +218,8 @@ fn test_into_string_clones_shared_vec_buffer() { let shared = s.clone(); let owned: String = s.into(); - assert_ne!(owned.as_bytes().as_ptr(), original_ptr); + assert_eq!(owned.as_bytes().as_ptr(), original_ptr); + assert_ne!(shared.as_bytes().as_ptr(), original_ptr); 
assert_eq!(shared.as_str(), "a".repeat(64)); } From 623dd3d1d37537b159f6e4d53e27a5db207552a0 Mon Sep 17 00:00:00 2001 From: mxsm Date: Sat, 20 Jun 2026 10:09:39 +0800 Subject: [PATCH 7/8] Feat: add CheetahBytes byte semantics --- Cargo.toml | 1 + README.md | 3 +- src/bytes.rs | 222 +++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 11 ++- tests/bytes.rs | 51 ++++++++++++ 5 files changed, 285 insertions(+), 3 deletions(-) create mode 100644 src/bytes.rs create mode 100644 tests/bytes.rs diff --git a/Cargo.toml b/Cargo.toml index 2a1d9fd..0493390 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ simd = [] [dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } +serde_json = "1.0" [[bench]] diff --git a/README.md b/README.md index c1d6f91..713362b 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ cheetah-string = { version = "1.1.0", features = ["bytes", "serde", "simd"] } Available features: - `std` (default): Enable standard library support -- `bytes`: Integration with the `bytes` crate +- `bytes`: `CheetahBytes` and integration with the `bytes` crate - `serde`: Serialization support via serde - `simd`: SIMD-accelerated string operations (x86_64 SSE2) @@ -138,6 +138,7 @@ CheetahString intelligently chooses the most efficient storage: - `from_static_str(s)` - Zero-cost wrapper for `'static str` - `from_string(s)` - From owned `String` - `try_from_bytes(b)` - Safe construction from bytes with UTF-8 validation +- `CheetahBytes` - Byte-oriented companion type available with the `bytes` feature - `with_capacity(n)` - Pre-allocate capacity ### Query Methods diff --git a/src/bytes.rs b/src/bytes.rs new file mode 100644 index 0000000..9c78bb2 --- /dev/null +++ b/src/bytes.rs @@ -0,0 +1,222 @@ +use crate::CheetahString; +use alloc::vec::Vec; +use core::fmt; +use core::ops::Deref; +use core::str::Utf8Error; + +/// Byte-oriented companion type for `CheetahString`. 
+/// +/// `CheetahBytes` does not promise UTF-8 and never dereferences to `str`. +/// Convert it to `CheetahString` with `TryFrom` or an explicit unsafe method. +#[derive(Clone, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct CheetahBytes { + inner: ::bytes::Bytes, +} + +impl CheetahBytes { + #[inline] + pub fn new() -> Self { + Self::default() + } + + #[inline] + pub fn from_static(bytes: &'static [u8]) -> Self { + Self { + inner: ::bytes::Bytes::from_static(bytes), + } + } + + #[inline] + pub fn from_vec(bytes: Vec) -> Self { + Self { + inner: ::bytes::Bytes::from(bytes), + } + } + + #[inline] + pub fn from_bytes(bytes: ::bytes::Bytes) -> Self { + Self { inner: bytes } + } + + #[inline] + pub fn as_bytes(&self) -> &[u8] { + self.inner.as_ref() + } + + #[inline] + pub fn len(&self) -> usize { + self.inner.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + #[inline] + pub fn into_bytes(self) -> ::bytes::Bytes { + self.inner + } + + #[inline] + pub fn try_into_string(self) -> Result { + CheetahString::try_from_bytes_buf(self.inner) + } + + /// Converts bytes into `CheetahString` without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that the bytes contain valid UTF-8. + #[inline] + pub unsafe fn into_string_unchecked(self) -> CheetahString { + // SAFETY: The caller guarantees valid UTF-8. 
+ unsafe { CheetahString::from_utf8_unchecked_bytes_buf(self.inner) } + } +} + +impl AsRef<[u8]> for CheetahBytes { + #[inline] + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +impl Deref for CheetahBytes { + type Target = [u8]; + + #[inline] + fn deref(&self) -> &Self::Target { + self.as_bytes() + } +} + +impl fmt::Debug for CheetahBytes { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("CheetahBytes") + .field(&self.as_bytes()) + .finish() + } +} + +impl From> for CheetahBytes { + #[inline] + fn from(bytes: Vec) -> Self { + Self::from_vec(bytes) + } +} + +impl From<&'static [u8]> for CheetahBytes { + #[inline] + fn from(bytes: &'static [u8]) -> Self { + Self::from_static(bytes) + } +} + +impl From<::bytes::Bytes> for CheetahBytes { + #[inline] + fn from(bytes: ::bytes::Bytes) -> Self { + Self::from_bytes(bytes) + } +} + +impl From for ::bytes::Bytes { + #[inline] + fn from(bytes: CheetahBytes) -> Self { + bytes.into_bytes() + } +} + +impl TryFrom for CheetahString { + type Error = Utf8Error; + + #[inline] + fn try_from(bytes: CheetahBytes) -> Result { + bytes.try_into_string() + } +} + +impl TryFrom<&CheetahBytes> for CheetahString { + type Error = Utf8Error; + + #[inline] + fn try_from(bytes: &CheetahBytes) -> Result { + CheetahString::try_from_bytes(bytes.as_bytes()) + } +} + +#[cfg(feature = "serde")] +mod serde_impl { + use super::CheetahBytes; + use alloc::vec::Vec; + use core::fmt; + use serde::de::{Error, SeqAccess, Visitor}; + use serde::{Deserialize, Deserializer, Serialize, Serializer}; + + impl Serialize for CheetahBytes { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_bytes(self.as_bytes()) + } + } + + impl<'de> Deserialize<'de> for CheetahBytes { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct CheetahBytesVisitor; + + impl<'de> Visitor<'de> for CheetahBytesVisitor { + type Value = CheetahBytes; + + fn expecting(&self, 
formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("a byte buffer") + } + + fn visit_bytes<E>(self, v: &[u8]) -> Result<Self::Value, E> + where + E: Error, + { + Ok(CheetahBytes::from_vec(v.to_vec())) + } + + fn visit_borrowed_bytes<E>(self, v: &'de [u8]) -> Result<Self::Value, E> + where + E: Error, + { + Ok(CheetahBytes::from_vec(v.to_vec())) + } + + fn visit_byte_buf<E>(self, v: Vec<u8>) -> Result<Self::Value, E> + where + E: Error, + { + Ok(CheetahBytes::from_vec(v)) + } + + fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> + where + E: Error, + { + Ok(CheetahBytes::from_vec(v.as_bytes().to_vec())) + } + + fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error> + where + A: SeqAccess<'de>, + { + let mut bytes = Vec::<u8>::with_capacity(seq.size_hint().unwrap_or(0).min(4096)); // size_hint comes from the input being decoded; cap the preallocation and let push() grow the buffer + while let Some(byte) = seq.next_element()? { + bytes.push(byte); + } + Ok(CheetahBytes::from_vec(bytes)) + } + } + + deserializer.deserialize_byte_buf(CheetahBytesVisitor) + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 76eead9..c8721cf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,8 +1,8 @@ #![cfg_attr(not(feature = "std"), no_std)] -//! No more relying solely on the standard library's String! CheetahString is a versatile string type that can store static strings, dynamic strings, and byte arrays. +//! No more relying solely on the standard library's String! CheetahString is a versatile string type that can store static and dynamic strings. //! It is usable in both `std` and `no_std` environments. Additionally, CheetahString supports serde for serialization and deserialization. -//! CheetahString also supports the `bytes` feature, allowing conversion to the `bytes::Bytes` type. +//! The `bytes` feature exposes `CheetahBytes` for byte-oriented data. //! It minimizes allocations across small, shared, and builder-oriented string workloads. //! Substring search uses `memchr`/`memmem` by default. //!
@@ -62,12 +62,19 @@ mod cheetah_string; mod error; mod search; +#[cfg(feature = "bytes")] +#[path = "bytes.rs"] +mod cheetah_bytes; + #[cfg(feature = "serde")] mod serde; #[cfg(all(feature = "simd", target_arch = "x86_64"))] mod simd; +#[cfg(feature = "bytes")] +pub use cheetah_bytes::CheetahBytes; + pub use cheetah_string::{CheetahString, SplitPattern, SplitStr, SplitWrapper, StrPattern}; pub use error::{Error, Result}; pub use search::CheetahFinder; diff --git a/tests/bytes.rs b/tests/bytes.rs new file mode 100644 index 0000000..e9dc33b --- /dev/null +++ b/tests/bytes.rs @@ -0,0 +1,51 @@ +#![cfg(feature = "bytes")] + +use cheetah_string::{CheetahBytes, CheetahString}; + +#[test] +fn cheetah_bytes_accepts_invalid_utf8() { + let bytes = CheetahBytes::from_vec(vec![0, 159, 146, 150, 255]); + + assert_eq!(bytes.len(), 5); + assert_eq!(bytes.as_bytes(), &[0, 159, 146, 150, 255]); +} + +#[test] +fn cheetah_bytes_try_into_string_validates_utf8() { + let valid = CheetahBytes::from_vec(b"hello".to_vec()); + let s = CheetahString::try_from(valid).unwrap(); + assert_eq!(s, "hello"); + + let invalid = CheetahBytes::from_vec(vec![0xFF, 0xFE]); + assert!(CheetahString::try_from(invalid).is_err()); +} + +#[test] +fn cheetah_bytes_roundtrips_bytes_crate_type() { + let raw = bytes::Bytes::from_static(b"payload"); + let cheetah = CheetahBytes::from(raw.clone()); + let roundtrip: bytes::Bytes = cheetah.into(); + + assert_eq!(roundtrip, raw); +} + +#[test] +fn unsafe_bytes_to_string_conversion_is_explicit() { + let bytes = CheetahBytes::from_vec(b"hello".to_vec()); + + // SAFETY: The test input is valid UTF-8. 
+ let s = unsafe { bytes.into_string_unchecked() }; + + assert_eq!(s, "hello"); +} + +#[cfg(feature = "serde")] +#[test] +fn serde_uses_bytes_semantics_for_cheetah_bytes() { + let bytes = CheetahBytes::from_vec(vec![0, 1, 255]); + let json = serde_json::to_string(&bytes).unwrap(); + assert_eq!(json, "[0,1,255]"); + + let decoded: CheetahBytes = serde_json::from_str(&json).unwrap(); + assert_eq!(decoded.as_bytes(), &[0, 1, 255]); +} From 24d0fcd76edd312e5a4e50084497f6d72cc8b0b6 Mon Sep 17 00:00:00 2001 From: mxsm Date: Sat, 20 Jun 2026 10:13:43 +0800 Subject: [PATCH 8/8] Feat: add RocketMQ workload benchmarks --- Cargo.toml | 14 ++++ benches/mq_properties.rs | 151 ++++++++++++++++++++++++++++++++++ benches/mq_remoting_header.rs | 148 +++++++++++++++++++++++++++++++++ benches/mq_topic.rs | 139 +++++++++++++++++++++++++++++++ scripts/bench-all.ps1 | 9 ++ scripts/bench-all.sh | 3 + 6 files changed, 464 insertions(+) create mode 100644 benches/mq_properties.rs create mode 100644 benches/mq_remoting_header.rs create mode 100644 benches/mq_topic.rs diff --git a/Cargo.toml b/Cargo.toml index 0493390..4b14c4c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,8 +27,10 @@ bytes = ["dep:bytes"] simd = [] [dev-dependencies] +compact_str = "0.8" criterion = { version = "0.5", features = ["html_reports"] } serde_json = "1.0" +smartstring = "1.0" [[bench]] @@ -47,6 +49,18 @@ harness = false name = "mutation" harness = false +[[bench]] +name = "mq_properties" +harness = false + +[[bench]] +name = "mq_remoting_header" +harness = false + +[[bench]] +name = "mq_topic" +harness = false + [[bench]] name = "pattern" harness = false diff --git a/benches/mq_properties.rs b/benches/mq_properties.rs new file mode 100644 index 0000000..a7e343e --- /dev/null +++ b/benches/mq_properties.rs @@ -0,0 +1,151 @@ +use cheetah_string::CheetahString; +use compact_str::CompactString; +use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use smartstring::alias::String as 
SmartString; +use std::collections::HashMap; + +fn properties() -> Vec<(&'static str, &'static str)> { + vec![ + ("KEYS", "order-10001"), + ("TAGS", "paid"), + ("WAIT", "false"), + ("DELAY", "0"), + ("RETRY_TOPIC", "order-service"), + ("REAL_TOPIC", "order-created"), + ("REAL_QID", "4"), + ("TRAN_MSG", "false"), + ("PGROUP", "order-consumer"), + ("MIN_OFFSET", "1024"), + ("MAX_OFFSET", "2048"), + ("BUYER_ID", "u-10001"), + ("TRACE_ON", "true"), + ("INSTANCE_ID", "rmq-prod-a"), + ("CORRELATION_ID", "corr-10001"), + ("REPLY_TO_CLIENT", "client-7"), + ("TTL", "30000"), + ("UNIQ_KEY", "7F00000100002A9F000000000001"), + ("BORN_HOST", "10.0.0.1"), + ("STORE_HOST", "10.0.0.2"), + ] +} + +fn bench_property_build(c: &mut Criterion) { + let props = properties(); + let mut group = c.benchmark_group("mq_property_build"); + group.throughput(Throughput::Elements(props.len() as u64)); + + group.bench_function("String", |b| { + b.iter(|| { + let mut map = HashMap::with_capacity(props.len()); + for (key, value) in &props { + map.insert( + black_box((*key).to_string()), + black_box((*value).to_string()), + ); + } + black_box(map) + }) + }); + + group.bench_function("CompactString", |b| { + b.iter(|| { + let mut map = HashMap::with_capacity(props.len()); + for (key, value) in &props { + map.insert( + black_box(CompactString::from(*key)), + black_box(CompactString::from(*value)), + ); + } + black_box(map) + }) + }); + + group.bench_function("SmartString", |b| { + b.iter(|| { + let mut map = HashMap::with_capacity(props.len()); + for (key, value) in &props { + map.insert( + black_box(SmartString::from(*key)), + black_box(SmartString::from(*value)), + ); + } + black_box(map) + }) + }); + + group.bench_function("CheetahString", |b| { + b.iter(|| { + let mut map = HashMap::with_capacity(props.len()); + for (key, value) in &props { + map.insert( + black_box(CheetahString::from(*key)), + black_box(CheetahString::from(*value)), + ); + } + black_box(map) + }) + }); + + group.finish(); +} + 
+fn bench_property_lookup(c: &mut Criterion) { + let props = properties(); + let string_map: HashMap = props + .iter() + .map(|(key, value)| ((*key).to_string(), (*value).to_string())) + .collect(); + let compact_map: HashMap = props + .iter() + .map(|(key, value)| (CompactString::from(*key), CompactString::from(*value))) + .collect(); + let smart_map: HashMap = props + .iter() + .map(|(key, value)| (SmartString::from(*key), SmartString::from(*value))) + .collect(); + let cheetah_map: HashMap = props + .iter() + .map(|(key, value)| (CheetahString::from(*key), CheetahString::from(*value))) + .collect(); + + let mut group = c.benchmark_group("mq_property_lookup"); + group.throughput(Throughput::Elements(4)); + + group.bench_function("String", |b| { + b.iter(|| { + black_box(string_map.get("UNIQ_KEY")); + black_box(string_map.get("TAGS")); + black_box(string_map.get("PGROUP")); + black_box(string_map.get("MISSING")) + }) + }); + group.bench_function("CompactString", |b| { + b.iter(|| { + black_box(compact_map.get("UNIQ_KEY")); + black_box(compact_map.get("TAGS")); + black_box(compact_map.get("PGROUP")); + black_box(compact_map.get("MISSING")) + }) + }); + group.bench_function("SmartString", |b| { + b.iter(|| { + black_box(smart_map.get("UNIQ_KEY")); + black_box(smart_map.get("TAGS")); + black_box(smart_map.get("PGROUP")); + black_box(smart_map.get("MISSING")) + }) + }); + group.bench_function("CheetahString", |b| { + b.iter(|| { + black_box(cheetah_map.get("UNIQ_KEY")); + black_box(cheetah_map.get("TAGS")); + black_box(cheetah_map.get("PGROUP")); + black_box(cheetah_map.get("MISSING")) + }) + }); + + group.finish(); +} + +criterion_group!(benches, bench_property_build, bench_property_lookup); +criterion_main!(benches); diff --git a/benches/mq_remoting_header.rs b/benches/mq_remoting_header.rs new file mode 100644 index 0000000..f959870 --- /dev/null +++ b/benches/mq_remoting_header.rs @@ -0,0 +1,148 @@ +use cheetah_string::CheetahString; +use 
compact_str::CompactString; +use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use smartstring::alias::String as SmartString; + +fn header_fields() -> Vec<(&'static str, &'static str)> { + vec![ + ("code", "310"), + ("language", "RUST"), + ("version", "455"), + ("opaque", "10001"), + ("flag", "0"), + ("remark", ""), + ("serializeTypeCurrentRPC", "JSON"), + ("topic", "RMQ_SYS_TRACE_TOPIC_00001"), + ("queueId", "4"), + ("bornTimestamp", "1700000000000"), + ("storeTimestamp", "1700000001000"), + ("producerGroup", "order-producer"), + ("consumerGroup", "order-consumer"), + ] +} + +fn encode_pairs(fields: I) -> String +where + I: IntoIterator, + K: AsRef, + V: AsRef, +{ + let mut encoded = String::with_capacity(256); + for (key, value) in fields { + encoded.push_str(key.as_ref()); + encoded.push('='); + encoded.push_str(value.as_ref()); + encoded.push('\n'); + } + encoded +} + +fn bench_header_encode(c: &mut Criterion) { + let fields = header_fields(); + let mut group = c.benchmark_group("mq_remoting_header_encode"); + group.throughput(Throughput::Elements(fields.len() as u64)); + + group.bench_function("String", |b| { + b.iter(|| { + black_box(encode_pairs( + fields + .iter() + .map(|(key, value)| ((*key).to_string(), (*value).to_string())), + )) + }) + }); + + group.bench_function("CompactString", |b| { + b.iter(|| { + black_box(encode_pairs(fields.iter().map(|(key, value)| { + (CompactString::from(*key), CompactString::from(*value)) + }))) + }) + }); + + group.bench_function("SmartString", |b| { + b.iter(|| { + black_box(encode_pairs(fields.iter().map(|(key, value)| { + (SmartString::from(*key), SmartString::from(*value)) + }))) + }) + }); + + group.bench_function("CheetahString", |b| { + b.iter(|| { + black_box(encode_pairs(fields.iter().map(|(key, value)| { + (CheetahString::from(*key), CheetahString::from(*value)) + }))) + }) + }); + + group.finish(); +} + +fn bench_header_parse(c: &mut Criterion) { + let encoded = 
encode_pairs(header_fields()); + let mut group = c.benchmark_group("mq_remoting_header_parse"); + group.throughput(Throughput::Bytes(encoded.len() as u64)); + + group.bench_function("String", |b| { + b.iter(|| { + let mut fields = Vec::new(); + for line in encoded.lines() { + if let Some((key, value)) = line.split_once('=') { + fields.push((black_box(key.to_string()), black_box(value.to_string()))); + } + } + black_box(fields) + }) + }); + + group.bench_function("CompactString", |b| { + b.iter(|| { + let mut fields = Vec::new(); + for line in encoded.lines() { + if let Some((key, value)) = line.split_once('=') { + fields.push(( + black_box(CompactString::from(key)), + black_box(CompactString::from(value)), + )); + } + } + black_box(fields) + }) + }); + + group.bench_function("SmartString", |b| { + b.iter(|| { + let mut fields = Vec::new(); + for line in encoded.lines() { + if let Some((key, value)) = line.split_once('=') { + fields.push(( + black_box(SmartString::from(key)), + black_box(SmartString::from(value)), + )); + } + } + black_box(fields) + }) + }); + + group.bench_function("CheetahString", |b| { + b.iter(|| { + let mut fields = Vec::new(); + for line in encoded.lines() { + if let Some((key, value)) = line.split_once('=') { + fields.push(( + black_box(CheetahString::from(key)), + black_box(CheetahString::from(value)), + )); + } + } + black_box(fields) + }) + }); + + group.finish(); +} + +criterion_group!(benches, bench_header_encode, bench_header_parse); +criterion_main!(benches); diff --git a/benches/mq_topic.rs b/benches/mq_topic.rs new file mode 100644 index 0000000..467cb60 --- /dev/null +++ b/benches/mq_topic.rs @@ -0,0 +1,139 @@ +use cheetah_string::CheetahString; +use compact_str::CompactString; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use smartstring::alias::String as SmartString; +use std::collections::HashMap; +use std::sync::Arc; + +const TOPIC_COUNT: usize = 10_000; + +fn topics() -> Vec 
{ + (0..TOPIC_COUNT) + .map(|i| format!("RMQ_SYS_TRACE_TOPIC_{:05}", i)) + .collect() +} + +fn bench_topic_insert(c: &mut Criterion) { + let topics = topics(); + let mut group = c.benchmark_group("mq_topic_insert"); + group.throughput(Throughput::Elements(TOPIC_COUNT as u64)); + + group.bench_function("String", |b| { + b.iter(|| { + let mut map = HashMap::with_capacity(TOPIC_COUNT); + for (idx, topic) in topics.iter().enumerate() { + map.insert(black_box(topic.clone()), idx); + } + black_box(map) + }) + }); + + group.bench_function("Arc", |b| { + b.iter(|| { + let mut map = HashMap::with_capacity(TOPIC_COUNT); + for (idx, topic) in topics.iter().enumerate() { + map.insert(black_box(Arc::<str>::from(topic.as_str())), idx); + } + black_box(map) + }) + }); + + group.bench_function("CompactString", |b| { + b.iter(|| { + let mut map = HashMap::with_capacity(TOPIC_COUNT); + for (idx, topic) in topics.iter().enumerate() { + map.insert(black_box(CompactString::from(topic.as_str())), idx); + } + black_box(map) + }) + }); + + group.bench_function("SmartString", |b| { + b.iter(|| { + let mut map = HashMap::with_capacity(TOPIC_COUNT); + for (idx, topic) in topics.iter().enumerate() { + map.insert(black_box(SmartString::from(topic.as_str())), idx); + } + black_box(map) + }) + }); + + group.bench_function("CheetahString", |b| { + b.iter(|| { + let mut map = HashMap::with_capacity(TOPIC_COUNT); + for (idx, topic) in topics.iter().enumerate() { + map.insert(black_box(CheetahString::from(topic.as_str())), idx); + } + black_box(map) + }) + }); + + group.finish(); +} + +fn bench_topic_lookup(c: &mut Criterion) { + let topics = topics(); + let string_map: HashMap<String, usize> = topics + .iter() + .enumerate() + .map(|(idx, topic)| (topic.clone(), idx)) + .collect(); + let arc_map: HashMap<Arc<str>, usize> = topics + .iter() + .enumerate() + .map(|(idx, topic)| (Arc::<str>::from(topic.as_str()), idx)) + .collect(); + let compact_map: HashMap<CompactString, usize> = topics + .iter() + .enumerate() + .map(|(idx, topic)| 
(CompactString::from(topic.as_str()), idx)) + .collect(); + let smart_map: HashMap<SmartString, usize> = topics + .iter() + .enumerate() + .map(|(idx, topic)| (SmartString::from(topic.as_str()), idx)) + .collect(); + let cheetah_map: HashMap<CheetahString, usize> = topics + .iter() + .enumerate() + .map(|(idx, topic)| (CheetahString::from(topic.as_str()), idx)) + .collect(); + + let needles = [ + "RMQ_SYS_TRACE_TOPIC_00000", + "RMQ_SYS_TRACE_TOPIC_01024", + "RMQ_SYS_TRACE_TOPIC_09999", + ]; + + let mut group = c.benchmark_group("mq_topic_lookup"); + group.throughput(Throughput::Elements(1)); + + for needle in needles { + group.bench_with_input(BenchmarkId::new("String", needle), needle, |b, needle| { + b.iter(|| black_box(string_map.get(black_box(needle)))) + }); + group.bench_with_input(BenchmarkId::new("Arc", needle), needle, |b, needle| { + b.iter(|| black_box(arc_map.get(black_box(needle)))) + }); + group.bench_with_input( + BenchmarkId::new("CompactString", needle), + needle, + |b, needle| b.iter(|| black_box(compact_map.get(black_box(needle)))), + ); + group.bench_with_input( + BenchmarkId::new("SmartString", needle), + needle, + |b, needle| b.iter(|| black_box(smart_map.get(black_box(needle)))), + ); + group.bench_with_input( + BenchmarkId::new("CheetahString", needle), + needle, + |b, needle| b.iter(|| black_box(cheetah_map.get(black_box(needle)))), + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_topic_insert, bench_topic_lookup); +criterion_main!(benches); diff --git a/scripts/bench-all.ps1 b/scripts/bench-all.ps1 index 2bbc772..cb9cf35 100644 --- a/scripts/bench-all.ps1 +++ b/scripts/bench-all.ps1 @@ -15,6 +15,15 @@ cargo bench --bench comprehensive | cargo bench --bench mutation | Tee-Object -FilePath (Join-Path $ResultDir "mutation.txt") +cargo bench --bench mq_topic | + Tee-Object -FilePath (Join-Path $ResultDir "mq-topic.txt") + +cargo bench --bench mq_properties | + Tee-Object -FilePath (Join-Path $ResultDir "mq-properties.txt") + +cargo bench --bench 
mq_remoting_header | + Tee-Object -FilePath (Join-Path $ResultDir "mq-remoting-header.txt") + cargo bench --bench pattern | Tee-Object -FilePath (Join-Path $ResultDir "pattern.txt") diff --git a/scripts/bench-all.sh b/scripts/bench-all.sh index 062efe8..a86a9df 100755 --- a/scripts/bench-all.sh +++ b/scripts/bench-all.sh @@ -10,5 +10,8 @@ cargo test layout_snapshot --all-features -- --nocapture | tee "$RESULT_DIR/layo cargo bench --bench layout | tee "$RESULT_DIR/layout-bench.txt" cargo bench --bench comprehensive | tee "$RESULT_DIR/comprehensive.txt" cargo bench --bench mutation | tee "$RESULT_DIR/mutation.txt" +cargo bench --bench mq_topic | tee "$RESULT_DIR/mq-topic.txt" +cargo bench --bench mq_properties | tee "$RESULT_DIR/mq-properties.txt" +cargo bench --bench mq_remoting_header | tee "$RESULT_DIR/mq-remoting-header.txt" cargo bench --bench pattern | tee "$RESULT_DIR/pattern.txt" cargo bench --bench simd --features simd | tee "$RESULT_DIR/simd.txt"