diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9aec1ae..f2550bc 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -10,8 +10,8 @@ env: CI: true jobs: - build: - runs-on: ubuntu-latest + build: + runs-on: ${{ matrix.os }} strategy: matrix: @@ -50,3 +50,14 @@ jobs: - name: Test run: cargo test --verbose --all-features + + - name: Layout snapshot + run: cargo test layout_snapshot --all-features -- --nocapture + + - name: Upload layout artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: layout-${{ matrix.os }}-${{ matrix.rust }} + path: target/layout-artifacts + if-no-files-found: ignore diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..d3f5beb --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,110 @@ +name: Release + +on: + push: + tags: + - "v*.*.*" + workflow_dispatch: + inputs: + version: + description: "Version to release, for example 1.1.0" + required: true + type: string + +permissions: + contents: write + +env: + CARGO_TERM_COLOR: always + RUST_BACKTRACE: full + +jobs: + publish: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Rust + uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + + - name: Resolve release version + id: version + shell: bash + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + VERSION="${{ inputs.version }}" + else + VERSION="${GITHUB_REF_NAME#v}" + fi + + if [[ ! "$VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+([+-][0-9A-Za-z.-]+)?$ ]]; then + echo "Invalid release version: $VERSION" >&2 + exit 1 + fi + + MANIFEST_VERSION="$(cargo metadata --no-deps --format-version 1 | python3 -c 'import json, sys; print(json.load(sys.stdin)["packages"][0]["version"])')" + if [[ "$MANIFEST_VERSION" != "$VERSION" ]]; then + echo "Cargo.toml version $MANIFEST_VERSION does not match release version $VERSION" >&2 + exit 1 + fi + + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "tag=v$VERSION" >> "$GITHUB_OUTPUT" + + - name: Create tag for manual release + if: github.event_name == 'workflow_dispatch' + shell: bash + run: | + TAG="${{ steps.version.outputs.tag }}" + CURRENT_SHA="$(git rev-parse HEAD)" + + if git rev-parse "$TAG" >/dev/null 2>&1; then + TAG_SHA="$(git rev-list -n 1 "$TAG")" + if [[ "$TAG_SHA" != "$CURRENT_SHA" ]]; then + echo "Tag $TAG already exists at $TAG_SHA, not current HEAD $CURRENT_SHA" >&2 + exit 1 + fi + else + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git tag "$TAG" + git push origin "$TAG" + fi + + - name: Check formatting + run: cargo fmt -- --check + + - name: Lint + run: cargo clippy --all-features -- -D warnings + + - name: Test all features + run: cargo test --all-features + + - name: Test no-default feature matrix + run: cargo test --no-default-features --features serde,bytes,simd + + - name: Package crate + run: cargo package + + - name: Publish crate to crates.io + run: cargo publish --token "$CARGO_REGISTRY_TOKEN" + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + + - name: Create GitHub release + env: + GH_TOKEN: ${{ github.token }} + TAG: ${{ steps.version.outputs.tag }} + VERSION: ${{ steps.version.outputs.version }} + run: | + gh release create "$TAG" \ + "target/package/cheetah-string-$VERSION.crate#cheetah-string-$VERSION.crate" \ + --verify-tag \ + --title "cheetah-string $TAG" \ + --generate-notes diff --git a/Cargo.toml b/Cargo.toml index a258edb..2a1d9fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cheetah-string" -version = "1.0.1" +version = "1.1.0" authors = ["mxsm "] edition = "2021" homepage = "https://github.com/mxsm/cheetah-string" @@ -15,14 +15,15 @@ A lightweight, high-performance string manipulation library optimized for speed- """ [dependencies] -bytes = "1.10.0" +bytes = { version = "1.10.0", optional = true, default-features = false } +memchr = { version = "2", default-features = false } serde = { version = "1.0", optional = true, default-features = false, features = ["alloc"] } [features] default = ["std"] -std = [] -serde = ["serde/alloc"] -bytes = [] +std = ["memchr/std"] +serde = ["dep:serde", "serde/alloc"] +bytes = ["dep:bytes"] simd = [] [dev-dependencies] @@ -37,6 +38,18 @@ harness = false name = "comprehensive" harness = false +[[bench]] +name = "layout" +harness = false + +[[bench]] +name = "mutation" +harness = false + +[[bench]] +name = "pattern" +harness = false + [[bench]] name = "simd" harness = false diff --git a/README.md b/README.md index 9156ce1..c1d6f91 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,8 @@ CheetahString is a versatile string type that goes beyond the standard library's - **⚡ Performance Focused** - Optimized for common string operations - Reduced memory allocations via intelligent internal representation - - Optional SIMD acceleration for string matching operations (x86_64 SSE2) + - `memchr`/`memmem` substring search by default + - Optional SIMD acceleration for selected byte comparisons (x86_64 SSE2) - Benchmarked against standard library types - **🛡️ Safe & Correct** @@ -45,14 +46,14 @@ Add this to your `Cargo.toml`: ```toml [dependencies] -cheetah-string = "1.0.0" +cheetah-string = "1.1.0" ``` ### Optional Features ```toml [dependencies] -cheetah-string = { version = "1.0.0", features = ["bytes", "serde", "simd"] } +cheetah-string = { version = "1.1.0", features = ["bytes", "serde", "simd"] } ``` Available features: @@ -106,7 +107,7 @@ CheetahString is designed with performance in mind: - **Efficient Sharing**: Large immutable strings use `Arc` for cheap cloning - **Fast Builders**: Capacity-preserving builder paths use owned heap storage for direct mutation - **Optimized Operations**: Common operations like concatenation have fast-path implementations -- **SIMD Acceleration** (with `simd` feature): String matching operations (`starts_with`, `ends_with`, `contains`, `find`, equality comparisons) are accelerated using SSE2 SIMD instructions on x86_64 platforms. The implementation automatically falls back to scalar code for small inputs or when SIMD is not available. +- **Search Acceleration**: Substring search uses `memchr`/`memmem` by default. With the `simd` feature, selected byte comparisons such as prefix, suffix, and equality paths can use SSE2 on x86_64 platforms. Run benchmarks: ```bash diff --git a/bench-results/README.md b/bench-results/README.md new file mode 100644 index 0000000..1e0fc59 --- /dev/null +++ b/bench-results/README.md @@ -0,0 +1,53 @@ +# Benchmark Artifacts + +This directory defines the artifact layout for performance-sensitive changes. +Generated benchmark output should be committed only when it is intentionally +used as review evidence for a release or PR. + +Recommended layout: + +```text +bench-results/ + layout/ + current.json + v1.1.json + v1.2.json + v2-packed.json + criterion/ + before/ + after/ + mq/ + topic.json + properties.json + remoting-header.json + summaries/ + summary-v1.1-v1.2.md + summary-v1.2-v2-packed.md +``` + +Minimum metadata for generated JSON artifacts: + +```json +{ + "crate": "cheetah-string", + "version": "1.1.0", + "profile": "release", + "target": "x86_64-unknown-linux-gnu", + "rustc": "rustc 1.xx.x", + "os": "linux", + "cpu": "model name", + "bench": "layout" +} +``` + +For local capture, run: + +```bash +scripts/bench-all.sh current +``` + +On Windows PowerShell: + +```powershell +scripts/bench-all.ps1 current +``` diff --git a/benches/comprehensive.rs b/benches/comprehensive.rs index 14a405f..174ce1a 100644 --- a/benches/comprehensive.rs +++ b/benches/comprehensive.rs @@ -362,13 +362,13 @@ fn bench_internal_hot_paths(c: &mut Criterion) { }); let long_bytes = vec![b'a'; 256]; - group.bench_function("CheetahString::from(Vec 256B)", |b| { - b.iter(|| black_box(CheetahString::from(long_bytes.clone()))) + group.bench_function("CheetahString::try_from_vec(256B)", |b| { + b.iter(|| black_box(CheetahString::try_from_vec(long_bytes.clone()).unwrap())) }); - group.bench_function("String::from(CheetahString::from(Vec 256B))", |b| { + group.bench_function("String::from(CheetahString::try_from_vec(256B))", |b| { b.iter(|| { - let value = CheetahString::from(long_bytes.clone()); + let value = CheetahString::try_from_vec(long_bytes.clone()).unwrap(); black_box(String::from(value)) }) }); diff --git a/benches/layout.rs b/benches/layout.rs new file mode 100644 index 0000000..ad7eb52 --- /dev/null +++ b/benches/layout.rs @@ -0,0 +1,57 @@ +use cheetah_string::CheetahString; +use std::env; +use std::fs; +use std::mem::{align_of, size_of}; +use std::path::PathBuf; + +fn target_dir() -> PathBuf { + env::var_os("CARGO_TARGET_DIR") + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("target")) +} + +fn layout_entry(name: &str) -> String { + format!( + r#"{{"type":"{}","size":{},"align":{}}}"#, + name, + size_of::(), + align_of::() + ) +} + +fn main() { + let layouts = [ + layout_entry::("CheetahString"), + layout_entry::>("Option"), + layout_entry::("String"), + layout_entry::>("Option"), + layout_entry::<&str>("&str"), + layout_entry::>("Option<&str>"), + layout_entry::>("Arc"), + layout_entry::>>("Option>"), + ]; + + let snapshot = format!( + concat!( + "{{\n", + " \"crate\":\"cheetah-string\",\n", + " \"profile\":\"bench\",\n", + " \"target_arch\":\"{}\",\n", + " \"target_os\":\"{}\",\n", + " \"pointer_width\":\"{}\",\n", + " \"layouts\":[\n {}\n ]\n", + "}}\n" + ), + env::consts::ARCH, + env::consts::OS, + std::mem::size_of::() * 8, + layouts.join(",\n ") + ); + + let artifact_dir = target_dir().join("layout-artifacts"); + fs::create_dir_all(&artifact_dir).expect("create layout artifact directory"); + fs::write(artifact_dir.join("layout-bench.json"), &snapshot) + .expect("write layout bench artifact"); + + println!("{snapshot}"); +} diff --git a/benches/mutation.rs b/benches/mutation.rs new file mode 100644 index 0000000..f70add9 --- /dev/null +++ b/benches/mutation.rs @@ -0,0 +1,83 @@ +use cheetah_string::CheetahString; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; + +fn bench_push_str(c: &mut Criterion) { + let mut group = c.benchmark_group("push_str"); + + group.bench_function("inline_in_place", |b| { + b.iter(|| { + let mut s = CheetahString::from("hello"); + s.push_str(black_box(" world")); + black_box(s) + }) + }); + + group.bench_function("owned_spare_capacity", |b| { + b.iter(|| { + let mut s = CheetahString::with_capacity(128); + s.push_str("hello"); + s.push_str(black_box(" world")); + black_box(s) + }) + }); + + group.bench_function("static_fallback", |b| { + b.iter(|| { + let mut s = CheetahString::from_static_str("hello"); + s.push_str(black_box(" world")); + black_box(s) + }) + }); + + group.finish(); +} + +fn bench_add(c: &mut Criterion) { + let mut group = c.benchmark_group("add"); + + for rhs_len in [1, 8, 32, 128] { + let rhs = "x".repeat(rhs_len); + + group.bench_with_input( + BenchmarkId::new("owned_capacity_str", rhs_len), + &rhs, + |b, rhs| { + b.iter(|| { + let mut s = CheetahString::with_capacity(256); + s.push_str("hello"); + black_box(s + black_box(rhs.as_str())) + }) + }, + ); + + group.bench_with_input(BenchmarkId::new("inline_str", rhs_len), &rhs, |b, rhs| { + b.iter(|| black_box(CheetahString::from("h") + black_box(rhs.as_str()))) + }); + } + + group.finish(); +} + +fn bench_reserve(c: &mut Criterion) { + let mut group = c.benchmark_group("reserve"); + + for additional in [0, 8, 128] { + group.bench_with_input( + BenchmarkId::from_parameter(additional), + &additional, + |b, extra| { + b.iter(|| { + let mut s = CheetahString::with_capacity(64); + s.push_str("hello"); + s.reserve(black_box(*extra)); + black_box(s) + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_push_str, bench_add, bench_reserve); +criterion_main!(benches); diff --git a/benches/pattern.rs b/benches/pattern.rs new file mode 100644 index 0000000..ef8a680 --- /dev/null +++ b/benches/pattern.rs @@ -0,0 +1,53 @@ +use cheetah_string::{CheetahFinder, CheetahString}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; + +fn bench_pathological_find(c: &mut Criterion) { + let mut group = c.benchmark_group("pathological_find"); + + for size in [128, 1024, 16 * 1024, 64 * 1024] { + let haystack = format!("{}b", "a".repeat(size)); + let cheetah = CheetahString::from(haystack.as_str()); + let needle = "aaaab"; + let finder = CheetahFinder::new(needle); + + group.throughput(Throughput::Bytes(haystack.len() as u64)); + + group.bench_with_input(BenchmarkId::new("cheetah_find", size), &size, |b, _| { + b.iter(|| black_box(&cheetah).find(black_box(needle))) + }); + + group.bench_with_input(BenchmarkId::new("finder", size), &size, |b, _| { + b.iter(|| finder.find_in(black_box(&cheetah))) + }); + + group.bench_with_input(BenchmarkId::new("std_find", size), &size, |b, _| { + b.iter(|| black_box(haystack.as_str()).find(black_box(needle))) + }); + } + + group.finish(); +} + +fn bench_single_byte_find(c: &mut Criterion) { + let mut group = c.benchmark_group("single_byte_find"); + + for size in [128, 1024, 16 * 1024, 64 * 1024] { + let haystack = format!("{}z", "a".repeat(size)); + let cheetah = CheetahString::from(haystack.as_str()); + + group.throughput(Throughput::Bytes(haystack.len() as u64)); + + group.bench_with_input(BenchmarkId::new("cheetah_find", size), &size, |b, _| { + b.iter(|| black_box(&cheetah).find(black_box("z"))) + }); + + group.bench_with_input(BenchmarkId::new("std_find", size), &size, |b, _| { + b.iter(|| black_box(haystack.as_str()).find(black_box("z"))) + }); + } + + group.finish(); +} + +criterion_group!(benches, bench_pathological_find, bench_single_byte_find); +criterion_main!(benches); diff --git a/scripts/bench-all.ps1 b/scripts/bench-all.ps1 new file mode 100644 index 0000000..2bbc772 --- /dev/null +++ b/scripts/bench-all.ps1 @@ -0,0 +1,22 @@ +$Version = if ($args.Count -gt 0) { $args[0] } else { "current" } +$ResultDir = Join-Path "bench-results" $Version + +New-Item -ItemType Directory -Force -Path $ResultDir | Out-Null + +cargo test layout_snapshot --all-features -- --nocapture | + Tee-Object -FilePath (Join-Path $ResultDir "layout-test.txt") + +cargo bench --bench layout | + Tee-Object -FilePath (Join-Path $ResultDir "layout-bench.txt") + +cargo bench --bench comprehensive | + Tee-Object -FilePath (Join-Path $ResultDir "comprehensive.txt") + +cargo bench --bench mutation | + Tee-Object -FilePath (Join-Path $ResultDir "mutation.txt") + +cargo bench --bench pattern | + Tee-Object -FilePath (Join-Path $ResultDir "pattern.txt") + +cargo bench --bench simd --features simd | + Tee-Object -FilePath (Join-Path $ResultDir "simd.txt") diff --git a/scripts/bench-all.sh b/scripts/bench-all.sh new file mode 100755 index 0000000..062efe8 --- /dev/null +++ b/scripts/bench-all.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env sh +set -eu + +VERSION="${1:-current}" +RESULT_DIR="bench-results/${VERSION}" + +mkdir -p "$RESULT_DIR" + +cargo test layout_snapshot --all-features -- --nocapture | tee "$RESULT_DIR/layout-test.txt" +cargo bench --bench layout | tee "$RESULT_DIR/layout-bench.txt" +cargo bench --bench comprehensive | tee "$RESULT_DIR/comprehensive.txt" +cargo bench --bench mutation | tee "$RESULT_DIR/mutation.txt" +cargo bench --bench pattern | tee "$RESULT_DIR/pattern.txt" +cargo bench --bench simd --features simd | tee "$RESULT_DIR/simd.txt" diff --git a/src/cheetah_string.rs b/src/cheetah_string.rs index 9052254..79d9890 100644 --- a/src/cheetah_string.rs +++ b/src/cheetah_string.rs @@ -47,19 +47,12 @@ impl<'a> From<&'a str> for CheetahString { } } -/// # Safety Warning -/// -/// This implementation uses `unsafe` code and may cause undefined behavior -/// if the bytes are not valid UTF-8. Consider using `CheetahString::try_from_bytes()` -/// for safe UTF-8 validation. -/// -/// This implementation will be deprecated in a future version. -impl From<&[u8]> for CheetahString { +impl<'a> TryFrom<&'a [u8]> for CheetahString { + type Error = Utf8Error; + #[inline] - fn from(b: &[u8]) -> Self { - // SAFETY: This is unsafe and may cause UB if bytes are not valid UTF-8. - // This will be deprecated in favor of try_from_bytes in the next version. - CheetahString::from_slice(unsafe { str::from_utf8_unchecked(b) }) + fn try_from(b: &'a [u8]) -> Result { + CheetahString::try_from_bytes(b) } } @@ -71,19 +64,12 @@ impl FromStr for CheetahString { } } -/// # Safety Warning -/// -/// This implementation uses `unsafe` code and may cause undefined behavior -/// if the bytes are not valid UTF-8. Consider using `CheetahString::try_from_vec()` -/// for safe UTF-8 validation. -/// -/// This implementation will be deprecated in a future version. -impl From> for CheetahString { +impl TryFrom> for CheetahString { + type Error = Utf8Error; + #[inline] - fn from(v: Vec) -> Self { - // SAFETY: This constructor does not validate UTF-8 and may cause UB - // if the bytes are later observed as a string. - CheetahString::from_vec(v) + fn try_from(v: Vec) -> Result { + CheetahString::try_from_vec(v) } } @@ -159,10 +145,12 @@ impl<'a> FromIterator<&'a String> for CheetahString { } #[cfg(feature = "bytes")] -impl From for CheetahString { +impl TryFrom for CheetahString { + type Error = Utf8Error; + #[inline] - fn from(b: bytes::Bytes) -> Self { - CheetahString::from_bytes(b) + fn try_from(b: bytes::Bytes) -> Result { + CheetahString::try_from_bytes_buf(b) } } @@ -184,35 +172,14 @@ impl From for String { unsafe { String::from_utf8_unchecked(data[..len as usize].to_vec()) } } CheetahString { - inner: InnerString::StaticStr(s), + inner: InnerString::Static(s), } => s.to_string(), CheetahString { - inner: InnerString::ArcStr(s), + inner: InnerString::Shared(s), } => s.to_string(), CheetahString { inner: InnerString::Owned(s), } => s, - CheetahString { - inner: InnerString::ArcString(s), - } => match Arc::try_unwrap(s) { - Ok(s) => s, - Err(s) => s.as_ref().clone(), - }, - CheetahString { - inner: InnerString::ArcVecString(s), - } => match Arc::try_unwrap(s) { - // SAFETY: ArcVecString should only be created from valid UTF-8 sources. - Ok(s) => unsafe { String::from_utf8_unchecked(s) }, - // SAFETY: ArcVecString should only be created from valid UTF-8 sources. - Err(s) => unsafe { String::from_utf8_unchecked(s.as_ref().clone()) }, - }, - #[cfg(feature = "bytes")] - CheetahString { - inner: InnerString::Bytes(b), - } => { - // SAFETY: Bytes variant should only be created from valid UTF-8 sources - unsafe { String::from_utf8_unchecked(b.to_vec()) } - } } } } @@ -273,12 +240,33 @@ impl CheetahString { #[inline] pub const fn from_static_str(s: &'static str) -> Self { CheetahString { - inner: InnerString::StaticStr(s), + inner: InnerString::Static(s), } } - #[inline] + #[deprecated( + since = "1.1.0", + note = "use try_from_vec for checked construction or from_utf8_unchecked_vec for an explicit unsafe constructor" + )] pub fn from_vec(s: Vec) -> Self { + CheetahString::try_from_vec(s).expect( + "CheetahString::from_vec requires valid UTF-8; use try_from_vec for fallible construction", + ) + } + + /// Creates a `CheetahString` from a byte vector without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `s` contains valid UTF-8 for the entire + /// lifetime of the returned `CheetahString`. + #[inline] + pub unsafe fn from_utf8_unchecked_vec(s: Vec) -> Self { + CheetahString::from_validated_vec_unchecked(s) + } + + #[inline] + fn from_validated_vec_unchecked(s: Vec) -> Self { if s.len() <= INLINE_CAPACITY { let mut data = [0u8; INLINE_CAPACITY]; data[..s.len()].copy_from_slice(&s); @@ -289,9 +277,8 @@ impl CheetahString { }, } } else { - CheetahString { - inner: InnerString::ArcVecString(Arc::new(s)), - } + // SAFETY: Callers validate UTF-8 before reaching this helper. + CheetahString::from_builder_string(unsafe { String::from_utf8_unchecked(s) }) } } @@ -314,9 +301,8 @@ impl CheetahString { /// assert!(CheetahString::try_from_vec(invalid).is_err()); /// ``` pub fn try_from_vec(v: Vec) -> Result { - // Validate UTF-8 str::from_utf8(&v)?; - Ok(CheetahString::from_vec(v)) + Ok(CheetahString::from_validated_vec_unchecked(v)) } /// Creates a `CheetahString` from a byte slice with UTF-8 validation. @@ -342,10 +328,62 @@ impl CheetahString { Ok(CheetahString::from_slice(s)) } + /// Creates a `CheetahString` from a byte slice without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `b` contains valid UTF-8. + #[inline] + pub unsafe fn from_utf8_unchecked_bytes(b: &[u8]) -> Self { + // SAFETY: The caller guarantees that `b` contains valid UTF-8. + CheetahString::from_slice(unsafe { str::from_utf8_unchecked(b) }) + } + + /// Creates a `CheetahString` from a shared byte vector with UTF-8 validation. + /// + /// # Errors + /// + /// Returns an error if the bytes are not valid UTF-8. + #[inline] + pub fn try_from_arc_vec(s: Arc>) -> Result { + match Arc::try_unwrap(s) { + Ok(v) => CheetahString::try_from_vec(v), + Err(s) => { + let s = str::from_utf8(s.as_slice())?; + Ok(CheetahString::from_slice(s)) + } + } + } + + #[deprecated( + since = "1.1.0", + note = "use try_from_arc_vec for checked construction or from_utf8_unchecked_arc_vec for an explicit unsafe constructor" + )] #[inline] pub fn from_arc_vec(s: Arc>) -> Self { - CheetahString { - inner: InnerString::ArcVecString(s), + CheetahString::try_from_arc_vec(s).expect( + "CheetahString::from_arc_vec requires valid UTF-8; use try_from_arc_vec for fallible construction", + ) + } + + /// Creates a `CheetahString` from a shared byte vector without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `s` contains valid UTF-8. + #[inline] + pub unsafe fn from_utf8_unchecked_arc_vec(s: Arc>) -> Self { + CheetahString::from_validated_arc_vec_unchecked(s) + } + + #[inline] + fn from_validated_arc_vec_unchecked(s: Arc>) -> Self { + match Arc::try_unwrap(s) { + Ok(v) => CheetahString::from_validated_vec_unchecked(v), + Err(s) => { + // SAFETY: Callers validate UTF-8 before reaching this helper. + unsafe { CheetahString::from_utf8_unchecked_bytes(s.as_slice()) } + } } } @@ -365,7 +403,7 @@ impl CheetahString { // Use Arc for long borrowed strings to avoid the extra String header. let arc_str: Arc = Arc::from(s); CheetahString { - inner: InnerString::ArcStr(arc_str), + inner: InnerString::Shared(arc_str), } } } @@ -386,7 +424,7 @@ impl CheetahString { // Use Arc for long strings to avoid double allocation let arc_str: Arc = s.into_boxed_str().into(); CheetahString { - inner: InnerString::ArcStr(arc_str), + inner: InnerString::Shared(arc_str), } } } @@ -411,17 +449,47 @@ impl CheetahString { #[inline] pub fn from_arc_string(s: Arc) -> Self { - CheetahString { - inner: InnerString::ArcString(s), + match Arc::try_unwrap(s) { + Ok(s) => CheetahString::from_builder_string(s), + Err(s) => CheetahString::from_slice(s.as_str()), } } #[inline] #[cfg(feature = "bytes")] + #[deprecated( + since = "1.1.0", + note = "use try_from_bytes_buf for checked construction or from_utf8_unchecked_bytes_buf for an explicit unsafe constructor" + )] pub fn from_bytes(b: bytes::Bytes) -> Self { - CheetahString { - inner: InnerString::Bytes(b), - } + CheetahString::try_from_bytes_buf(b).expect( + "CheetahString::from_bytes requires valid UTF-8; use try_from_bytes_buf for fallible construction", + ) + } + + #[inline] + #[cfg(feature = "bytes")] + pub fn try_from_bytes_buf(b: bytes::Bytes) -> Result { + str::from_utf8(b.as_ref())?; + Ok(CheetahString::from_validated_bytes_unchecked(b)) + } + + /// Creates a `CheetahString` from `bytes::Bytes` without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `b` contains valid UTF-8. + #[inline] + #[cfg(feature = "bytes")] + pub unsafe fn from_utf8_unchecked_bytes_buf(b: bytes::Bytes) -> Self { + CheetahString::from_validated_bytes_unchecked(b) + } + + #[inline] + #[cfg(feature = "bytes")] + fn from_validated_bytes_unchecked(b: bytes::Bytes) -> Self { + // SAFETY: Callers validate UTF-8 before reaching this helper. + unsafe { CheetahString::from_utf8_unchecked_bytes(b.as_ref()) } } #[inline] @@ -432,21 +500,9 @@ impl CheetahString { // The data is always valid UTF-8 up to len bytes. unsafe { str::from_utf8_unchecked(&data[..*len as usize]) } } - InnerString::StaticStr(s) => s, - InnerString::ArcStr(s) => s.as_ref(), + InnerString::Static(s) => s, + InnerString::Shared(s) => s.as_ref(), InnerString::Owned(s) => s.as_str(), - InnerString::ArcString(s) => s.as_str(), - InnerString::ArcVecString(s) => { - // SAFETY: ArcVecString is only created from validated UTF-8 sources. - // All constructors ensure this invariant is maintained. - unsafe { str::from_utf8_unchecked(s.as_ref()) } - } - #[cfg(feature = "bytes")] - InnerString::Bytes(b) => { - // SAFETY: Bytes variant is only created from validated UTF-8 sources. - // The from_bytes constructor ensures this invariant. - unsafe { str::from_utf8_unchecked(b.as_ref()) } - } } } @@ -454,13 +510,9 @@ impl CheetahString { pub fn as_bytes(&self) -> &[u8] { match &self.inner { InnerString::Inline { len, data } => &data[..*len as usize], - InnerString::StaticStr(s) => s.as_bytes(), - InnerString::ArcStr(s) => s.as_bytes(), + InnerString::Static(s) => s.as_bytes(), + InnerString::Shared(s) => s.as_bytes(), InnerString::Owned(s) => s.as_bytes(), - InnerString::ArcString(s) => s.as_bytes(), - InnerString::ArcVecString(s) => s.as_ref(), - #[cfg(feature = "bytes")] - InnerString::Bytes(b) => b.as_ref(), } } @@ -468,13 +520,9 @@ impl CheetahString { pub fn len(&self) -> usize { match &self.inner { InnerString::Inline { len, .. } => *len as usize, - InnerString::StaticStr(s) => s.len(), - InnerString::ArcStr(s) => s.len(), + InnerString::Static(s) => s.len(), + InnerString::Shared(s) => s.len(), InnerString::Owned(s) => s.len(), - InnerString::ArcString(s) => s.len(), - InnerString::ArcVecString(s) => s.len(), - #[cfg(feature = "bytes")] - InnerString::Bytes(b) => b.len(), } } @@ -482,13 +530,9 @@ impl CheetahString { pub fn is_empty(&self) -> bool { match &self.inner { InnerString::Inline { len, .. } => *len == 0, - InnerString::StaticStr(s) => s.is_empty(), - InnerString::ArcStr(s) => s.is_empty(), + InnerString::Static(s) => s.is_empty(), + InnerString::Shared(s) => s.is_empty(), InnerString::Owned(s) => s.is_empty(), - InnerString::ArcString(s) => s.is_empty(), - InnerString::ArcVecString(s) => s.is_empty(), - #[cfg(feature = "bytes")] - InnerString::Bytes(b) => b.is_empty(), } } @@ -610,14 +654,7 @@ impl CheetahString { match pat.as_str_pattern() { StrPatternImpl::Char(c) => self.as_str().contains(c), StrPatternImpl::Str(s) => { - #[cfg(all(feature = "simd", target_arch = "x86_64"))] - { - crate::simd::find_bytes(self.as_bytes(), s.as_bytes()).is_some() - } - #[cfg(not(all(feature = "simd", target_arch = "x86_64")))] - { - self.as_str().contains(s) - } + crate::search::find_bytes(self.as_bytes(), s.as_bytes()).is_some() } } } @@ -655,14 +692,7 @@ impl CheetahString { #[inline] pub fn find>(&self, pat: P) -> Option { let pat = pat.as_ref(); - #[cfg(all(feature = "simd", target_arch = "x86_64"))] - { - crate::simd::find_bytes(self.as_bytes(), pat.as_bytes()) - } - #[cfg(not(all(feature = "simd", target_arch = "x86_64")))] - { - self.as_str().find(pat) - } + crate::search::find_bytes(self.as_bytes(), pat.as_bytes()) } /// Returns the byte index of the last occurrence of the pattern, or `None` if not found. @@ -677,7 +707,7 @@ impl CheetahString { /// ``` #[inline] pub fn rfind>(&self, pat: P) -> Option { - self.as_str().rfind(pat.as_ref()) + crate::search::rfind_bytes(self.as_bytes(), pat.as_ref().as_bytes()) } /// Returns a string slice with leading and trailing whitespace removed. @@ -905,6 +935,10 @@ impl CheetahString { #[inline] fn push_str_internal(&mut self, string: &str) { + if string.is_empty() { + return; + } + match &mut self.inner { InnerString::Inline { len, data } => { let total_len = *len as usize + string.len(); @@ -918,12 +952,6 @@ impl CheetahString { s.push_str(string); return; } - InnerString::ArcString(arc) => { - if let Some(s) = Arc::get_mut(arc) { - s.push_str(string); - return; - } - } _ => {} } @@ -938,7 +966,7 @@ impl CheetahString { /// /// This method is optimized for incremental building and will: /// - Mutate inline storage when possible - /// - Mutate unique Arc in-place when available + /// - Mutate owned heap storage in-place when capacity allows /// - Only allocate when necessary /// /// # Examples @@ -972,6 +1000,10 @@ impl CheetahString { /// ``` #[inline] pub fn reserve(&mut self, additional: usize) { + if additional == 0 { + return; + } + match &mut self.inner { InnerString::Inline { len, .. } if *len as usize + additional <= INLINE_CAPACITY => { return; @@ -981,12 +1013,6 @@ impl CheetahString { s.reserve(additional); return; } - InnerString::ArcString(arc) => { - if let Some(s) = Arc::get_mut(arc) { - s.reserve(additional); - return; - } - } _ => {} } @@ -1135,28 +1161,9 @@ impl Add<&str> for CheetahString { /// assert_eq!(result, "Hello World"); /// ``` #[inline] - fn add(self, rhs: &str) -> Self::Output { - let total_len = self.len() + rhs.len(); - - // Fast path: result fits in inline storage - if total_len <= INLINE_CAPACITY { - let mut data = [0u8; INLINE_CAPACITY]; - let self_bytes = self.as_bytes(); - data[..self_bytes.len()].copy_from_slice(self_bytes); - data[self_bytes.len()..total_len].copy_from_slice(rhs.as_bytes()); - return CheetahString { - inner: InnerString::Inline { - len: total_len as u8, - data, - }, - }; - } - - // Slow path: allocate for long result - let mut result = String::with_capacity(total_len); - result.push_str(self.as_str()); - result.push_str(rhs); - CheetahString::from_string(result) + fn add(mut self, rhs: &str) -> Self::Output { + self.push_str_internal(rhs); + self } } @@ -1176,28 +1183,9 @@ impl Add<&CheetahString> for CheetahString { /// assert_eq!(result, "Hello World"); /// ``` #[inline] - fn add(self, rhs: &CheetahString) -> Self::Output { - let total_len = self.len() + rhs.len(); - - // Fast path: result fits in inline storage - if total_len <= INLINE_CAPACITY { - let mut data = [0u8; INLINE_CAPACITY]; - let self_bytes = self.as_bytes(); - data[..self_bytes.len()].copy_from_slice(self_bytes); - data[self_bytes.len()..total_len].copy_from_slice(rhs.as_bytes()); - return CheetahString { - inner: InnerString::Inline { - len: total_len as u8, - data, - }, - }; - } - - // Slow path: allocate for long result - let mut result = String::with_capacity(total_len); - result.push_str(self.as_str()); - result.push_str(rhs.as_str()); - CheetahString::from_string(result) + fn add(mut self, rhs: &CheetahString) -> Self::Output { + self.push_str_internal(rhs.as_str()); + self } } @@ -1216,28 +1204,13 @@ impl Add for CheetahString { /// assert_eq!(result, "Hello World"); /// ``` #[inline] - fn add(self, rhs: String) -> Self::Output { - let total_len = self.len() + rhs.len(); - - // Fast path: result fits in inline storage - if total_len <= INLINE_CAPACITY { - let mut data = [0u8; INLINE_CAPACITY]; - let self_bytes = self.as_bytes(); - data[..self_bytes.len()].copy_from_slice(self_bytes); - data[self_bytes.len()..total_len].copy_from_slice(rhs.as_bytes()); - return CheetahString { - inner: InnerString::Inline { - len: total_len as u8, - data, - }, - }; + fn add(mut self, rhs: String) -> Self::Output { + if self.is_empty() { + return CheetahString::from_string(rhs); } - // Slow path: allocate for long result - let mut result = String::with_capacity(total_len); - result.push_str(self.as_str()); - result.push_str(&rhs); - CheetahString::from_string(result) + self.push_str_internal(&rhs); + self } } @@ -1288,12 +1261,9 @@ const INLINE_CAPACITY: usize = 23; /// Variants: /// /// * `Inline` - Inline storage for strings <= 23 bytes (zero heap allocations). -/// * `StaticStr(&'static str)` - A static string slice (zero heap allocations). -/// * `ArcStr(Arc)` - A reference-counted string slice (single heap allocation, optimized). +/// * `Static(&'static str)` - A static string slice (zero heap allocations). +/// * `Shared(Arc)` - A reference-counted string slice (single heap allocation, optimized). /// * `Owned(String)` - An owned heap string used for builder-style mutation. -/// * `ArcString(Arc)` - A reference-counted string (for backwards compatibility). -/// * `ArcVecString(Arc>)` - A reference-counted byte vector. -/// * `Bytes(bytes::Bytes)` - A byte buffer (available when the "bytes" feature is enabled). #[derive(Clone)] pub(super) enum InnerString { /// Inline storage for short strings (up to 23 bytes). @@ -1303,20 +1273,12 @@ pub(super) enum InnerString { data: [u8; INLINE_CAPACITY], }, /// Static string slice with 'static lifetime. - StaticStr(&'static str), + Static(&'static str), /// Reference-counted string slice (single heap allocation). /// Preferred for long immutable strings created from owned or borrowed data. - ArcStr(Arc), + Shared(Arc), /// Owned heap-allocated string used when exclusive mutability matters. Owned(String), - /// Reference-counted heap-allocated string. - /// Kept for backwards compatibility and when Arc is explicitly provided. - ArcString(Arc), - /// Reference-counted heap-allocated byte vector. - ArcVecString(Arc>), - /// Bytes type integration (requires "bytes" feature). - #[cfg(feature = "bytes")] - Bytes(bytes::Bytes), } // Sealed trait pattern to support both &str and char in starts_with/ends_with/contains @@ -1455,6 +1417,7 @@ impl<'a> DoubleEndedIterator for SplitWrapper<'a> { #[cfg(test)] mod tests { use super::*; + use alloc::{format, vec}; #[test] fn with_capacity_above_inline_uses_heap_storage() { @@ -1492,14 +1455,14 @@ mod tests { } #[test] - fn long_borrowed_str_uses_arc_str_storage() { + fn long_borrowed_str_uses_shared_storage() { let value = "a".repeat(INLINE_CAPACITY + 1); let s = CheetahString::from_slice(&value); match &s.inner { - InnerString::ArcStr(inner) => assert_eq!(inner.as_ref(), value.as_str()), + InnerString::Shared(inner) => assert_eq!(inner.as_ref(), value.as_str()), other => panic!( - "expected ArcStr for long borrowed input, got {:?}", + "expected Shared for long borrowed input, got {:?}", core::mem::discriminant(other) ), } @@ -1522,17 +1485,17 @@ mod tests { } #[test] - fn long_vec_conversion_uses_arc_vec_storage() { + fn long_vec_conversion_uses_owned_storage() { let value = "a".repeat(INLINE_CAPACITY + 1).into_bytes(); - let s = CheetahString::from(value); + let s = CheetahString::try_from_vec(value).expect("valid utf-8"); match &s.inner { - InnerString::ArcVecString(inner) => { + InnerString::Owned(inner) => { assert_eq!(inner.len(), INLINE_CAPACITY + 1); - assert_eq!(inner.as_slice(), vec![b'a'; INLINE_CAPACITY + 1].as_slice()); + assert_eq!(inner.as_bytes(), vec![b'a'; INLINE_CAPACITY + 1].as_slice()); } other => panic!( - "expected ArcVecString for long Vec conversion, got {:?}", + "expected Owned for long Vec conversion, got {:?}", core::mem::discriminant(other) ), } diff --git a/src/lib.rs b/src/lib.rs index 5d030ba..76eead9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,15 +4,15 @@ //! It is usable in both `std` and `no_std` environments. Additionally, CheetahString supports serde for serialization and deserialization. //! CheetahString also supports the `bytes` feature, allowing conversion to the `bytes::Bytes` type. //! It minimizes allocations across small, shared, and builder-oriented string workloads. +//! Substring search uses `memchr`/`memmem` by default. //! //! # SIMD Acceleration //! //! When compiled with the `simd` feature flag, CheetahString uses SIMD (Single Instruction, Multiple Data) -//! instructions to accelerate string matching operations on x86_64 platforms with SSE2 support. +//! instructions to accelerate selected byte comparisons on x86_64 platforms with SSE2 support. //! SIMD acceleration is applied to: //! - `starts_with()` - Pattern prefix matching //! - `ends_with()` - Pattern suffix matching -//! - `contains()` / `find()` - Substring search //! - Equality comparisons (`==`, `!=`) //! //! The implementation automatically uses SIMD for strings >= 16 bytes and falls back to scalar operations @@ -21,7 +21,7 @@ //! To enable SIMD acceleration: //! ```toml //! [dependencies] -//! cheetah-string = { version = "1.0.0", features = ["simd"] } +//! cheetah-string = { version = "1.1.0", features = ["simd"] } //! ``` //! //! # Examples @@ -40,13 +40,13 @@ //! //! ``` //! -//! Using SIMD-accelerated operations (when `simd` feature is enabled): +//! Using accelerated search operations: //! ```rust //! use cheetah_string::CheetahString; //! //! let url = CheetahString::from("https://api.example.com/v1/users"); //! -//! // These operations use SIMD when the pattern is >= 16 bytes +//! // Substring search uses memchr/memmem by default. //! if url.starts_with("https://") { //! println!("Secure connection"); //! } @@ -60,6 +60,7 @@ extern crate alloc; mod cheetah_string; mod error; +mod search; #[cfg(feature = "serde")] mod serde; @@ -69,3 +70,4 @@ mod simd; pub use cheetah_string::{CheetahString, SplitPattern, SplitStr, SplitWrapper, StrPattern}; pub use error::{Error, Result}; +pub use search::CheetahFinder; diff --git a/src/search.rs b/src/search.rs new file mode 100644 index 0000000..b5b52b9 --- /dev/null +++ b/src/search.rs @@ -0,0 +1,70 @@ +pub(crate) fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + return Some(0); + } + + if needle.len() == 1 { + return memchr::memchr(needle[0], haystack); + } + + memchr::memmem::find(haystack, needle) +} + +pub(crate) fn rfind_bytes(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + return Some(haystack.len()); + } + + if needle.len() == 1 { + return memchr::memrchr(needle[0], haystack); + } + + memchr::memmem::rfind(haystack, needle) +} + +/// Reusable substring finder for repeated searches with the same needle. +pub struct CheetahFinder<'a> { + needle: &'a str, + finder: Option>, +} + +impl<'a> CheetahFinder<'a> { + #[inline] + pub fn new(needle: &'a str) -> Self { + let finder = (needle.len() > 1).then(|| memchr::memmem::Finder::new(needle.as_bytes())); + Self { needle, finder } + } + + #[inline] + pub fn needle(&self) -> &'a str { + self.needle + } + + #[inline] + pub fn find_in(&self, haystack: &S) -> Option + where + S: AsRef + ?Sized, + { + let haystack = haystack.as_ref().as_bytes(); + + if self.needle.is_empty() { + return Some(0); + } + + if self.needle.len() == 1 { + return memchr::memchr(self.needle.as_bytes()[0], haystack); + } + + self.finder + .as_ref() + .and_then(|finder| finder.find(haystack)) + } + + #[inline] + pub fn is_match(&self, haystack: &S) -> bool + where + S: AsRef + ?Sized, + { + self.find_in(haystack).is_some() + } +} diff --git a/src/serde.rs b/src/serde.rs index 0eef167..f0d448d 100644 --- a/src/serde.rs +++ b/src/serde.rs @@ -1,10 +1,9 @@ -use crate::cheetah_string::InnerString; use crate::CheetahString; use alloc::string::String; use alloc::vec::Vec; use core::fmt; use core::str; -use serde::de::{Error, Unexpected, Visitor}; +use serde::de::{Error, Visitor}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; impl Serialize for CheetahString { @@ -12,20 +11,7 @@ impl Serialize for CheetahString { where S: Serializer, { - match &self.inner { - InnerString::Inline { len, data } => { - // Safety: InnerString::Inline guarantees that data[0..len] is valid UTF-8 - let s = unsafe { str::from_utf8_unchecked(&data[..*len as usize]) }; - serializer.serialize_str(s) - } - InnerString::StaticStr(s) => serializer.serialize_str(s), - InnerString::ArcStr(s) => serializer.serialize_str(s.as_ref()), - InnerString::Owned(s) => serializer.serialize_str(s.as_str()), - InnerString::ArcString(s) => serializer.serialize_str(s.as_str()), - InnerString::ArcVecString(s) => serializer.serialize_bytes(s), - #[cfg(feature = "bytes")] - InnerString::Bytes(bytes) => serializer.serialize_bytes(bytes.as_ref()), - } + serializer.serialize_str(self.as_str()) } } @@ -67,27 +53,25 @@ where where E: Error, { - Ok(CheetahString::from(v)) + str::from_utf8(v) + .map(CheetahString::from_slice) + .map_err(Error::custom) } fn visit_borrowed_bytes(self, v: &'a [u8]) -> Result where E: Error, { - Ok(CheetahString::from(v)) + str::from_utf8(v) + .map(CheetahString::from_slice) + .map_err(Error::custom) } fn visit_byte_buf(self, v: Vec) -> Result where E: Error, { - match String::from_utf8(v) { - Ok(s) => Ok(CheetahString::from_string(s)), - Err(e) => Err(Error::invalid_value( - Unexpected::Bytes(&e.into_bytes()), - &self, - )), - } + CheetahString::try_from_vec(v).map_err(Error::custom) } } deserializer.deserialize_str(CheetahStringVisitor) diff --git a/src/simd.rs b/src/simd.rs index 175f491..9b4d32f 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -88,6 +88,7 @@ pub(crate) fn ends_with_bytes(haystack: &[u8], needle: &[u8]) -> bool { } /// Find the first occurrence of needle in haystack using SIMD when available +#[allow(dead_code)] #[inline] pub(crate) fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option { if needle.is_empty() { @@ -121,6 +122,7 @@ pub(crate) fn find_bytes(haystack: &[u8], needle: &[u8]) -> Option { .position(|window| window == needle) } +#[allow(dead_code)] #[inline] fn find_first_byte(haystack: &[u8], needle: u8) -> Option { #[cfg(all(feature = "simd", target_arch = "x86_64"))] @@ -133,6 +135,7 @@ fn find_first_byte(haystack: &[u8], needle: u8) -> Option { haystack.iter().position(|&b| b == needle) } +#[allow(dead_code)] #[inline] fn find_short_bytes(haystack: &[u8], needle: &[u8]) -> Option { debug_assert!(needle.len() > 1 && needle.len() < SIMD_THRESHOLD); @@ -191,6 +194,7 @@ unsafe fn eq_bytes_sse2(a: &[u8], b: &[u8]) -> bool { } #[cfg(all(feature = "simd", target_arch = "x86_64"))] +#[allow(dead_code)] #[target_feature(enable = "sse2")] #[inline] unsafe fn find_bytes_sse2(haystack: &[u8], needle: &[u8]) -> Option { @@ -213,18 +217,15 @@ unsafe fn find_bytes_sse2(haystack: &[u8], needle: &[u8]) -> Option { while pos + needle_len <= haystack_len { // Find the next occurrence of the first byte - if let Some(offset) = find_byte_sse2(&haystack[pos..], first_byte) { - let candidate_pos = pos + offset; - - // Check if the rest matches - if candidate_pos + needle_len <= haystack_len { - if eq_bytes_sse2(&haystack[candidate_pos..candidate_pos + needle_len], needle) { - return Some(candidate_pos); - } - pos = candidate_pos + 1; - } else { - return None; + let offset = find_byte_sse2(&haystack[pos..], first_byte)?; + let candidate_pos = pos + offset; + + // Check if the rest matches + if candidate_pos + needle_len <= haystack_len { + if eq_bytes_sse2(&haystack[candidate_pos..candidate_pos + needle_len], needle) { + return Some(candidate_pos); } + pos = candidate_pos + 1; } else { return None; } @@ -234,6 +235,7 @@ unsafe fn find_bytes_sse2(haystack: &[u8], needle: &[u8]) -> Option { } #[cfg(all(feature = "simd", target_arch = "x86_64"))] +#[allow(dead_code)] #[target_feature(enable = "sse2")] #[inline] unsafe fn find_byte_sse2(haystack: &[u8], needle: u8) -> Option { diff --git a/tests/basic.rs b/tests/basic.rs index af3cd9c..7f610b9 100644 --- a/tests/basic.rs +++ b/tests/basic.rs @@ -171,7 +171,7 @@ fn test_into_string_reuses_unique_arc_string_buffer() { } #[test] -fn test_into_string_clones_shared_arc_string_buffer() { +fn test_into_string_reuses_owned_arc_string_buffer_after_clone() { let value = "a".repeat(64); let arc = Arc::new(value); let original_ptr = arc.as_bytes().as_ptr(); @@ -180,31 +180,46 @@ fn test_into_string_clones_shared_arc_string_buffer() { let shared = s.clone(); let owned: String = s.into(); - assert_ne!(owned.as_bytes().as_ptr(), original_ptr); + assert_eq!(owned.as_bytes().as_ptr(), original_ptr); + assert_ne!(shared.as_bytes().as_ptr(), original_ptr); assert_eq!(shared.as_str(), "a".repeat(64)); } +#[test] +fn test_into_string_copies_shared_arc_string_input() { + let value = "a".repeat(64); + let arc = Arc::new(value); + let original_ptr = arc.as_bytes().as_ptr(); + let _held = Arc::clone(&arc); + + let s = CheetahString::from(arc); + let owned: String = s.into(); + + assert_ne!(owned.as_bytes().as_ptr(), original_ptr); +} + #[test] fn test_into_string_reuses_unique_vec_buffer() { let bytes = vec![b'a'; 64]; let original_ptr = bytes.as_ptr(); - let s = CheetahString::from(bytes); + let s = CheetahString::try_from_vec(bytes).unwrap(); let owned: String = s.into(); assert_eq!(owned.as_bytes().as_ptr(), original_ptr); } #[test] -fn test_into_string_clones_shared_vec_buffer() { +fn test_into_string_reuses_owned_vec_buffer_after_clone() { let bytes = vec![b'a'; 64]; let original_ptr = bytes.as_ptr(); - let s = CheetahString::from(bytes); + let s = CheetahString::try_from_vec(bytes).unwrap(); let shared = s.clone(); let owned: String = s.into(); - assert_ne!(owned.as_bytes().as_ptr(), original_ptr); + assert_eq!(owned.as_bytes().as_ptr(), original_ptr); + assert_ne!(shared.as_bytes().as_ptr(), original_ptr); assert_eq!(shared.as_str(), "a".repeat(64)); } @@ -281,6 +296,26 @@ fn test_try_from_vec_method() { assert!(CheetahString::try_from_vec(invalid).is_err()); } +#[test] +fn test_try_from_slice_trait() { + let bytes: &[u8] = b"hello"; + let s = CheetahString::try_from(bytes).unwrap(); + assert_eq!(s, "hello"); + + let invalid: &[u8] = &[0xFF, 0xFE]; + assert!(CheetahString::try_from(invalid).is_err()); +} + +#[test] +fn test_try_from_vec_trait() { + let bytes = vec![104, 101, 108, 108, 111]; + let s = CheetahString::try_from(bytes).unwrap(); + assert_eq!(s, "hello"); + + let invalid = vec![0xFF, 0xFE]; + assert!(CheetahString::try_from(invalid).is_err()); +} + #[test] fn test_unicode() { let s = CheetahString::from("\u{00E9}\u{00E7}\u{00F1}\u{00FC}"); // accented chars @@ -341,6 +376,9 @@ fn test_from_bytes_feature() { use bytes::Bytes; let bytes = Bytes::from("hello"); - let s = CheetahString::from(bytes); + let s = CheetahString::try_from(bytes).unwrap(); assert_eq!(s, "hello"); + + let invalid = Bytes::from_static(&[0xFF, 0xFE]); + assert!(CheetahString::try_from(invalid).is_err()); } diff --git a/tests/layout_snapshot.rs b/tests/layout_snapshot.rs new file mode 100644 index 0000000..a1ba971 --- /dev/null +++ b/tests/layout_snapshot.rs @@ -0,0 +1,62 @@ +use cheetah_string::CheetahString; +use std::env; +use std::fs; +use std::mem::{align_of, size_of}; +use std::path::PathBuf; + +fn target_dir() -> PathBuf { + env::var_os("CARGO_TARGET_DIR") + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("target")) +} + +fn layout_entry(name: &str) -> String { + format!( + r#"{{"type":"{}","size":{},"align":{}}}"#, + name, + size_of::(), + align_of::() + ) +} + +#[test] +fn layout_snapshot() { + let layouts = [ + layout_entry::("CheetahString"), + layout_entry::>("Option"), + layout_entry::("String"), + layout_entry::>("Option"), + layout_entry::<&str>("&str"), + layout_entry::>("Option<&str>"), + layout_entry::>("Arc"), + layout_entry::>>("Option>"), + ]; + + let snapshot = format!( + concat!( + "{{\n", + " \"crate\":\"cheetah-string\",\n", + " \"profile\":\"test\",\n", + " \"target_arch\":\"{}\",\n", + " \"target_os\":\"{}\",\n", + " \"pointer_width\":\"{}\",\n", + " \"layouts\":[\n {}\n ]\n", + "}}\n" + ), + env::consts::ARCH, + env::consts::OS, + std::mem::size_of::() * 8, + layouts.join(",\n ") + ); + + let artifact_dir = target_dir().join("layout-artifacts"); + fs::create_dir_all(&artifact_dir).expect("create layout artifact directory"); + fs::write(artifact_dir.join("layout-snapshot.json"), &snapshot) + .expect("write layout snapshot artifact"); + + println!("{snapshot}"); + + assert!(size_of::() >= size_of::()); + assert!(align_of::() >= align_of::()); + assert!(size_of::>() >= size_of::()); +} diff --git a/tests/mutation.rs b/tests/mutation.rs new file mode 100644 index 0000000..2e3ab77 --- /dev/null +++ b/tests/mutation.rs @@ -0,0 +1,60 @@ +use cheetah_string::CheetahString; + +#[test] +fn inline_push_str_appends_in_place() { + let mut s = CheetahString::from("hello"); + let before = s.as_bytes().as_ptr(); + + s.push_str(" world"); + + assert_eq!(s, "hello world"); + assert_eq!(s.as_bytes().as_ptr(), before); +} + +#[test] +fn owned_push_str_reuses_spare_capacity() { + let mut s = CheetahString::with_capacity(128); + s.push_str("hello"); + let before = s.as_bytes().as_ptr(); + + s.push_str(" world"); + + assert_eq!(s, "hello world"); + assert_eq!(s.as_bytes().as_ptr(), before); +} + +#[test] +fn add_reuses_owned_lhs_capacity() { + let mut s = CheetahString::with_capacity(128); + s.push_str("hello"); + let before = s.as_bytes().as_ptr(); + + let s = s + " world"; + + assert_eq!(s, "hello world"); + assert_eq!(s.as_bytes().as_ptr(), before); +} + +#[test] +fn reserve_zero_keeps_existing_buffer() { + let mut s = CheetahString::with_capacity(128); + s.push_str("hello"); + let before = s.as_bytes().as_ptr(); + + s.reserve(0); + + assert_eq!(s, "hello"); + assert_eq!(s.as_bytes().as_ptr(), before); +} + +#[test] +fn add_assign_uses_push_str_path() { + let mut s = CheetahString::with_capacity(128); + s.push_str("hello"); + let before = s.as_bytes().as_ptr(); + + s += " world"; + + assert_eq!(s, "hello world"); + assert_eq!(s.as_bytes().as_ptr(), before); +} diff --git a/tests/search.rs b/tests/search.rs new file mode 100644 index 0000000..08b57ea --- /dev/null +++ b/tests/search.rs @@ -0,0 +1,50 @@ +use cheetah_string::{CheetahFinder, CheetahString}; + +#[test] +fn empty_needle_matches_str_find_semantics() { + let s = CheetahString::from("hello"); + + assert_eq!(s.find(""), "hello".find("")); + assert_eq!(s.rfind(""), "hello".rfind("")); + assert!(s.contains("")); +} + +#[test] +fn memmem_search_reports_byte_indices() { + let s = CheetahString::from("cafe cafe"); + + assert_eq!(s.find("fe"), Some(2)); + assert_eq!(s.rfind("fe"), Some(7)); + assert_eq!(s.find("missing"), None); +} + +#[test] +fn unicode_search_matches_str_indices() { + let s = CheetahString::from("éxé"); + + assert_eq!(s.find("é"), "éxé".find("é")); + assert_eq!(s.rfind("é"), "éxé".rfind("é")); + assert_eq!(s.find("xé"), "éxé".find("xé")); +} + +#[test] +fn reusable_finder_matches_repeated_needle() { + let finder = CheetahFinder::new("route"); + let first = CheetahString::from("topic.route.alpha"); + let second = CheetahString::from("topic.name.beta"); + + assert_eq!(finder.needle(), "route"); + assert_eq!(finder.find_in(&first), Some(6)); + assert_eq!(finder.find_in(&second), None); + assert!(finder.is_match(&first)); + assert!(!finder.is_match(&second)); +} + +#[test] +fn reusable_empty_finder_matches_start() { + let finder = CheetahFinder::new(""); + let s = CheetahString::from("payload"); + + assert_eq!(finder.find_in(&s), Some(0)); + assert!(finder.is_match(&s)); +}