From 91d058f290696cd6f0b52526751b427ac4c0e7f8 Mon Sep 17 00:00:00 2001 From: mxsm Date: Sat, 20 Jun 2026 09:52:02 +0800 Subject: [PATCH 1/2] Feat: harden string invariants and release automation --- .github/workflows/release.yml | 110 ++++++++++++++++++++++++++ Cargo.toml | 8 +- README.md | 4 +- benches/comprehensive.rs | 8 +- src/cheetah_string.rs | 144 ++++++++++++++++++++++++++-------- src/lib.rs | 2 +- src/serde.rs | 34 +++----- tests/basic.rs | 29 ++++++- 8 files changed, 269 insertions(+), 70 deletions(-) create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..d3f5beb --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,110 @@ +name: Release + +on: + push: + tags: + - "v*.*.*" + workflow_dispatch: + inputs: + version: + description: "Version to release, for example 1.1.0" + required: true + type: string + +permissions: + contents: write + +env: + CARGO_TERM_COLOR: always + RUST_BACKTRACE: full + +jobs: + publish: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Rust + uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + + - name: Resolve release version + id: version + shell: bash + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + VERSION="${{ inputs.version }}" + else + VERSION="${GITHUB_REF_NAME#v}" + fi + + if [[ ! "$VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+([+-][0-9A-Za-z.-]+)?$ ]]; then + echo "Invalid release version: $VERSION" >&2 + exit 1 + fi + + MANIFEST_VERSION="$(cargo metadata --no-deps --format-version 1 | python3 -c 'import json, sys; print(json.load(sys.stdin)["packages"][0]["version"])')" + if [[ "$MANIFEST_VERSION" != "$VERSION" ]]; then + echo "Cargo.toml version $MANIFEST_VERSION does not match release version $VERSION" >&2 + exit 1 + fi + + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "tag=v$VERSION" >> "$GITHUB_OUTPUT" + + - name: Create tag for manual release + if: github.event_name == 'workflow_dispatch' + shell: bash + run: | + TAG="${{ steps.version.outputs.tag }}" + CURRENT_SHA="$(git rev-parse HEAD)" + + if git rev-parse "$TAG" >/dev/null 2>&1; then + TAG_SHA="$(git rev-list -n 1 "$TAG")" + if [[ "$TAG_SHA" != "$CURRENT_SHA" ]]; then + echo "Tag $TAG already exists at $TAG_SHA, not current HEAD $CURRENT_SHA" >&2 + exit 1 + fi + else + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git tag "$TAG" + git push origin "$TAG" + fi + + - name: Check formatting + run: cargo fmt -- --check + + - name: Lint + run: cargo clippy --all-features -- -D warnings + + - name: Test all features + run: cargo test --all-features + + - name: Test no-default feature matrix + run: cargo test --no-default-features --features serde,bytes,simd + + - name: Package crate + run: cargo package + + - name: Publish crate to crates.io + run: cargo publish --token "$CARGO_REGISTRY_TOKEN" + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + + - name: Create GitHub release + env: + GH_TOKEN: ${{ github.token }} + TAG: ${{ steps.version.outputs.tag }} + VERSION: ${{ steps.version.outputs.version }} + run: | + gh release create "$TAG" \ + "target/package/cheetah-string-$VERSION.crate#cheetah-string-$VERSION.crate" \ + --verify-tag \ + --title "cheetah-string $TAG" \ + --generate-notes diff --git a/Cargo.toml b/Cargo.toml index a258edb..2d54803 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cheetah-string" -version = "1.0.1" +version = "1.1.0" authors = ["mxsm "] edition = "2021" homepage = "https://github.com/mxsm/cheetah-string" @@ -15,14 +15,14 @@ A lightweight, high-performance string manipulation library optimized for speed- """ [dependencies] -bytes = "1.10.0" +bytes = { version = "1.10.0", optional = true, default-features = false } serde = { version = "1.0", optional = true, default-features = false, features = ["alloc"] } [features] default = ["std"] std = [] -serde = ["serde/alloc"] -bytes = [] +serde = ["dep:serde", "serde/alloc"] +bytes = ["dep:bytes"] simd = [] [dev-dependencies] diff --git a/README.md b/README.md index 9156ce1..db5583c 100644 --- a/README.md +++ b/README.md @@ -45,14 +45,14 @@ Add this to your `Cargo.toml`: ```toml [dependencies] -cheetah-string = "1.0.0" +cheetah-string = "1.1.0" ``` ### Optional Features ```toml [dependencies] -cheetah-string = { version = "1.0.0", features = ["bytes", "serde", "simd"] } +cheetah-string = { version = "1.1.0", features = ["bytes", "serde", "simd"] } ``` Available features: diff --git a/benches/comprehensive.rs b/benches/comprehensive.rs index 14a405f..174ce1a 100644 --- a/benches/comprehensive.rs +++ b/benches/comprehensive.rs @@ -362,13 +362,13 @@ fn bench_internal_hot_paths(c: &mut Criterion) { }); let long_bytes = vec![b'a'; 256]; - group.bench_function("CheetahString::from(Vec 256B)", |b| { - b.iter(|| black_box(CheetahString::from(long_bytes.clone()))) + group.bench_function("CheetahString::try_from_vec(256B)", |b| { + b.iter(|| black_box(CheetahString::try_from_vec(long_bytes.clone()).unwrap())) }); - group.bench_function("String::from(CheetahString::from(Vec 256B))", |b| { + group.bench_function("String::from(CheetahString::try_from_vec(256B))", |b| { b.iter(|| { - let value = CheetahString::from(long_bytes.clone()); + let value = CheetahString::try_from_vec(long_bytes.clone()).unwrap(); black_box(String::from(value)) }) }); diff --git a/src/cheetah_string.rs b/src/cheetah_string.rs index 9052254..89df165 100644 --- a/src/cheetah_string.rs +++ b/src/cheetah_string.rs @@ -47,19 +47,12 @@ impl<'a> From<&'a str> for CheetahString { } } -/// # Safety Warning -/// -/// This implementation uses `unsafe` code and may cause undefined behavior -/// if the bytes are not valid UTF-8. Consider using `CheetahString::try_from_bytes()` -/// for safe UTF-8 validation. -/// -/// This implementation will be deprecated in a future version. -impl From<&[u8]> for CheetahString { +impl<'a> TryFrom<&'a [u8]> for CheetahString { + type Error = Utf8Error; + #[inline] - fn from(b: &[u8]) -> Self { - // SAFETY: This is unsafe and may cause UB if bytes are not valid UTF-8. - // This will be deprecated in favor of try_from_bytes in the next version. - CheetahString::from_slice(unsafe { str::from_utf8_unchecked(b) }) + fn try_from(b: &'a [u8]) -> Result { + CheetahString::try_from_bytes(b) } } @@ -71,19 +64,12 @@ impl FromStr for CheetahString { } } -/// # Safety Warning -/// -/// This implementation uses `unsafe` code and may cause undefined behavior -/// if the bytes are not valid UTF-8. Consider using `CheetahString::try_from_vec()` -/// for safe UTF-8 validation. -/// -/// This implementation will be deprecated in a future version. -impl From> for CheetahString { +impl TryFrom> for CheetahString { + type Error = Utf8Error; + #[inline] - fn from(v: Vec) -> Self { - // SAFETY: This constructor does not validate UTF-8 and may cause UB - // if the bytes are later observed as a string. - CheetahString::from_vec(v) + fn try_from(v: Vec) -> Result { + CheetahString::try_from_vec(v) } } @@ -159,10 +145,12 @@ impl<'a> FromIterator<&'a String> for CheetahString { } #[cfg(feature = "bytes")] -impl From for CheetahString { +impl TryFrom for CheetahString { + type Error = Utf8Error; + #[inline] - fn from(b: bytes::Bytes) -> Self { - CheetahString::from_bytes(b) + fn try_from(b: bytes::Bytes) -> Result { + CheetahString::try_from_bytes_buf(b) } } @@ -277,8 +265,29 @@ impl CheetahString { } } - #[inline] + #[deprecated( + since = "1.1.0", + note = "use try_from_vec for checked construction or from_utf8_unchecked_vec for an explicit unsafe constructor" + )] pub fn from_vec(s: Vec) -> Self { + CheetahString::try_from_vec(s).expect( + "CheetahString::from_vec requires valid UTF-8; use try_from_vec for fallible construction", + ) + } + + /// Creates a `CheetahString` from a byte vector without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `s` contains valid UTF-8 for the entire + /// lifetime of the returned `CheetahString`. + #[inline] + pub unsafe fn from_utf8_unchecked_vec(s: Vec) -> Self { + CheetahString::from_validated_vec_unchecked(s) + } + + #[inline] + fn from_validated_vec_unchecked(s: Vec) -> Self { if s.len() <= INLINE_CAPACITY { let mut data = [0u8; INLINE_CAPACITY]; data[..s.len()].copy_from_slice(&s); @@ -314,9 +323,8 @@ impl CheetahString { /// assert!(CheetahString::try_from_vec(invalid).is_err()); /// ``` pub fn try_from_vec(v: Vec) -> Result { - // Validate UTF-8 str::from_utf8(&v)?; - Ok(CheetahString::from_vec(v)) + Ok(CheetahString::from_validated_vec_unchecked(v)) } /// Creates a `CheetahString` from a byte slice with UTF-8 validation. @@ -342,8 +350,51 @@ impl CheetahString { Ok(CheetahString::from_slice(s)) } + /// Creates a `CheetahString` from a byte slice without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `b` contains valid UTF-8. + #[inline] + pub unsafe fn from_utf8_unchecked_bytes(b: &[u8]) -> Self { + // SAFETY: The caller guarantees that `b` contains valid UTF-8. + CheetahString::from_slice(unsafe { str::from_utf8_unchecked(b) }) + } + + /// Creates a `CheetahString` from a shared byte vector with UTF-8 validation. + /// + /// # Errors + /// + /// Returns an error if the bytes are not valid UTF-8. + #[inline] + pub fn try_from_arc_vec(s: Arc>) -> Result { + str::from_utf8(s.as_slice())?; + Ok(CheetahString::from_validated_arc_vec_unchecked(s)) + } + + #[deprecated( + since = "1.1.0", + note = "use try_from_arc_vec for checked construction or from_utf8_unchecked_arc_vec for an explicit unsafe constructor" + )] #[inline] pub fn from_arc_vec(s: Arc>) -> Self { + CheetahString::try_from_arc_vec(s).expect( + "CheetahString::from_arc_vec requires valid UTF-8; use try_from_arc_vec for fallible construction", + ) + } + + /// Creates a `CheetahString` from a shared byte vector without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `s` contains valid UTF-8. + #[inline] + pub unsafe fn from_utf8_unchecked_arc_vec(s: Arc>) -> Self { + CheetahString::from_validated_arc_vec_unchecked(s) + } + + #[inline] + fn from_validated_arc_vec_unchecked(s: Arc>) -> Self { CheetahString { inner: InnerString::ArcVecString(s), } @@ -418,7 +469,37 @@ impl CheetahString { #[inline] #[cfg(feature = "bytes")] + #[deprecated( + since = "1.1.0", + note = "use try_from_bytes_buf for checked construction or from_utf8_unchecked_bytes_buf for an explicit unsafe constructor" + )] pub fn from_bytes(b: bytes::Bytes) -> Self { + CheetahString::try_from_bytes_buf(b).expect( + "CheetahString::from_bytes requires valid UTF-8; use try_from_bytes_buf for fallible construction", + ) + } + + #[inline] + #[cfg(feature = "bytes")] + pub fn try_from_bytes_buf(b: bytes::Bytes) -> Result { + str::from_utf8(b.as_ref())?; + Ok(CheetahString::from_validated_bytes_unchecked(b)) + } + + /// Creates a `CheetahString` from `bytes::Bytes` without validating UTF-8. + /// + /// # Safety + /// + /// The caller must guarantee that `b` contains valid UTF-8. + #[inline] + #[cfg(feature = "bytes")] + pub unsafe fn from_utf8_unchecked_bytes_buf(b: bytes::Bytes) -> Self { + CheetahString::from_validated_bytes_unchecked(b) + } + + #[inline] + #[cfg(feature = "bytes")] + fn from_validated_bytes_unchecked(b: bytes::Bytes) -> Self { CheetahString { inner: InnerString::Bytes(b), } @@ -1455,6 +1536,7 @@ impl<'a> DoubleEndedIterator for SplitWrapper<'a> { #[cfg(test)] mod tests { use super::*; + use alloc::{format, vec}; #[test] fn with_capacity_above_inline_uses_heap_storage() { @@ -1524,7 +1606,7 @@ mod tests { #[test] fn long_vec_conversion_uses_arc_vec_storage() { let value = "a".repeat(INLINE_CAPACITY + 1).into_bytes(); - let s = CheetahString::from(value); + let s = CheetahString::try_from_vec(value).expect("valid utf-8"); match &s.inner { InnerString::ArcVecString(inner) => { diff --git a/src/lib.rs b/src/lib.rs index 5d030ba..c9aab49 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,7 +21,7 @@ //! To enable SIMD acceleration: //! ```toml //! [dependencies] -//! cheetah-string = { version = "1.0.0", features = ["simd"] } +//! cheetah-string = { version = "1.1.0", features = ["simd"] } //! ``` //! //! # Examples diff --git a/src/serde.rs b/src/serde.rs index 0eef167..f0d448d 100644 --- a/src/serde.rs +++ b/src/serde.rs @@ -1,10 +1,9 @@ -use crate::cheetah_string::InnerString; use crate::CheetahString; use alloc::string::String; use alloc::vec::Vec; use core::fmt; use core::str; -use serde::de::{Error, Unexpected, Visitor}; +use serde::de::{Error, Visitor}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; impl Serialize for CheetahString { @@ -12,20 +11,7 @@ impl Serialize for CheetahString { where S: Serializer, { - match &self.inner { - InnerString::Inline { len, data } => { - // Safety: InnerString::Inline guarantees that data[0..len] is valid UTF-8 - let s = unsafe { str::from_utf8_unchecked(&data[..*len as usize]) }; - serializer.serialize_str(s) - } - InnerString::StaticStr(s) => serializer.serialize_str(s), - InnerString::ArcStr(s) => serializer.serialize_str(s.as_ref()), - InnerString::Owned(s) => serializer.serialize_str(s.as_str()), - InnerString::ArcString(s) => serializer.serialize_str(s.as_str()), - InnerString::ArcVecString(s) => serializer.serialize_bytes(s), - #[cfg(feature = "bytes")] - InnerString::Bytes(bytes) => serializer.serialize_bytes(bytes.as_ref()), - } + serializer.serialize_str(self.as_str()) } } @@ -67,27 +53,25 @@ where where E: Error, { - Ok(CheetahString::from(v)) + str::from_utf8(v) + .map(CheetahString::from_slice) + .map_err(Error::custom) } fn visit_borrowed_bytes(self, v: &'a [u8]) -> Result where E: Error, { - Ok(CheetahString::from(v)) + str::from_utf8(v) + .map(CheetahString::from_slice) + .map_err(Error::custom) } fn visit_byte_buf(self, v: Vec) -> Result where E: Error, { - match String::from_utf8(v) { - Ok(s) => Ok(CheetahString::from_string(s)), - Err(e) => Err(Error::invalid_value( - Unexpected::Bytes(&e.into_bytes()), - &self, - )), - } + CheetahString::try_from_vec(v).map_err(Error::custom) } } deserializer.deserialize_str(CheetahStringVisitor) diff --git a/tests/basic.rs b/tests/basic.rs index af3cd9c..4c8329a 100644 --- a/tests/basic.rs +++ b/tests/basic.rs @@ -189,7 +189,7 @@ fn test_into_string_reuses_unique_vec_buffer() { let bytes = vec![b'a'; 64]; let original_ptr = bytes.as_ptr(); - let s = CheetahString::from(bytes); + let s = CheetahString::try_from_vec(bytes).unwrap(); let owned: String = s.into(); assert_eq!(owned.as_bytes().as_ptr(), original_ptr); @@ -200,7 +200,7 @@ fn test_into_string_clones_shared_vec_buffer() { let bytes = vec![b'a'; 64]; let original_ptr = bytes.as_ptr(); - let s = CheetahString::from(bytes); + let s = CheetahString::try_from_vec(bytes).unwrap(); let shared = s.clone(); let owned: String = s.into(); @@ -281,6 +281,26 @@ fn test_try_from_vec_method() { assert!(CheetahString::try_from_vec(invalid).is_err()); } +#[test] +fn test_try_from_slice_trait() { + let bytes: &[u8] = b"hello"; + let s = CheetahString::try_from(bytes).unwrap(); + assert_eq!(s, "hello"); + + let invalid: &[u8] = &[0xFF, 0xFE]; + assert!(CheetahString::try_from(invalid).is_err()); +} + +#[test] +fn test_try_from_vec_trait() { + let bytes = vec![104, 101, 108, 108, 111]; + let s = CheetahString::try_from(bytes).unwrap(); + assert_eq!(s, "hello"); + + let invalid = vec![0xFF, 0xFE]; + assert!(CheetahString::try_from(invalid).is_err()); +} + #[test] fn test_unicode() { let s = CheetahString::from("\u{00E9}\u{00E7}\u{00F1}\u{00FC}"); // accented chars @@ -341,6 +361,9 @@ fn test_from_bytes_feature() { use bytes::Bytes; let bytes = Bytes::from("hello"); - let s = CheetahString::from(bytes); + let s = CheetahString::try_from(bytes).unwrap(); assert_eq!(s, "hello"); + + let invalid = Bytes::from_static(&[0xFF, 0xFE]); + assert!(CheetahString::try_from(invalid).is_err()); } From 08189cdbbc1956b8053e7a06bab47a5876d09948 Mon Sep 17 00:00:00 2001 From: mxsm Date: Sat, 20 Jun 2026 10:27:45 +0800 Subject: [PATCH 2/2] Fix: satisfy nightly clippy question-mark lint --- src/simd.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/simd.rs b/src/simd.rs index 175f491..80a8293 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -213,18 +213,15 @@ unsafe fn find_bytes_sse2(haystack: &[u8], needle: &[u8]) -> Option { while pos + needle_len <= haystack_len { // Find the next occurrence of the first byte - if let Some(offset) = find_byte_sse2(&haystack[pos..], first_byte) { - let candidate_pos = pos + offset; - - // Check if the rest matches - if candidate_pos + needle_len <= haystack_len { - if eq_bytes_sse2(&haystack[candidate_pos..candidate_pos + needle_len], needle) { - return Some(candidate_pos); - } - pos = candidate_pos + 1; - } else { - return None; + let offset = find_byte_sse2(&haystack[pos..], first_byte)?; + let candidate_pos = pos + offset; + + // Check if the rest matches + if candidate_pos + needle_len <= haystack_len { + if eq_bytes_sse2(&haystack[candidate_pos..candidate_pos + needle_len], needle) { + return Some(candidate_pos); } + pos = candidate_pos + 1; } else { return None; }