From f9653eef586e74af02b106786f5dd0506af53ade Mon Sep 17 00:00:00 2001 From: warren Date: Wed, 11 Mar 2026 11:08:16 +0800 Subject: [PATCH] Initial commit: rust-scribe transcription tool --- .DS_Store | Bin 0 -> 6148 bytes .cargo/config.toml | 14 + .cargo/config.toml.bak | 21 ++ .gitignore | 1 + AGENTS.md | 173 ++++++++++ Cargo.lock | 718 +++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 34 ++ README.md | 73 +++++ models/.gitignore | 2 + src/main.rs | 432 +++++++++++++++++++++++++ src/main.rs.bak | 401 +++++++++++++++++++++++ 11 files changed, 1869 insertions(+) create mode 100644 .DS_Store create mode 100644 .cargo/config.toml create mode 100644 .cargo/config.toml.bak create mode 100644 .gitignore create mode 100644 AGENTS.md create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 models/.gitignore create mode 100644 src/main.rs create mode 100644 src/main.rs.bak diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..ed26afa82fd46ebc028b08ed07ab42006e034070 GIT binary patch literal 6148 zcmeHKJxc>Y5Ph43aQH#2M6kI+TfxF6)fr+b2=<~$Bno$&B+>eQfZz`h!9vi|LeN$l zYg_+;g}tDKGrOzKc?KI1kr~*1cRMrt=3ef0vj9wEw=)k+00^itRz|4%!r0HPU?o2? 
zNoedGU8He$ezlz@JB4T+Pz6+hzor0xyH(7gjScir__*u)?&h;YbqYW3uW%E=|lqE5!H>&xf=A&lLc`YRn$HrP z6MBq1W)8_iW?U+vOEvz8VO%I#YU&t4Pk;lv-jgaXd0V{)cs=%)*@Ctq6vg7~& literal 0 HcmV?d00001 diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000..b7ae605 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,14 @@ +[target.aarch64-apple-darwin] +rustflags = [ + "-L", "/opt/homebrew/lib", + "-l", "c++", + # 正確写法:使用 -C link-arg= 將參數傳遞給連結器 + "-C", "link-arg=-framework", + "-C", "link-arg=Metal", + "-C", "link-arg=-framework", + "-C", "link-arg=Foundation", + "-C", "link-arg=-framework", + "-C", "link-arg=QuartzCore", + "-C", "link-arg=-framework", + "-C", "link-arg=CoreGraphics" +] diff --git a/.cargo/config.toml.bak b/.cargo/config.toml.bak new file mode 100644 index 0000000..38a1074 --- /dev/null +++ b/.cargo/config.toml.bak @@ -0,0 +1,21 @@ +[target.aarch64-apple-darwin] +rustflags = [ + "-L", "/opt/homebrew/lib", + "-l", "c++", + # 新增:連結 Metal 相關框架 (解決 Undefined symbols 錯誤) + "-framework", "Metal", + "-framework", "Foundation", + "-framework", "QuartzCore", + "-framework", "CoreGraphics" +] + +# 如果是 Intel Mac,取消下面註解並使用 +# [target.x86_64-apple-darwin] +# rustflags = [ +# "-L", "/usr/local/lib", +# "-l", "c++", +# "-framework", "Metal", +# "-framework", "Foundation", +# "-framework", "QuartzCore", +# "-framework", "CoreGraphics" +# ] diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..ddc57c6 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,173 @@ +# AGENTS.md - rust-scribe + +## Project Overview + +rust-scribe is a high-performance video/audio transcriber with timestamps using Rust and Whisper. It extracts audio from video files and transcribes them using the Whisper.cpp library. 
+ +## Build Commands + +```bash +# Build the project (debug mode) +cargo build + +# Build release version +cargo build --release + +# Run the application +cargo run --release -- <input_file> --model <model_path> [options] + +# Example usage: +cargo run --release -- video.mp4 --model models/ggml-base.bin --language zh +``` + +### Running Tests + +```bash +# Run all tests +cargo test + +# Run a single test by name +cargo test <test_name> + +# Run tests with output +cargo test -- --nocapture +``` + +### Linting and Formatting + +```bash +# Run clippy for linting +cargo clippy + +# Fix clippy suggestions automatically +cargo clippy --fix + +# Format code +cargo fmt + +# Check formatting +cargo fmt --check +``` + +## Dependencies + +- **ffmpeg-next** (v8.0): FFmpeg bindings for audio extraction and resampling +- **whisper-rs** (v0.12): Rust bindings for Whisper.cpp +- **whisper-rs-sys** (v0.10): Low-level Whisper bindings +- **clap** (v4.5): CLI argument parsing +- **anyhow** (v1.0): Error handling +- **ndarray** (v0.15): Array operations + +## Code Style Guidelines + +### Formatting +- Use `cargo fmt` for consistent formatting +- 4-space indentation (Rust default) +- Maximum line length: 100 characters (default) + +### Imports +- Group imports by crate: std → external → local +- Use absolute paths with `crate::` for internal modules +- Prefer bringing traits into scope when using them + +```rust +use std::path::Path; +use anyhow::{Context, Result}; +use clap::Parser; +use ffmpeg_next as ffmpeg; +``` + +### Naming Conventions +- **Variables/functions**: snake_case (e.g., `extract_audio_to_f32`, `audio_data`) +- **Types/Enums**: PascalCase (e.g., `Args`, `WhisperContext`) +- **Constants**: SCREAMING_SNAKE_CASE (e.g., `WHISPER_SAMPLE_RATE`) +- **Files**: snake_case (e.g., `main.rs`) + +### Error Handling +- Use `anyhow::Result` for application-level error handling +- Use `?` operator for propagating errors +- Use `Context` trait for adding context to errors +- Use `anyhow::bail!` for early returns with errors 
+- Provide descriptive error messages in Chinese or English + +```rust +fn load_config() -> Result<Config> { + let file = File::open("config.toml") + .context("Failed to open config file")?; + // ... +} +``` + +### Unsafe Code +- Minimize unsafe code; isolate it in small, well-documented functions +- Use `unsafe` blocks only when necessary (e.g., FFI callbacks) +- Document preconditions and invariants + +```rust +unsafe extern "C" fn progress_callback(...) { + // Document what this callback does + // Keep unsafe block minimal +} +``` + +### Documentation +- Add doc comments (`///`) for public functions +- Document parameters and return values +- Include usage examples for complex functions + +### Performance Considerations +- Use `AtomicU64`/`AtomicBool` for global state in callbacks +- Pre-allocate vectors with `Vec::with_capacity()` when size is known +- Use `saturating_*` operations to prevent overflow +- Reuse objects instead of creating new ones in loops + +### Type Annotations +- Prefer explicit types for function signatures +- Use type inference for obvious local variables +- Use primitive types (`u32`, `f64`, etc.) over aliases + +### Control Flow +- Use early returns to reduce nesting +- Prefer `?` over `match` for simple error propagation +- Use `if let` for optional values when pattern matching is simple + +## Project Structure + +``` +rust-scribe/ +├── src/ +│ └── main.rs # Main application code +├── models/ # Whisper model files +├── Cargo.toml # Project manifest +└── .cargo/ + └── config.toml # Cargo configuration +``` + +## Configuration + +### CLI Arguments +- `input_file` (positional): Path to video/audio file +- `--model`: Path to Whisper model file +- `--language`: Target language (optional, auto-detects if not specified) +- `--verbose`: Enable verbose output + +### Model Requirements +Place Whisper model files (e.g., `ggml-base.bin`) in the `models/` directory. 
+ +## Common Tasks + +### Adding a New Dependency +Add to `[dependencies]` section in `Cargo.toml`: +```toml +package_name = "version" +``` + +### Adding a New Feature +1. Implement the feature in a new function in `src/main.rs` +2. Add CLI argument if needed in `Args` struct +3. Test with sample audio/video files + +### Debugging +- Use `eprintln!` for debug output (goes to stderr) +- Use `println!` for progress messages +- Enable `--verbose` flag for Whisper debug output diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..def15a0 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,718 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] 
+name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bindgen" +version = "0.69.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools 0.12.1", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 1.1.0", + "shlex", + "syn", + "which", +] + +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "proc-macro2", + "quote", + "regex", + "rustc-hash 2.1.1", + "shlex", + "syn", +] + +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + +[[package]] +name = "cc" +version = "1.2.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" + +[[package]] +name = "cmake" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +dependencies = [ + "cc", +] + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "ffmpeg-next" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d658424d233cbd993a972dd73a66ca733acd12a494c68995c9ac32ae1fe65b40" +dependencies = [ + "bitflags", + "ffmpeg-sys-next", + "libc", +] + +[[package]] +name = "ffmpeg-sys-next" +version = "8.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bca20aa4ee774fe384c2490096c122b0b23cf524a9910add0686691003d797b" +dependencies = [ + "bindgen 0.72.1", + "cc", + "libc", + "num_cpus", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.182" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" + +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = 
"0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = 
"regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "rust-scribe" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "ffmpeg-next", + "ndarray", + "whisper-rs", + "whisper-rs-sys", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.59.0", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + 
+[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "whisper-rs" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c597ac8a9d5c4719fee232abc871da184ea50a4fea38d2d00348fd95072b2b0" +dependencies = [ + "whisper-rs-sys", +] + +[[package]] +name = "whisper-rs-sys" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d22f00ed0995463eecc34ef89905845f6bf6fd37ea70789fed180520050da8f8" +dependencies = [ + "bindgen 0.69.5", + "cfg-if", + "cmake", + "fs_extra", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..e97b46e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "rust-scribe" +version = "0.1.0" +edition = "2021" +description = "A high-performance video/audio transcriber with timestamps using Rust and Whisper." +authors = ["Warren Lo"] + +[dependencies] +# FFmpeg 綁定:用於音頻提取和重採樣 +ffmpeg-next = "8.0" + +# Whisper.cpp 的 Rust 綁定:核心轉寫引擎 +whisper-rs = "0.12" + +whisper-rs-sys = "0.10" + +# 用於命令行參數解析 (比手動解析更專業) +clap = { version = "4.5", features = ["derive"] } + +# 用於處理錯誤 +anyhow = "1.0" + +# 可選:如果 whisper-rs 需要額外的數學運算支持 +ndarray = "0.15" + + + +# --- 新增以下內容 --- +[target.'cfg(target_os = "macos")'.dependencies] +# 如果未來需要顯式依賴某些 crate 可放在這裡 + +# 關鍵:告訴 cargo 在 macOS 上編譯 whisper-rs (當啟用 metal 時) 需要連結哪些框架 +# 注意:whisper-rs 的 build script 通常會自動處理,但有時需要手動干預 +# 更可靠的方法是在 .cargo/config.toml 中設置 diff --git a/README.md b/README.md new file mode 100644 index 0000000..39bb9d1 --- /dev/null +++ b/README.md @@ -0,0 +1,73 @@ +# rust-scribe + +`rust-scribe` 是一個高效能的影片/音頻逐字稿轉寫工具,使用 Rust 語言編寫,並結合了 [Whisper.cpp](https://github.com/ggerganov/whisper.cpp) 的強大轉寫能力與 [FFmpeg](https://ffmpeg.org/) 的音頻處理功能,可自動生成帶有時間碼的逐字稿。 + +## 功能特性 + +- **高效音頻提取**:利用 FFmpeg 直接從影片或音頻檔案中提取並重採樣為 Whisper 所需的格式 (16kHz, Mono, f32)。 +- **精準轉寫**:基於 OpenAI Whisper 模型,提供高精度的語音轉文字功能。 +- **時間碼支援**:自動產生精確到毫秒的逐字稿時間戳。 +- **語言自動檢測**:若未指定語言,可自動偵測輸入音頻的語言。 +- **優化進度顯示**:在轉寫過程中提供即時的處理進度、耗時與預估剩餘時間。 + +## 前置需求 + +在編譯與執行前,請確保系統已安裝以下環境: + +1. 
**Rust 環境**:請安裝 [Rustup](https://rustup.rs/)。 +2. **FFmpeg**:系統必須安裝 FFmpeg 開發庫。 + - macOS: `brew install ffmpeg` + - Ubuntu/Debian: `sudo apt install libavcodec-dev libavformat-dev libavutil-dev libswresample-dev` +3. **Whisper 模型**:請準備 Whisper 的 `.bin` 模型檔案 (例如 `ggml-base.bin`),並放置於 `models/` 目錄中。 + +## 編譯與安裝 + +```bash +# 複製專案 +git clone +cd rust-scribe + +# 編譯 Release 版本 (建議) +cargo build --release +``` + +## 使用方式 + +執行程式時,需指定輸入檔案路徑及模型路徑: + +```bash +./target/release/rust-scribe --model models/ +``` + +### 參數說明 + +- ``: 欲轉寫的影片或音頻檔案路徑 (位置參數)。 +- `-m, --model `: Whisper 模型檔案路徑 (必填)。 +- `-l, --language `: 指定轉寫語言 (例如 `zh`, `en`)。若不指定,系統將自動偵測 (選填)。 +- `-v, --verbose`: 開啟詳細轉寫進度輸出 (選填)。 + +**使用範例**: + +```bash +# 指定使用 base 模型,自動檢測語言 +cargo run --release -- video.mp4 --model models/ggml-base.bin + +# 指定使用中文進行轉寫 +cargo run --release -- lecture.mkv --model models/ggml-base.bin --language zh +``` + +## 專案結構 + +``` +rust-scribe/ +├── src/ +│ └── main.rs # 核心邏輯 (音頻處理、Whisper 轉寫、CLI 介面) +├── models/ # 存放 Whisper 模型檔案 +├── Cargo.toml # 專案依賴與配置 +└── .cargo/ + └── config.toml # Cargo 編譯配置 +``` + +## 開發者規範 + +請參閱 `AGENTS.md` 了解詳細的代碼風格、編譯與測試指南。 diff --git a/models/.gitignore b/models/.gitignore new file mode 100644 index 0000000..7dd7cf8 --- /dev/null +++ b/models/.gitignore @@ -0,0 +1,2 @@ +*.bin +*.ggml diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..1a00a27 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,432 @@ +use anyhow::{Context, Result}; +use clap::Parser; +use ffmpeg_next as ffmpeg; +use ffmpeg::format::input; +use ffmpeg::media::Type; +use ffmpeg::codec::context::Context as CodecContext; +use ffmpeg::frame::Audio as AudioFrame; +use std::path::Path; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::time::{SystemTime, UNIX_EPOCH}; +use std::io::{self, Write}; +use std::ffi::c_void; + +// 導入 Whisper 相關類型 +use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters}; +use whisper_rs_sys::{whisper_context, 
whisper_state};

// Sample rate, in Hz, of the audio buffer passed to Whisper; `main` divides the
// sample count by this value to obtain the audio duration in seconds.
const WHISPER_SAMPLE_RATE: u32 = 16000;

// --- Global state shared between `main` and `progress_callback` ---

// Wall-clock time (nanoseconds since the Unix epoch) of the first progress
// callback. 0 means "not recorded yet"; `main` resets it to 0 before a run.
static START_TIME_NANOS: AtomicU64 = AtomicU64::new(0);
// True until the first status line has been printed; used to emit one leading
// newline so the status line does not overwrite earlier output.
static IS_FIRST_PROGRESS: AtomicBool = AtomicBool::new(true);
// Total audio duration in hundredths of a second (seconds * 100).
static TOTAL_DURATION_SEC_X100: AtomicU64 = AtomicU64::new(0);

// State used to smooth the remaining-time estimate.
// Previous estimate of the remaining time, in hundredths of a second.
static LAST_REMAINING_SEC_X100: AtomicU64 = AtomicU64::new(0);
// Progress value (percent) seen by the previous update, so that smoothing is
// applied only when the progress has actually advanced.
static LAST_PROGRESS: AtomicU64 = AtomicU64::new(0);
// -------------------------------------------------------------------

// Command-line arguments.
// NOTE: plain `//` comments are used on purpose. clap's derive macro turns `///`
// doc comments into help text, which would change the program's `--help` output.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    // Path of the video/audio file to transcribe (first positional argument).
    #[arg(index = 1)]
    input_file: String,
    // Path of the Whisper model file.
    #[arg(short, long)]
    model: String,
    // Language passed to Whisper; when absent, `main` passes `None` to
    // `set_language`.
    #[arg(short, long, default_value = None)]
    language: Option<String>,
    // Forwarded to `FullParams::set_print_progress`.
    #[arg(short, long, default_value_t = false)]
    verbose: bool,
}

/// C-ABI progress callback that renders a one-line status display on stderr.
///
/// The line is redrawn in place (it starts with `\r`) and shows the processed and
/// total audio time, the percentage, the elapsed wall-clock time and a smoothed
/// estimate of the remaining time.
///
/// # Parameters
/// * `_ctx`, `_state`, `_user_data` - ignored; they are never dereferenced.
/// * `progress` - completion percentage. It is clamped to `0..=100`, and updates
///   below 5% are ignored because the estimate is too unstable that early.
///
/// Reads and updates the global atomics declared above, so the displayed values
/// are only meaningful after `main` has reset them for the current run.
///
/// # Safety
/// The body performs no unsafe operations and ignores its pointer arguments, so
/// any pointer values (including null) are acceptable. It must not panic, because
/// a panic cannot unwind out of an `extern "C"` function.
unsafe extern "C" fn progress_callback(
    _ctx: *mut whisper_context,
    _state: *mut whisper_state,
    progress: i32,
    _user_data: *mut c_void,
) {
    // `duration_since` fails when the system clock is set before the Unix epoch.
    // Skip this update instead of panicking inside an `extern "C"` function.
    let Ok(since_epoch) = SystemTime::now().duration_since(UNIX_EPOCH) else {
        return;
    };
    // Nanoseconds since the epoch fit in a u64 for several centuries to come.
    let now_nanos = since_epoch.as_nanos() as u64;

    // The first callback records the start time. The exchange succeeds only while
    // the slot still holds 0; a failure just means a start time is already stored.
    let _ = START_TIME_NANOS.compare_exchange(0, now_nanos, Ordering::Relaxed, Ordering::Relaxed);
    let start_nanos = START_TIME_NANOS.load(Ordering::Relaxed);
    if start_nanos == 0 {
        return;
    }

    let elapsed_sec = now_nanos.saturating_sub(start_nanos) as f64 / 1_000_000_000.0;

    // Fall back to one second when no duration has been published, so the
    // "processed / total" display stays well defined.
    let total_sec_x100 = TOTAL_DURATION_SEC_X100.load(Ordering::Relaxed);
    let total_sec = if total_sec_x100 > 0 {
        total_sec_x100 as f64 / 100.0
    } else {
        1.0
    };

    // Clamp once so that neither the displayed percentage nor the value stored in
    // LAST_PROGRESS can leave 0..=100 (a negative i32 cast to u64 would wrap).
    let progress_pct = progress.clamp(0, 100) as u64;
    let current_percent = progress_pct as f64 / 100.0;

    // Ignore very early progress (< 5%): the estimate is extremely unstable there.
    if current_percent < 0.05 {
        return;
    }

    // Raw estimate: extrapolate the total run time from the elapsed time.
    let raw_remaining_sec = if current_percent >= 0.99 {
        0.0
    } else {
        elapsed_sec / current_percent - elapsed_sec
    };

    let last_rem_sec = LAST_REMAINING_SEC_X100.load(Ordering::Relaxed) as f64 / 100.0;
    let last_prog = LAST_PROGRESS.load(Ordering::Relaxed);

    // The first valid update, or an update whose progress has not advanced, uses
    // the raw estimate directly. Otherwise the estimate is smoothed.
    let final_remaining_sec = if last_prog > 0 && progress_pct > last_prog {
        // Weighted moving average: 60% previous estimate + 40% new estimate, to
        // keep the displayed number from jumping around.
        let smoothed = last_rem_sec * 0.6 + raw_remaining_sec * 0.4;
        // Allow the estimate to grow by at most 30 seconds per update (a stall
        // would otherwise inflate it without bound) and never let it go negative.
        smoothed.min(last_rem_sec + 30.0).max(0.0)
    } else {
        raw_remaining_sec
    };

    // Publish the state for the next update.
    LAST_REMAINING_SEC_X100.store((final_remaining_sec * 100.0) as u64, Ordering::Relaxed);
    LAST_PROGRESS.store(progress_pct, Ordering::Relaxed);

    let rem_min = final_remaining_sec as u32 / 60;
    let rem_s = final_remaining_sec as u32 % 60;

    let elapsed_min = elapsed_sec as u32 / 60;
    let elapsed_s = elapsed_sec as u32 % 60;

    // Audio time processed so far (the numerator of the "processed / total" pair).
    let processed_sec = total_sec * current_percent;
    let proc_min = processed_sec as u32 / 60;
    let proc_s = processed_sec as u32 % 60;
    let tot_min = total_sec as u32 / 60;
    let tot_s = total_sec as u32 % 60;

    let percent_display = current_percent * 100.0;

    let stderr = io::stderr();
    let mut handle = stderr.lock();

    // Start the status display on a fresh line the first time it is drawn.
    if IS_FIRST_PROGRESS.load(Ordering::Relaxed) {
        let _ = writeln!(handle);
        IS_FIRST_PROGRESS.store(false, Ordering::Relaxed);
    }

    // Below 10% the estimate is still too rough to show; print a placeholder.
    let remaining_str = if current_percent < 0.10 {
        "計算中...".to_string()
    } else {
        format!("{:02}:{:02}", rem_min, rem_s)
    };

    // Write errors are deliberately ignored: progress output is best-effort and
    // this function must not fail.
    let _ = write!(
        handle,
        "\r🔄 識別進度: [{:02}:{:02}/{:02}:{:02}] {:.1}% | 耗時: {:02}:{:02} | 預計剩餘: {} ",
        proc_min, proc_s, tot_min, tot_s, percent_display,
        elapsed_min, elapsed_s,
        remaining_str
    );

    // Terminate the line once finished; otherwise flush so the partial line shows.
    if percent_display >= 99.9 {
        let _ = writeln!(handle);
    } else {
        let _ = handle.flush();
    }
}

fn main() -> Result<()> {
    let args = Args::parse();

    if !Path::new(&args.input_file).exists() {
        anyhow::bail!("錯誤:找不到輸入文件 '{}'", args.input_file);
    }
    if !Path::new(&args.model).exists() {
        anyhow::bail!("錯誤:找不到模型文件 '{}'", args.model);
    }

    ffmpeg::init().context("Failed to initialize FFmpeg")?;

    println!("🎬 正在處理文件:{}", args.input_file);
    println!("🧠 載入模型:{}", args.model);

    let audio_data = extract_audio_to_f32(&args.input_file)
        .context("Failed to extract and process audio")?;

    if audio_data.is_empty() {
        anyhow::bail!("錯誤:未能從文件中提取有效音頻數據");
    }

    let total_samples = audio_data.len() as f64;
    let total_duration_sec = total_samples / WHISPER_SAMPLE_RATE as f64;

    println!("✅ 音頻準備完成 (樣本數:{}, 時長:{:.2} 分鐘)", total_samples as usize, total_duration_sec / 60.0);

    // 重置全局狀態
    TOTAL_DURATION_SEC_X100.store((total_duration_sec * 100.0) as u64, Ordering::Relaxed);
    START_TIME_NANOS.store(0, Ordering::Relaxed);
    IS_FIRST_PROGRESS.store(true, Ordering::Relaxed);
    LAST_REMAINING_SEC_X100.store(0, Ordering::Relaxed);
    LAST_PROGRESS.store(0, Ordering::Relaxed);

    println!("⏳ 正在初始化 Whisper 模型...");
    let ctx = WhisperContext::new_with_params(
        &args.model,
        WhisperContextParameters::default()
    ).context("Failed to load Whisper model")?;

    let mut state = ctx.create_state()
        .context("Failed to create Whisper state")?;

    let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
    params.set_print_special(false);
    params.set_print_progress(args.verbose);
    params.set_print_realtime(false);
    params.set_print_timestamps(false);
    params.set_single_segment(false);

    if let Some(lang) = &args.language {
        params.set_language(Some(lang.as_str()));
    } else {
        params.set_language(None);
    }

    unsafe {
params.set_progress_callback(Some(progress_callback)); + } + + println!("🎙️ 正在進行語音識別 (這可能需要幾分鐘)..."); + state.full(params, &audio_data) + .context("Whisper inference failed")?; + + eprintln!("\n✅ 識別完成!"); + + println!("\n=== 逐字稿 (帶時間碼) ===\n"); + + let num_segments = state.full_n_segments() + .context("Failed to get segment count")?; + + if num_segments == 0 { + println!("未檢測到任何語音內容。"); + return Ok(()); + } + + for i in 0..num_segments { + let text = match state.full_get_segment_text(i) { + Ok(t) => t, + Err(e) => { + eprintln!("⚠️ 警告:串流 #{} 包含無效字符 (UTF-8 Error),已跳過。詳情:{}", i, e); + continue; + } + }; + + let start_ts = match state.full_get_segment_t0(i) { + Ok(t) => t as f64 / 100.0, + Err(_) => { + eprintln!("⚠️ 警告:無法獲取串流 #{} 的時間戳", i); + continue; + } + }; + + let time_str = format_time(start_ts); + let clean_text = text.trim(); + + if !clean_text.is_empty() { + println!("[{}] {}", time_str, clean_text); + } + } + + if args.language.is_none() { + if let Ok(lang_id) = state.full_lang_id_from_state() { + let lang_name = get_language_name(lang_id); + println!("\n🌍 自動檢測語言:{} (ID: {})", lang_name, lang_id); + } + } + + Ok(()) +} + +fn get_language_name(lang_id: i32) -> &'static str { + match lang_id { + 0 => "English", 1 => "Chinese", 2 => "German", 3 => "Spanish", + 4 => "Russian", 5 => "Korean", 6 => "French", 7 => "Japanese", + 8 => "Portuguese", 9 => "Turkish", 10 => "Polish", 11 => "Catalan", + 12 => "Dutch", 13 => "Arabic", 14 => "Swedish", 15 => "Italian", + 16 => "Indonesian", 17 => "Hindi", 18 => "Finnish", 19 => "Vietnamese", + 20 => "Hebrew", 21 => "Ukrainian", 22 => "Greek", 23 => "Malay", + 24 => "Czech", 25 => "Romanian", 26 => "Danish", 27 => "Hungarian", + 28 => "Tamil", 29 => "Norwegian", 30 => "Thai", 31 => "Urdu", + 32 => "Croatian", 33 => "Bulgarian", 34 => "Lithuanian", 35 => "Latin", + 36 => "Maori", 37 => "Malayalam", 38 => "Welsh", 39 => "Slovak", + 40 => "Telugu", 41 => "Persian", 42 => "Latvian", 43 => "Bengali", + 44 => "Serbian", 45 => 
"Azerbaijani", 46 => "Slovenian", 47 => "Kannada", + 48 => "Estonian", 49 => "Macedonian", 50 => "Breton", 51 => "Basque", + 52 => "Icelandic", 53 => "Armenian", 54 => "Nepali", 55 => "Mongolian", + 56 => "Bosnian", 57 => "Kazakh", 58 => "Albanian", 59 => "Swahili", + 60 => "Galician", 61 => "Marathi", 62 => "Punjabi", 63 => "Sinhala", + 64 => "Khmer", 65 => "Shona", 66 => "Yoruba", 67 => "Somali", + 68 => "Afrikaans", 69 => "Occitan", 70 => "Georgian", 71 => "Belarusian", + 72 => "Tajik", 73 => "Sindhi", 74 => "Gujarati", 75 => "Amharic", + 76 => "Yiddish", 77 => "Lao", 78 => "Uzbek", 79 => "Faroese", + 80 => "Haitian Creole", 81 => "Pashto", 82 => "Turkmen", 83 => "Nynorsk", + 84 => "Maltese", 85 => "Sanskrit", 86 => "Luxembourgish", 87 => "Myanmar", + 88 => "Tibetan", 89 => "Tagalog", 90 => "Malagasy", 91 => "Assamese", + 92 => "Tatar", 93 => "Hawaiian", 94 => "Lingala", 95 => "Hausa", + 96 => "Bashkir", 97 => "Javanese", 98 => "Sundanese", 99 => "Cantonese", + _ => "Unknown", + } +} + +fn format_time(seconds: f64) -> String { + let total_secs = seconds as u64; + let millis = ((seconds - total_secs as f64) * 1000.0).round() as u32; + let (millis, total_secs) = if millis >= 1000 { + (millis - 1000, total_secs + 1) + } else { + (millis, total_secs) + }; + let h = total_secs / 3600; + let m = (total_secs % 3600) / 60; + let s = total_secs % 60; + format!("{:02}:{:02}:{:02}.{:03}", h, m, s, millis) +} + +fn extract_audio_to_f32(input_path: &str) -> Result> { + let mut ictx = input(&input_path)?; + let stream_index = ictx.streams().best(Type::Audio) + .ok_or_else(|| anyhow::anyhow!("未找到音頻串流"))? 
+ .index(); + let stream = ictx.stream(stream_index).expect("Stream should exist"); + let codec_params = stream.parameters(); + let codec_id = codec_params.id(); + let codec_decoder = ffmpeg::codec::decoder::find(codec_id) + .ok_or_else(|| anyhow::anyhow!("未找到對應的解碼器"))?; + let mut context = CodecContext::new_with_codec(codec_decoder); + context.set_parameters(codec_params)?; + let mut decoder = context.decoder().audio()?; + let out_format = ffmpeg::format::Sample::F32(ffmpeg::format::sample::Type::Packed); + let out_channel_layout = ffmpeg::channel_layout::ChannelLayout::MONO; + let out_sample_rate = WHISPER_SAMPLE_RATE; + let mut resampler: Option = None; + let mut decoded_samples = Vec::new(); + let mut error_count = 0; + for (stream, packet) in ictx.packets() { + if stream.index() != stream_index { continue; } + if let Err(e) = decoder.send_packet(&packet) { eprintln!("⚠️ 發送包失敗:{}", e); continue; } + let mut decoded_frame = AudioFrame::empty(); + while decoder.receive_frame(&mut decoded_frame).is_ok() { + let in_format = decoded_frame.format(); + let mut in_channel_layout = decoded_frame.channel_layout(); + let in_sample_rate = decoded_frame.rate(); + let channels = decoded_frame.channels(); + if channels == 0 || in_channel_layout.is_empty() { + let safe_layout = if channels == 1 { ffmpeg::channel_layout::ChannelLayout::MONO } else if channels > 1 { ffmpeg::channel_layout::ChannelLayout::STEREO } else { ffmpeg::channel_layout::ChannelLayout::STEREO }; + in_channel_layout = safe_layout; + } + let mut resampled_frame = AudioFrame::empty(); + if resampler.is_none() { + eprintln!("ℹ️ 初始化重採樣器..."); + match ffmpeg::software::resampling::Context::get(in_format, in_channel_layout, in_sample_rate, out_format, out_channel_layout, out_sample_rate) { + Ok(new_resampler) => { + resampler = Some(new_resampler); + if let Some(r) = resampler.as_mut() { + if let Ok(_) = r.run(&decoded_frame, &mut resampled_frame) { + if resampled_frame.samples() > 0 { let _ = 
append_frame_samples(&mut decoded_samples, &resampled_frame); } + } + } + }, + Err(init_err) => { eprintln!("❌ 無法初始化重採樣器:{}. 跳過此幀。", init_err); error_count += 1; continue; } + } + } else { + let run_result = resampler.as_mut().unwrap().run(&decoded_frame, &mut resampled_frame); + match run_result { + Ok(delay_opt) => { + if resampled_frame.samples() > 0 { if let Err(e) = append_frame_samples(&mut decoded_samples, &resampled_frame) { eprintln!("⚠️ 追加樣本失敗:{}", e); } } + if let Some(_delay) = delay_opt { + if let Some(r) = resampler.as_mut() { + let mut flush_frame = AudioFrame::empty(); + while let Ok(Some(_)) = r.flush(&mut flush_frame) { if flush_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &flush_frame); } } + } + } + }, + Err(e) => { + let err_msg = format!("{}", e); + if err_msg.contains("Output changed") || err_msg.contains("Invalid") { + eprintln!("\n⚠️ 檢測到音頻參數變化 ('{}'),重置重採樣器...", err_msg); + drop(resampler.take()); error_count += 1; + match ffmpeg::software::resampling::Context::get(in_format, in_channel_layout, in_sample_rate, out_format, out_channel_layout, out_sample_rate) { + Ok(new_resampler) => { + resampler = Some(new_resampler); + let mut retry_frame = AudioFrame::empty(); + if let Some(r) = resampler.as_mut() { + if let Ok(_) = r.run(&decoded_frame, &mut retry_frame) { if retry_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &retry_frame); } } + } + }, + Err(init_err) => { eprintln!("❌ 重置重採樣器失敗:{}. 跳過此幀。", init_err); } + } + } else { eprintln!("❌ 嚴重錯誤:{}. 
停止處理。", e); return Err(e).context("Audio resampling failed unrecoverably"); } + } + } + } + } + } + if error_count > 0 { eprintln!("⚠️ 總共跳過或重置了 {} 次音頻處理。", error_count); } + decoder.send_eof().ok(); + let mut decoded_frame = AudioFrame::empty(); + while decoder.receive_frame(&mut decoded_frame).is_ok() { + if let Some(r) = resampler.as_mut() { + let mut resampled_frame = AudioFrame::empty(); + if let Ok(_) = r.run(&decoded_frame, &mut resampled_frame) { if resampled_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &resampled_frame); } } + } + } + if let Some(r) = resampler.as_mut() { + let mut flush_frame = AudioFrame::empty(); + while let Ok(Some(_)) = r.flush(&mut flush_frame) { if flush_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &flush_frame); } } + } + Ok(decoded_samples) +} + +fn append_frame_samples(buffer: &mut Vec, frame: &AudioFrame) -> Result<()> { + if frame.format() != ffmpeg::format::Sample::F32(ffmpeg::format::sample::Type::Packed) { return Err(anyhow::anyhow!("Unexpected audio sample format")); } + let data = frame.data(0); + let len = frame.samples(); + let byte_len = len * 4; + if data.len() < byte_len { return Err(anyhow::anyhow!("Audio frame data size mismatch")); } + let slice = &data[0..byte_len]; + let ptr = slice.as_ptr() as *const f32; + let f32_slice = unsafe { std::slice::from_raw_parts(ptr, len) }; + buffer.extend_from_slice(f32_slice); + Ok(()) +} diff --git a/src/main.rs.bak b/src/main.rs.bak new file mode 100644 index 0000000..af0b67b --- /dev/null +++ b/src/main.rs.bak @@ -0,0 +1,401 @@ +use anyhow::{Context, Result}; +use clap::Parser; +use ffmpeg_next as ffmpeg; +use ffmpeg::format::input; +use ffmpeg::media::Type; +use ffmpeg::codec::context::Context as CodecContext; +use ffmpeg::frame::Audio as AudioFrame; +use std::path::Path; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::time::{SystemTime, UNIX_EPOCH}; +use std::io::{self, Write}; +use std::ffi::c_void; 
+ +// 導入 Whisper 相關類型 +use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters}; +use whisper_rs_sys::{whisper_context, whisper_state}; + +const WHISPER_SAMPLE_RATE: u32 = 16000; + +// --- 全局狀態 (使用原子變量確保線程安全) --- +static START_TIME_NANOS: AtomicU64 = AtomicU64::new(0); +static IS_FIRST_PROGRESS: AtomicBool = AtomicBool::new(true); +// 存儲音頻總時長 (秒 * 100, 用於整數運算避免浮點數原子操作) +static TOTAL_DURATION_SEC_X100: AtomicU64 = AtomicU64::new(0); +// ----------------------------------------- + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + /// 輸入的視頻或音頻文件路徑 + #[arg(index = 1)] + input_file: String, + + /// Whisper 模型文件路徑 (.bin 格式) + #[arg(short, long)] + model: String, + + /// 目標語言代碼 (例如:zh, en)。留空則自動檢測。 + #[arg(short, long, default_value = None)] + language: Option, + + /// 是否顯示詳細日誌 + #[arg(short, long, default_value_t = false)] + verbose: bool, +} + +/// C 語言風格的回調函數 +unsafe extern "C" fn progress_callback( + _ctx: *mut whisper_context, + _state: *mut whisper_state, + progress: i32, + _user_data: *mut c_void, +) { + let now_nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() as u64; + + // 初始化開始時間 + let start_nanos = START_TIME_NANOS.load(Ordering::Relaxed); + if start_nanos == 0 { + if let Err(existing) = START_TIME_NANOS.compare_exchange( + 0, now_nanos, Ordering::Relaxed, Ordering::Relaxed + ) { + if existing == 0 { START_TIME_NANOS.store(now_nanos, Ordering::Relaxed); } + } + } + + let actual_start_nanos = START_TIME_NANOS.load(Ordering::Relaxed); + if actual_start_nanos == 0 { return; } + + let elapsed_nanos = now_nanos.saturating_sub(actual_start_nanos); + let elapsed_sec = elapsed_nanos as f64 / 1_000_000_000.0; + + // 獲取總時長 (還原為 f64) + let total_sec_x100 = TOTAL_DURATION_SEC_X100.load(Ordering::Relaxed); + let total_sec = if total_sec_x100 > 0 { + total_sec_x100 as f64 / 100.0 + } else { + 1.0 // 防禦性默認值 + }; + + // 計算當前已處理的時長 (基於百分比估算,因為 whisper 回調只給百分比) + // 
注意:Whisper 的 progress 是基於編碼器處理的塊數,與時間大致成正比 + let current_percent = (progress as f64).min(100.0) / 100.0; + let processed_sec = total_sec * current_percent; + + if progress > 0 { + let percent = current_percent * 100.0; + + // 計算剩餘時間 + let rem_sec = if percent >= 99.9 { + 0.0 + } else { + let est_total = elapsed_sec / current_percent; + (est_total - elapsed_sec).max(0.0) + }; + + let rem_min = rem_sec as u32 / 60; + let rem_s = rem_sec as u32 % 60; + + let elapsed_min = elapsed_sec as u32 / 60; + let elapsed_s = elapsed_sec as u32 % 60; + + let stderr = io::stderr(); + let mut handle = stderr.lock(); + + if IS_FIRST_PROGRESS.load(Ordering::Relaxed) { + let _ = writeln!(handle); + IS_FIRST_PROGRESS.store(false, Ordering::Relaxed); + } + + // 顯示格式:[已處理時長 / 總時長] 百分比 | 耗時 | 剩餘 + // 為了美觀,時長格式化為 MM:SS + let proc_min = processed_sec as u32 / 60; + let proc_s = processed_sec as u32 % 60; + let tot_min = total_sec as u32 / 60; + let tot_s = total_sec as u32 % 60; + + let _ = write!( + handle, + "\r🔄 識別進度: [{:02}:{:02}/{:02}:{:02}] {:.1}% | 耗時: {:02}:{:02} | 預計剩餘: {:02}:{:02} ", + proc_min, proc_s, tot_min, tot_s, percent, + elapsed_min, elapsed_s, + rem_min, rem_s + ); + + if percent >= 99.9 { + let _ = writeln!(handle); + } else { + let _ = handle.flush(); + } + } +} + +fn main() -> Result<()> { + let args = Args::parse(); + + if !Path::new(&args.input_file).exists() { + anyhow::bail!("錯誤:找不到輸入文件 '{}'", args.input_file); + } + if !Path::new(&args.model).exists() { + anyhow::bail!("錯誤:找不到模型文件 '{}'", args.model); + } + + ffmpeg::init().context("Failed to initialize FFmpeg")?; + + println!("🎬 正在處理文件:{}", args.input_file); + println!("🧠 載入模型:{}", args.model); + + let audio_data = extract_audio_to_f32(&args.input_file) + .context("Failed to extract and process audio")?; + + if audio_data.is_empty() { + anyhow::bail!("錯誤:未能從文件中提取有效音頻數據"); + } + + let total_samples = audio_data.len() as f64; + let total_duration_sec = total_samples / WHISPER_SAMPLE_RATE as f64; + + println!("✅ 
音頻準備完成 (樣本數:{}, 時長:{:.2} 分鐘)", total_samples as usize, total_duration_sec / 60.0); + + // --- 將總時長存入全局原子變量,供回調使用 --- + // 存儲為 整数 (秒 * 100) 以避免浮點數原子操作的複雜性 + TOTAL_DURATION_SEC_X100.store((total_duration_sec * 100.0) as u64, Ordering::Relaxed); + // ------------------------------------------- + + println!("⏳ 正在初始化 Whisper 模型..."); + let ctx = WhisperContext::new_with_params( + &args.model, + WhisperContextParameters::default() + ).context("Failed to load Whisper model")?; + + let mut state = ctx.create_state() + .context("Failed to create Whisper state")?; + + let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 }); + params.set_print_special(false); + params.set_print_progress(args.verbose); + params.set_print_realtime(false); + params.set_print_timestamps(false); + params.set_single_segment(false); + + if let Some(lang) = &args.language { + params.set_language(Some(lang.as_str())); + } else { + params.set_language(None); + } + + // 重置全局狀態 + START_TIME_NANOS.store(0, Ordering::Relaxed); + IS_FIRST_PROGRESS.store(true, Ordering::Relaxed); + + unsafe { + params.set_progress_callback(Some(progress_callback)); + } + + println!("🎙️ 正在進行語音識別 (這可能需要幾分鐘)..."); + state.full(params, &audio_data) + .context("Whisper inference failed")?; + + eprintln!("\n✅ 識別完成!"); + + println!("\n=== 逐字稿 (帶時間碼) ===\n"); + + let num_segments = state.full_n_segments() + .context("Failed to get segment count")?; + + if num_segments == 0 { + println!("未檢測到任何語音內容。"); + return Ok(()); + } + + for i in 0..num_segments { + let text = match state.full_get_segment_text(i) { + Ok(t) => t, + Err(e) => { + eprintln!("⚠️ 警告:串流 #{} 包含無效字符 (UTF-8 Error),已跳過。詳情:{}", i, e); + continue; + } + }; + + let start_ts = match state.full_get_segment_t0(i) { + Ok(t) => t as f64 / 100.0, + Err(_) => { + eprintln!("⚠️ 警告:無法獲取串流 #{} 的時間戳", i); + continue; + } + }; + + let time_str = format_time(start_ts); + let clean_text = text.trim(); + + if !clean_text.is_empty() { + println!("[{}] {}", time_str, 
clean_text); + } + } + + if args.language.is_none() { + if let Ok(lang_id) = state.full_lang_id_from_state() { + let lang_name = get_language_name(lang_id); + println!("\n🌍 自動檢測語言:{} (ID: {})", lang_name, lang_id); + } + } + + Ok(()) +} + +// ... (其餘函數 get_language_name, format_time, extract_audio_to_f32, append_frame_samples 保持不變) ... +fn get_language_name(lang_id: i32) -> &'static str { + match lang_id { + 0 => "English", 1 => "Chinese", 2 => "German", 3 => "Spanish", + 4 => "Russian", 5 => "Korean", 6 => "French", 7 => "Japanese", + 8 => "Portuguese", 9 => "Turkish", 10 => "Polish", 11 => "Catalan", + 12 => "Dutch", 13 => "Arabic", 14 => "Swedish", 15 => "Italian", + 16 => "Indonesian", 17 => "Hindi", 18 => "Finnish", 19 => "Vietnamese", + 20 => "Hebrew", 21 => "Ukrainian", 22 => "Greek", 23 => "Malay", + 24 => "Czech", 25 => "Romanian", 26 => "Danish", 27 => "Hungarian", + 28 => "Tamil", 29 => "Norwegian", 30 => "Thai", 31 => "Urdu", + 32 => "Croatian", 33 => "Bulgarian", 34 => "Lithuanian", 35 => "Latin", + 36 => "Maori", 37 => "Malayalam", 38 => "Welsh", 39 => "Slovak", + 40 => "Telugu", 41 => "Persian", 42 => "Latvian", 43 => "Bengali", + 44 => "Serbian", 45 => "Azerbaijani", 46 => "Slovenian", 47 => "Kannada", + 48 => "Estonian", 49 => "Macedonian", 50 => "Breton", 51 => "Basque", + 52 => "Icelandic", 53 => "Armenian", 54 => "Nepali", 55 => "Mongolian", + 56 => "Bosnian", 57 => "Kazakh", 58 => "Albanian", 59 => "Swahili", + 60 => "Galician", 61 => "Marathi", 62 => "Punjabi", 63 => "Sinhala", + 64 => "Khmer", 65 => "Shona", 66 => "Yoruba", 67 => "Somali", + 68 => "Afrikaans", 69 => "Occitan", 70 => "Georgian", 71 => "Belarusian", + 72 => "Tajik", 73 => "Sindhi", 74 => "Gujarati", 75 => "Amharic", + 76 => "Yiddish", 77 => "Lao", 78 => "Uzbek", 79 => "Faroese", + 80 => "Haitian Creole", 81 => "Pashto", 82 => "Turkmen", 83 => "Nynorsk", + 84 => "Maltese", 85 => "Sanskrit", 86 => "Luxembourgish", 87 => "Myanmar", + 88 => "Tibetan", 89 => "Tagalog", 90 => 
"Malagasy", 91 => "Assamese", + 92 => "Tatar", 93 => "Hawaiian", 94 => "Lingala", 95 => "Hausa", + 96 => "Bashkir", 97 => "Javanese", 98 => "Sundanese", 99 => "Cantonese", + _ => "Unknown", + } +} + +fn format_time(seconds: f64) -> String { + let total_secs = seconds as u64; + let millis = ((seconds - total_secs as f64) * 1000.0).round() as u32; + let (millis, total_secs) = if millis >= 1000 { + (millis - 1000, total_secs + 1) + } else { + (millis, total_secs) + }; + let h = total_secs / 3600; + let m = (total_secs % 3600) / 60; + let s = total_secs % 60; + format!("{:02}:{:02}:{:02}.{:03}", h, m, s, millis) +} + +fn extract_audio_to_f32(input_path: &str) -> Result> { + let mut ictx = input(&input_path)?; + let stream_index = ictx.streams().best(Type::Audio) + .ok_or_else(|| anyhow::anyhow!("未找到音頻串流"))? + .index(); + let stream = ictx.stream(stream_index).expect("Stream should exist"); + let codec_params = stream.parameters(); + let codec_id = codec_params.id(); + let codec_decoder = ffmpeg::codec::decoder::find(codec_id) + .ok_or_else(|| anyhow::anyhow!("未找到對應的解碼器"))?; + let mut context = CodecContext::new_with_codec(codec_decoder); + context.set_parameters(codec_params)?; + let mut decoder = context.decoder().audio()?; + let out_format = ffmpeg::format::Sample::F32(ffmpeg::format::sample::Type::Packed); + let out_channel_layout = ffmpeg::channel_layout::ChannelLayout::MONO; + let out_sample_rate = WHISPER_SAMPLE_RATE; + let mut resampler: Option = None; + let mut decoded_samples = Vec::new(); + let mut error_count = 0; + for (stream, packet) in ictx.packets() { + if stream.index() != stream_index { continue; } + if let Err(e) = decoder.send_packet(&packet) { eprintln!("⚠️ 發送包失敗:{}", e); continue; } + let mut decoded_frame = AudioFrame::empty(); + while decoder.receive_frame(&mut decoded_frame).is_ok() { + let in_format = decoded_frame.format(); + let mut in_channel_layout = decoded_frame.channel_layout(); + let in_sample_rate = decoded_frame.rate(); + let 
channels = decoded_frame.channels(); + if channels == 0 || in_channel_layout.is_empty() { + let safe_layout = if channels == 1 { ffmpeg::channel_layout::ChannelLayout::MONO } else if channels > 1 { ffmpeg::channel_layout::ChannelLayout::STEREO } else { ffmpeg::channel_layout::ChannelLayout::STEREO }; + in_channel_layout = safe_layout; + } + let mut resampled_frame = AudioFrame::empty(); + if resampler.is_none() { + eprintln!("ℹ️ 初始化重採樣器..."); + match ffmpeg::software::resampling::Context::get(in_format, in_channel_layout, in_sample_rate, out_format, out_channel_layout, out_sample_rate) { + Ok(new_resampler) => { + resampler = Some(new_resampler); + if let Some(r) = resampler.as_mut() { + if let Ok(_) = r.run(&decoded_frame, &mut resampled_frame) { + if resampled_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &resampled_frame); } + } + } + }, + Err(init_err) => { eprintln!("❌ 無法初始化重採樣器:{}. 跳過此幀。", init_err); error_count += 1; continue; } + } + } else { + let run_result = resampler.as_mut().unwrap().run(&decoded_frame, &mut resampled_frame); + match run_result { + Ok(delay_opt) => { + if resampled_frame.samples() > 0 { if let Err(e) = append_frame_samples(&mut decoded_samples, &resampled_frame) { eprintln!("⚠️ 追加樣本失敗:{}", e); } } + if let Some(_delay) = delay_opt { + if let Some(r) = resampler.as_mut() { + let mut flush_frame = AudioFrame::empty(); + while let Ok(Some(_)) = r.flush(&mut flush_frame) { if flush_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &flush_frame); } } + } + } + }, + Err(e) => { + let err_msg = format!("{}", e); + if err_msg.contains("Output changed") || err_msg.contains("Invalid") { + eprintln!("\n⚠️ 檢測到音頻參數變化 ('{}'),重置重採樣器...", err_msg); + drop(resampler.take()); error_count += 1; + match ffmpeg::software::resampling::Context::get(in_format, in_channel_layout, in_sample_rate, out_format, out_channel_layout, out_sample_rate) { + Ok(new_resampler) => { + resampler = Some(new_resampler); + let 
mut retry_frame = AudioFrame::empty(); + if let Some(r) = resampler.as_mut() { + if let Ok(_) = r.run(&decoded_frame, &mut retry_frame) { if retry_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &retry_frame); } } + } + }, + Err(init_err) => { eprintln!("❌ 重置重採樣器失敗:{}. 跳過此幀。", init_err); } + } + } else { eprintln!("❌ 嚴重錯誤:{}. 停止處理。", e); return Err(e).context("Audio resampling failed unrecoverably"); } + } + } + } + } + } + if error_count > 0 { eprintln!("⚠️ 總共跳過或重置了 {} 次音頻處理。", error_count); } + decoder.send_eof().ok(); + let mut decoded_frame = AudioFrame::empty(); + while decoder.receive_frame(&mut decoded_frame).is_ok() { + if let Some(r) = resampler.as_mut() { + let mut resampled_frame = AudioFrame::empty(); + if let Ok(_) = r.run(&decoded_frame, &mut resampled_frame) { if resampled_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &resampled_frame); } } + } + } + if let Some(r) = resampler.as_mut() { + let mut flush_frame = AudioFrame::empty(); + while let Ok(Some(_)) = r.flush(&mut flush_frame) { if flush_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &flush_frame); } } + } + Ok(decoded_samples) +} + +fn append_frame_samples(buffer: &mut Vec, frame: &AudioFrame) -> Result<()> { + if frame.format() != ffmpeg::format::Sample::F32(ffmpeg::format::sample::Type::Packed) { return Err(anyhow::anyhow!("Unexpected audio sample format")); } + let data = frame.data(0); + let len = frame.samples(); + let byte_len = len * 4; + if data.len() < byte_len { return Err(anyhow::anyhow!("Audio frame data size mismatch")); } + let slice = &data[0..byte_len]; + let ptr = slice.as_ptr() as *const f32; + let f32_slice = unsafe { std::slice::from_raw_parts(ptr, len) }; + buffer.extend_from_slice(f32_slice); + Ok(()) +}