From d6af9546ea9bc476970ef40c3ea780775a436c2b Mon Sep 17 00:00:00 2001 From: warren Date: Wed, 11 Mar 2026 18:17:55 +0800 Subject: [PATCH] feat: Add per-segment language detection and dual output files - Add whatlang for real-time language detection per segment - Generate .asr.json (basic) and .asrx.json (with language labels) - Add auto-save progress with configurable interval - Add resume functionality for interrupted transcriptions - Add Music/Empty detection and statistics - Update progress display with unified format - Add comprehensive Chinese README with usage documentation --- Cargo.lock | 123 +++++++ Cargo.toml | 4 + README.md | 161 ++++++++-- src/main.rs | 909 +++++++++++++++++++++++++++++++++++++++++++--------- 4 files changed, 1022 insertions(+), 175 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index def15a0..08427d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,18 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -11,6 +23,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "anstream" version = "0.6.21" @@ -270,6 +288,16 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + 
"allocator-api2", +] + [[package]] name = "heck" version = "0.5.0" @@ -315,6 +343,12 @@ dependencies = [ "either", ] +[[package]] +name = "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + [[package]] name = "lazy_static" version = "1.5.0" @@ -525,7 +559,11 @@ dependencies = [ "anyhow", "clap", "ffmpeg-next", + "lazy_static", "ndarray", + "serde", + "serde_json", + "whatlang", "whisper-rs", "whisper-rs-sys", ] @@ -555,6 +593,49 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + [[package]] name = "shlex" version = "1.3.0" @@ -596,6 +677,22 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "whatlang" +version = "0.16.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0" +dependencies = [ + "hashbrown", + "once_cell", +] + [[package]] name = "which" version = "4.4.2" @@ -716,3 +813,29 @@ name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zerocopy" +version = "0.8.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index e97b46e..7251342 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,10 @@ anyhow = "1.0" # 可選:如果 whisper-rs 需要額外的數學運算支持 ndarray = "0.15" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +lazy_static = "1.4" +whatlang = "0.16" diff --git a/README.md b/README.md index 39bb9d1..a7d465c 100644 --- a/README.md +++ b/README.md @@ -4,21 +4,25 @@ ## 功能特性 -- **高效音頻提取**:利用 FFmpeg 直接從影片或音頻檔案中提取並重採樣為 Whisper 所需的格式 (16kHz, Mono, f32)。 -- **精準轉寫**:基於 OpenAI Whisper 模型,提供高精度的語音轉文字功能。 -- **時間碼支援**:自動產生精確到毫秒的逐字稿時間戳。 -- **語言自動檢測**:若未指定語言,可自動偵測輸入音頻的語言。 -- **優化進度顯示**:在轉寫過程中提供即時的處理進度、耗時與預估剩餘時間。 +- **高效音頻提取**:利用 FFmpeg 直接從影片或音頻檔案中提取並重採樣為 Whisper 所需的格式 (16kHz, Mono, 
f32) +- **精準轉寫**:基於 OpenAI Whisper 模型,提供高精度的語音轉文字功能 +- **時間碼支援**:自動產生精確到毫秒的逐字稿時間戳,格式為 `HH:MM:SS.mmm(fXX)` (25fps) +- **語言自動檢測**:若未指定語言,可自動偵測輸入音頻的語言 +- **優化進度顯示**:在轉寫過程中提供即時的處理進度、耗時與預估剩餘時間 +- **自動存檔**:支援定期自動存檔,避免長時間轉寫因意外中斷而遺失進度 +- **中斷續處理**:支援從上次中斷點繼續處理 +- **語言分段檢測**:自動檢測每段文字的語言類型,生成帶語言標籤的逐字稿 +- **特殊內容識別**:自動識別 Music (音樂) 和 Empty (空文字) 分段 ## 前置需求 在編譯與執行前,請確保系統已安裝以下環境: -1. **Rust 環境**:請安裝 [Rustup](https://rustup.rs/)。 -2. **FFmpeg**:系統必須安裝 FFmpeg 開發庫。 +1. **Rust 環境**:請安裝 [Rustup](https://rustup.rs/) +2. **FFmpeg**:系統必須安裝 FFmpeg 開發庫 - macOS: `brew install ffmpeg` - Ubuntu/Debian: `sudo apt install libavcodec-dev libavformat-dev libavutil-dev libswresample-dev` -3. **Whisper 模型**:請準備 Whisper 的 `.bin` 模型檔案 (例如 `ggml-base.bin`),並放置於 `models/` 目錄中。 +3. **Whisper 模型**:請準備 Whisper 的 `.bin` 模型檔案,並放置於 `models/` 目錄中 ## 編譯與安裝 @@ -33,7 +37,7 @@ cargo build --release ## 使用方式 -執行程式時,需指定輸入檔案路徑及模型路徑: +### 基本使用 ```bash ./target/release/rust-scribe --model models/ @@ -41,33 +45,148 @@ cargo build --release ### 參數說明 -- ``: 欲轉寫的影片或音頻檔案路徑 (位置參數)。 -- `-m, --model `: Whisper 模型檔案路徑 (必填)。 -- `-l, --language `: 指定轉寫語言 (例如 `zh`, `en`)。若不指定,系統將自動偵測 (選填)。 -- `-v, --verbose`: 開啟詳細轉寫進度輸出 (選填)。 +| 參數 | 說明 | 必填 | +|------|------|------| +| `` | 欲轉寫的影片或音頻檔案路徑 | ✅ | +| `-m, --model ` | Whisper 模型檔案路徑 | ✅ | +| `-l, --language ` | 指定轉寫語言 (例如 `zh`, `en`)。若不指定將自動偵測 | ❌ | +| `-v, --verbose` | 開啟詳細轉寫進度輸出 | ❌ | +| `--save-interval ` | 每 N 個分段自動存檔一次 (預設: 100) | ❌ | +| `--max-duration <秒>` | 限制處理的最大時長(用於測試) | ❌ | -**使用範例**: +### 使用範例 ```bash -# 指定使用 base 模型,自動檢測語言 -cargo run --release -- video.mp4 --model models/ggml-base.bin +# 基本使用,自動檢測語言,每 10 個分段存檔 +cargo run --release -- video.mp4 --model models/ggml-base.bin --save-interval 10 -# 指定使用中文進行轉寫 +# 指定中文轉寫 cargo run --release -- lecture.mkv --model models/ggml-base.bin --language zh + +# 限制處理前 60 秒(用於測試) +cargo run --release -- video.mp4 --model models/ggml-base.bin --max-duration 60 ``` +## 輸出檔案 + +程式會生成兩個輸出檔案: + +### 1. 
`.asr.json` - 基本逐字稿 +包含基本的轉寫結果: +```json +{ + "input_file": "video.mp4", + "language": "English", + "segments": [ + { + "start": "00:00:05.120(f03)", + "end": "00:00:08.560(f14)", + "text": "Hello everyone", + "language": "English" + } + ] +} +``` + +### 2. `.asrx.json` - 帶語言檢測的逐字稿 +包含每段文字的語言檢測結果: +```json +{ + "input_file": "video.mp4", + "segments": [ + { + "start": "00:00:05.120(f03)", + "end": "00:00:08.560(f14)", + "text": "Hello everyone", + "language": "English", + "confidence": 0.98 + }, + { + "start": "00:00:10.000(f00)", + "end": "00:00:13.000(f00)", + "text": "[Music]", + "language": "Music", + "confidence": 1.0 + } + ] +} +``` + +## 進度顯示說明 + +轉寫過程中會顯示即時進度: + +``` +═══════════════════════════════════════════════════════════ +🎙️ Transcribing (this may take a few minutes)... +💡 Auto-saving every 10 segments +📋 Legend: + P = Processed (已處理分段數) + A = ASR saved (.asr.json 已存檔數) + X = Lang detect saved (.asrx.json 已存檔數) +═══════════════════════════════════════════════════════════ + +🔄 Progress: [71:05/114:39] 62.0% | Elapsed: 03:04 | Remaining: 01:54 | P:1112 A:1110 X:1110 | English:500 Unknown:52 Music:58 +``` + +### 進度列格式說明 + +- **時間進度**:`[已處理時間/總時長] 完成百分比` +- **時間統計**:`Elapsed: 已耗時 | Remaining: 預估剩餘時間` +- **分段統計**:`P:已處理數 A:ASR存檔數 X:語言檢測存檔數` +- **語言統計**:各語言類型的分段數量(依序顯示) + +### 特殊類型標籤 + +- **Music**:音樂或背景音效段落 +- **Empty**:空文字或純空白段落 +- **Unknown**:語言識別置信度低於 0.5 的段落 + +## 中斷續處理 + +程式支援中斷後繼續處理: + +1. 當程式偵測到已存在的輸出檔案時,會詢問是否繼續 +2. 選擇 `[C] Continue` 會從上次中斷點繼續 +3. 
選擇 `[R] Restart` 會刪除舊檔案重新開始 + +兩個檔案(`.asr.json` 和 `.asrx.json`)會同步處理。 + +## 語言檢測說明 + +使用 [whatlang](https://github.com/greyblake/whatlang-rs) 函式庫進行語言檢測: + +- **置信度 ≥ 0.5**:顯示檢測到的語言名稱 +- **置信度 < 0.5**:標註為 "Unknown" +- **Music**:自動識別包含 [Music] 或 (music) 標記的段落 +- **Empty**:空文字或純空白段落 + ## 專案結構 ``` rust-scribe/ ├── src/ -│ └── main.rs # 核心邏輯 (音頻處理、Whisper 轉寫、CLI 介面) -├── models/ # 存放 Whisper 模型檔案 -├── Cargo.toml # 專案依賴與配置 +│ └── main.rs # 核心邏輯 +├── models/ # Whisper 模型檔案 +├── Cargo.toml # 專案配置 +├── Cargo.lock # 依賴鎖定 +├── README.md # 使用文檔 +├── AGENTS.md # 開發者規範 └── .cargo/ - └── config.toml # Cargo 編譯配置 + └── config.toml # Cargo 配置 ``` ## 開發者規範 請參閱 `AGENTS.md` 了解詳細的代碼風格、編譯與測試指南。 + +## 許可證 + +MIT License + +## 致謝 + +- [Whisper.cpp](https://github.com/ggerganov/whisper.cpp) - OpenAI Whisper 的 C++ 實現 +- [FFmpeg](https://ffmpeg.org/) - 音頻處理函式庫 +- [whisper-rs](https://github.com/tazz4843/whisper-rs) - Whisper.cpp 的 Rust 綁定 +- [whatlang](https://github.com/greyblake/whatlang-rs) - 自然語言檢測函式庫 diff --git a/src/main.rs b/src/main.rs index 1a00a27..5cca638 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,18 +1,19 @@ use anyhow::{Context, Result}; use clap::Parser; use ffmpeg_next as ffmpeg; -use ffmpeg::format::input; -use ffmpeg::media::Type; -use ffmpeg::codec::context::Context as CodecContext; -use ffmpeg::frame::Audio as AudioFrame; -use std::path::Path; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::time::{SystemTime, UNIX_EPOCH}; -use std::io::{self, Write}; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; use std::ffi::c_void; +use std::io::{self, Write}; +use std::path::PathBuf; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::{SystemTime, UNIX_EPOCH}; // 導入 Whisper 相關類型 -use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters}; +use whisper_rs::{ + FullParams, SamplingStrategy, SegmentCallbackData, WhisperContext, WhisperContextParameters, +}; 
use whisper_rs_sys::{whisper_context, whisper_state}; const WHISPER_SAMPLE_RATE: u32 = 16000; @@ -21,12 +22,27 @@ const WHISPER_SAMPLE_RATE: u32 = 16000; static START_TIME_NANOS: AtomicU64 = AtomicU64::new(0); static IS_FIRST_PROGRESS: AtomicBool = AtomicBool::new(true); static TOTAL_DURATION_SEC_X100: AtomicU64 = AtomicU64::new(0); +static SEGMENTS_PROCESSED: AtomicU64 = AtomicU64::new(0); +static SEGMENTS_SAVED: AtomicU64 = AtomicU64::new(0); +static SEGMENTS_X_SAVED: AtomicU64 = AtomicU64::new(0); +static MUSIC_COUNT: AtomicU64 = AtomicU64::new(0); +static EMPTY_COUNT: AtomicU64 = AtomicU64::new(0); // 用於優化預估的全局原子變量 -// 存儲上一次的預估剩餘秒數 (x100),用於平滑處理 static LAST_REMAINING_SEC_X100: AtomicU64 = AtomicU64::new(0); -// 存儲上次更新時的進度,防止同一進度重複計算 static LAST_PROGRESS: AtomicU64 = AtomicU64::new(0); + +// 用於分段回調的全局狀態 +lazy_static::lazy_static! { + static ref SEGMENTS_BUFFER: Arc>> = Arc::new(Mutex::new(Vec::new())); + static ref SEGMENTS_X_BUFFER: Arc>> = Arc::new(Mutex::new(Vec::new())); + static ref OUTPUT_PATH: Arc>> = Arc::new(Mutex::new(None)); + static ref OUTPUT_X_PATH: Arc>> = Arc::new(Mutex::new(None)); + static ref INPUT_FILE: Arc>> = Arc::new(Mutex::new(None)); + static ref SAVE_INTERVAL: AtomicU64 = AtomicU64::new(10); + static ref DETECTED_LANGUAGE: Arc> = Arc::new(Mutex::new("auto".to_string())); + static ref LANG_STATS: Arc>> = Arc::new(Mutex::new(HashMap::new())); +} // ---------------- #[derive(Parser, Debug)] @@ -40,6 +56,40 @@ struct Args { language: Option, #[arg(short, long, default_value_t = false)] verbose: bool, + #[arg(long, default_value_t = 0.0)] + max_duration: f64, + #[arg(long, default_value_t = 100)] + save_interval: u32, +} + +#[derive(Serialize, Deserialize, Clone)] +struct Segment { + start: String, + end: String, + text: String, + language: String, +} + +#[derive(Serialize, Deserialize, Clone)] +struct SegmentX { + start: String, + end: String, + text: String, + language: String, + confidence: f64, +} + +#[derive(Serialize, Deserialize, 
Clone)] +struct Transcript { + input_file: String, + language: String, + segments: Vec, +} + +#[derive(Serialize, Deserialize, Clone)] +struct TranscriptX { + input_file: String, + segments: Vec, } /// C 語言風格的回調函數 @@ -57,15 +107,19 @@ unsafe extern "C" fn progress_callback( // 初始化開始時間 let start_nanos = START_TIME_NANOS.load(Ordering::Relaxed); if start_nanos == 0 { - if let Err(existing) = START_TIME_NANOS.compare_exchange( - 0, now_nanos, Ordering::Relaxed, Ordering::Relaxed - ) { - if existing == 0 { START_TIME_NANOS.store(now_nanos, Ordering::Relaxed); } + if let Err(existing) = + START_TIME_NANOS.compare_exchange(0, now_nanos, Ordering::Relaxed, Ordering::Relaxed) + { + if existing == 0 { + START_TIME_NANOS.store(now_nanos, Ordering::Relaxed); + } } } - + let actual_start_nanos = START_TIME_NANOS.load(Ordering::Relaxed); - if actual_start_nanos == 0 { return; } + if actual_start_nanos == 0 { + return; + } let elapsed_nanos = now_nanos.saturating_sub(actual_start_nanos); let elapsed_sec = elapsed_nanos as f64 / 1_000_000_000.0; @@ -78,7 +132,7 @@ unsafe extern "C" fn progress_callback( }; let current_percent = (progress as f64).min(100.0) / 100.0; - + // 【優化點 1】忽略過早的進度 (< 5%),此時數據極不穩定 if current_percent < 0.05 { return; @@ -95,23 +149,23 @@ unsafe extern "C" fn progress_callback( // 【優化點 2】讀取上一次的預估值進行平滑 let last_rem_x100 = LAST_REMAINING_SEC_X100.load(Ordering::Relaxed); let last_rem_sec = last_rem_x100 as f64 / 100.0; - + // 如果這是第一次有效計算,或者進度變化很小,直接使用原始值 let last_prog = LAST_PROGRESS.load(Ordering::Relaxed); - + let mut final_remaining_sec = raw_remaining_sec; if last_prog > 0 && (progress as u64) > last_prog { // 【優化點 3】滑動平均:新預估 = 舊預估 * 0.7 + 新計算 * 0.3 // 這樣可以防止數字劇烈跳變 final_remaining_sec = last_rem_sec * 0.6 + raw_remaining_sec * 0.4; - + // 【優化點 4】限制最大增長幅度:如果新預估比舊預估大很多,說明前面卡住了, // 不要讓剩餘時間無限增加,設置一個上限(例如最多增加 30 秒) if final_remaining_sec > last_rem_sec + 30.0 { final_remaining_sec = last_rem_sec + 30.0; } - + // 確保不為負數 if final_remaining_sec < 0.0 { 
final_remaining_sec = 0.0; @@ -145,21 +199,82 @@ unsafe extern "C" fn progress_callback( IS_FIRST_PROGRESS.store(false, Ordering::Relaxed); } - // 顯示邏輯:如果進度太低,顯示"計算中" + // Show "calculating..." if progress is too low let remaining_str = if current_percent < 0.10 { - "計算中...".to_string() + "calculating...".to_string() } else { format!("{:02}:{:02}", rem_min, rem_s) }; + let segments_processed = SEGMENTS_PROCESSED.load(Ordering::Relaxed); + let segments_saved = SEGMENTS_SAVED.load(Ordering::Relaxed); + let segments_x_saved = SEGMENTS_X_SAVED.load(Ordering::Relaxed); + + // Get language statistics + let _lang_stats_str = { + let stats = LANG_STATS.lock().unwrap(); + if stats.is_empty() { + String::new() + } else { + let mut parts: Vec = stats + .iter() + .map(|(lang, count)| format!("{}:{}", lang, count)) + .collect(); + parts.sort(); + format!(" | {}", parts.join(" ")) + } + }; + + let music_count = MUSIC_COUNT.load(Ordering::Relaxed); + let empty_count = EMPTY_COUNT.load(Ordering::Relaxed); + + let stats_parts = { + let stats = LANG_STATS.lock().unwrap(); + let mut parts: Vec = stats + .iter() + .filter(|(lang, _)| *lang != "Music" && *lang != "Empty") + .map(|(lang, count)| format!("{}:{}", lang, count)) + .collect(); + parts.sort(); + if music_count > 0 { + parts.push(format!("Music:{}", music_count)); + } + if empty_count > 0 { + parts.push(format!("Empty:{}", empty_count)); + } + parts.join(" ") + }; + + let segment_info = if segments_processed > 0 { + format!( + " | P:{} A:{} X:{}{}", + segments_processed, + segments_saved, + segments_x_saved, + if !stats_parts.is_empty() { + format!(" | {}", stats_parts) + } else { + "".to_string() + } + ) + } else { + String::new() + }; + let _ = write!( - handle, - "\r🔄 識別進度: [{:02}:{:02}/{:02}:{:02}] {:.1}% | 耗時: {:02}:{:02} | 預計剩餘: {} ", - proc_min, proc_s, tot_min, tot_s, percent_display, - elapsed_min, elapsed_s, - remaining_str + handle, + "\r🔄 Progress: [{:02}:{:02}/{:02}:{:02}] {:.1}% | Elapsed: {:02}:{:02} | 
Remaining: {} {} ", + proc_min, + proc_s, + tot_min, + tot_s, + percent_display, + elapsed_min, + elapsed_s, + remaining_str, + segment_info ); - + if percent_display >= 99.9 { let _ = writeln!(handle); } else { @@ -170,44 +285,195 @@ unsafe extern "C" fn progress_callback( fn main() -> Result<()> { let args = Args::parse(); - if !Path::new(&args.input_file).exists() { - anyhow::bail!("錯誤:找不到輸入文件 '{}'", args.input_file); - } - if !Path::new(&args.model).exists() { - anyhow::bail!("錯誤:找不到模型文件 '{}'", args.model); + let input_path = PathBuf::from(&args.input_file) + .canonicalize() + .context("Invalid input file path")?; + let model_path = PathBuf::from(&args.model) + .canonicalize() + .context("Invalid model file path")?; + let output_path = input_path.with_extension(format!( + "{}.asr.json", + input_path.extension().unwrap().to_str().unwrap() + )); + let output_x_path = input_path.with_extension(format!( + "{}.asrx.json", + input_path.extension().unwrap().to_str().unwrap() + )); + + println!("📂 Input: {:?}", input_path); + println!("🧠 Model: {:?}", model_path); + println!("📄 Output: {:?}", output_path); + println!("📄 Language Detection: {:?}", output_x_path); + + let mut resume_segments: Vec = Vec::new(); + let mut resume_segments_x: Vec = Vec::new(); + let mut skip_samples: usize = 0; + + if output_path.exists() { + println!("⚠️ Output file exists, loading..."); + let content = std::fs::read_to_string(&output_path)?; + if let Ok(transcript) = serde_json::from_str::(&content) { + let seg_count = transcript.segments.len(); + if seg_count > 0 { + println!("📊 Found {} segments in .asr.json", seg_count); + if let Some(last) = transcript.segments.last() { + println!("📍 Last segment end time: {}", last.end); + if let Some(end_sec) = parse_time_to_seconds(&last.end) { + skip_samples = (end_sec * WHISPER_SAMPLE_RATE as f64) as usize; + println!("⏭️ Will resume from {} seconds", end_sec); + } + } + resume_segments = transcript.segments; + + // Also load .asrx.json if exists 
+ let mut seg_x_count = 0; + if output_x_path.exists() { + if let Ok(content_x) = std::fs::read_to_string(&output_x_path) { + if let Ok(transcript_x) = serde_json::from_str::(&content_x) { + seg_x_count = transcript_x.segments.len(); + resume_segments_x = transcript_x.segments; + println!("📄 Found {} segments in .asrx.json", seg_x_count); + } + } + } + + println!("\n🆘 Continue processing?"); + println!( + " [C] Continue from {}s (.asr.json: {}, .asrx.json: {})", + skip_samples as f64 / WHISPER_SAMPLE_RATE as f64, + seg_count, + seg_x_count + ); + println!(" [R] Restart (delete both files)"); + print!("Enter choice [C/R]: "); + io::stdout().flush()?; + let mut choice = String::new(); + io::stdin().read_line(&mut choice)?; + let choice = choice.trim().to_uppercase(); + if choice != "C" && choice != "CONTINUE" { + resume_segments.clear(); + resume_segments_x.clear(); + skip_samples = 0; + // Delete existing files and restart + if output_path.exists() { + std::fs::remove_file(&output_path)?; + println!("🗑️ Deleted old transcript file"); + } + if output_x_path.exists() { + std::fs::remove_file(&output_x_path)?; + println!("🗑️ Deleted old language detection file"); + } + println!("🔄 Restarting..."); + } + } + } + } else { + let empty_transcript = Transcript { + input_file: args.input_file.clone(), + language: args.language.clone().unwrap_or_else(|| "auto".to_string()), + segments: Vec::new(), + }; + let json = serde_json::to_string_pretty(&empty_transcript)?; + std::fs::write(&output_path, json)?; + println!("✅ Created empty output file"); } ffmpeg::init().context("Failed to initialize FFmpeg")?; - println!("🎬 正在處理文件:{}", args.input_file); - println!("🧠 載入模型:{}", args.model); + println!("🎬 Processing file..."); + println!("🧠 Loading model..."); - let audio_data = extract_audio_to_f32(&args.input_file) + let mut audio_data = extract_audio_to_f32(&input_path.to_string_lossy()) .context("Failed to extract and process audio")?; - + if audio_data.is_empty() { - 
anyhow::bail!("錯誤:未能從文件中提取有效音頻數據"); + anyhow::bail!("Error: Failed to extract valid audio data from file"); } - + + if skip_samples > 0 && skip_samples < audio_data.len() { + let skipped_sec = skip_samples as f64 / WHISPER_SAMPLE_RATE as f64; + println!("⏭️ Skipping first {} seconds of audio...", skipped_sec); + audio_data = audio_data[skip_samples..].to_vec(); + } + let total_samples = audio_data.len() as f64; let total_duration_sec = total_samples / WHISPER_SAMPLE_RATE as f64; - - println!("✅ 音頻準備完成 (樣本數:{}, 時長:{:.2} 分鐘)", total_samples as usize, total_duration_sec / 60.0); - // 重置全局狀態 - TOTAL_DURATION_SEC_X100.store((total_duration_sec * 100.0) as u64, Ordering::Relaxed); + let actual_duration_sec = if args.max_duration > 0.0 { + let max_samples = (args.max_duration * WHISPER_SAMPLE_RATE as f64) as usize; + let actual = max_samples.min(audio_data.len()); + audio_data.truncate(actual); + actual as f64 / WHISPER_SAMPLE_RATE as f64 + } else { + total_duration_sec + }; + + println!( + "✅ Audio ready (samples: {}, duration: {:.2} min){}", + audio_data.len(), + actual_duration_sec / 60.0, + if args.max_duration > 0.0 { + format!(" [limit: {:.1}s]", args.max_duration) + } else { + String::new() + } + ); + + // Reset global state + TOTAL_DURATION_SEC_X100.store((actual_duration_sec * 100.0) as u64, Ordering::Relaxed); START_TIME_NANOS.store(0, Ordering::Relaxed); IS_FIRST_PROGRESS.store(true, Ordering::Relaxed); LAST_REMAINING_SEC_X100.store(0, Ordering::Relaxed); LAST_PROGRESS.store(0, Ordering::Relaxed); + SEGMENTS_PROCESSED.store(resume_segments.len() as u64, Ordering::Relaxed); + SEGMENTS_SAVED.store(resume_segments.len() as u64, Ordering::Relaxed); + SEGMENTS_X_SAVED.store(resume_segments_x.len() as u64, Ordering::Relaxed); + MUSIC_COUNT.store(0, Ordering::Relaxed); + EMPTY_COUNT.store(0, Ordering::Relaxed); + SAVE_INTERVAL.store(args.save_interval as u64, Ordering::Relaxed); - println!("⏳ 正在初始化 Whisper 模型..."); + // Setup global variables for callbacks + { + let 
mut segments_lock = SEGMENTS_BUFFER.lock().unwrap(); + *segments_lock = resume_segments.clone(); + + // Setup segments_x buffer and rebuild language stats + let mut segments_x_lock = SEGMENTS_X_BUFFER.lock().unwrap(); + *segments_x_lock = resume_segments_x.clone(); + drop(segments_x_lock); + + // Rebuild language statistics from resume data + { + let mut stats = LANG_STATS.lock().unwrap(); + stats.clear(); + for seg in &resume_segments_x { + *stats.entry(seg.language.clone()).or_insert(0) += 1; + } + } + + let mut path_lock = OUTPUT_PATH.lock().unwrap(); + *path_lock = Some(output_path.clone()); + let mut path_x_lock = OUTPUT_X_PATH.lock().unwrap(); + *path_x_lock = Some(output_x_path.clone()); + let mut input_lock = INPUT_FILE.lock().unwrap(); + *input_lock = Some(args.input_file.clone()); + + // Use user-specified language from the start + if let Some(ref lang) = args.language { + let mut lang_lock = DETECTED_LANGUAGE.lock().unwrap(); + *lang_lock = lang.clone(); + } + } + + println!("⏳ Initializing Whisper model..."); let ctx = WhisperContext::new_with_params( - &args.model, - WhisperContextParameters::default() - ).context("Failed to load Whisper model")?; - - let mut state = ctx.create_state() + &model_path.to_string_lossy(), + WhisperContextParameters::default(), + ) + .context("Failed to load Whisper model")?; + + let mut state = ctx + .create_state() .context("Failed to create Whisper state")?; let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 }); @@ -216,7 +482,7 @@ fn main() -> Result<()> { params.set_print_realtime(false); params.set_print_timestamps(false); params.set_single_segment(false); - + if let Some(lang) = &args.language { params.set_language(Some(lang.as_str())); } else { @@ -227,84 +493,315 @@ fn main() -> Result<()> { params.set_progress_callback(Some(progress_callback)); } - println!("🎙️ 正在進行語音識別 (這可能需要幾分鐘)..."); - state.full(params, &audio_data) - .context("Whisper inference failed")?; - - eprintln!("\n✅ 識別完成!"); + // 
設置新分段回調 - 在識別過程中實時收集和儲存分段 + let resume_offset_sec = skip_samples as f64 / WHISPER_SAMPLE_RATE as f64; + params.set_segment_callback_safe(move |segment: SegmentCallbackData| { + let start_ts = segment.start_timestamp as f64 / 100.0 + resume_offset_sec; + let end_ts = segment.end_timestamp as f64 / 100.0 + resume_offset_sec; + let start_time_str = format_time(start_ts); + let end_time_str = format_time(end_ts); + let clean_text = segment.text.trim(); - println!("\n=== 逐字稿 (帶時間碼) ===\n"); - - let num_segments = state.full_n_segments() - .context("Failed to get segment count")?; - - if num_segments == 0 { - println!("未檢測到任何語音內容。"); - return Ok(()); - } - - for i in 0..num_segments { - let text = match state.full_get_segment_text(i) { - Ok(t) => t, - Err(e) => { - eprintln!("⚠️ 警告:串流 #{} 包含無效字符 (UTF-8 Error),已跳過。詳情:{}", i, e); - continue; - } - }; - - let start_ts = match state.full_get_segment_t0(i) { - Ok(t) => t as f64 / 100.0, - Err(_) => { - eprintln!("⚠️ 警告:無法獲取串流 #{} 的時間戳", i); - continue; - } - }; - - let time_str = format_time(start_ts); - let clean_text = text.trim(); - if !clean_text.is_empty() { - println!("[{}] {}", time_str, clean_text); + // 獲取整體語言設定 + let lang = DETECTED_LANGUAGE.lock().unwrap().clone(); + + // 創建基本分段 (for .asr.json) + let new_segment = Segment { + start: start_time_str.clone(), + end: end_time_str.clone(), + text: clean_text.to_string(), + language: lang.clone(), + }; + + // 檢測是否為 Music 或空文字 + let is_music = clean_text.eq_ignore_ascii_case("[music]") + || clean_text.eq_ignore_ascii_case("(music)") + || clean_text.contains("[Music]"); + let is_empty = clean_text.is_empty() || clean_text.chars().all(|c| c.is_whitespace()); + + if is_music { + MUSIC_COUNT.fetch_add(1, Ordering::Relaxed); + } else if is_empty { + EMPTY_COUNT.fetch_add(1, Ordering::Relaxed); + } + + // 用 whatlang 檢測每段文字的語言 + let (detected_lang, confidence) = if is_music { + ("Music".to_string(), 1.0) + } else if is_empty { + ("Empty".to_string(), 0.0) + } else if let 
Some(info) = whatlang::detect(clean_text) { + let conf = info.confidence() as f64; + if conf < 0.5 { + ("Unknown".to_string(), conf) + } else { + (info.lang().eng_name().to_string(), conf) + } + } else { + ("Unknown".to_string(), 0.0) + }; + + // 更新語言統計 + { + let mut stats = LANG_STATS.lock().unwrap(); + *stats.entry(detected_lang.clone()).or_insert(0) += 1; + } + + // 創建帶語言標籤的分段 (for .asrx.json) + let new_segment_x = SegmentX { + start: start_time_str, + end: end_time_str, + text: clean_text.to_string(), + language: detected_lang, + confidence, + }; + + // 添加到全局緩衝區 + let mut segments = SEGMENTS_BUFFER.lock().unwrap(); + segments.push(new_segment); + let current_count = segments.len() as u64; + drop(segments); + + // 添加到 X 緩衝區 + let mut segments_x = SEGMENTS_X_BUFFER.lock().unwrap(); + segments_x.push(new_segment_x.clone()); + drop(segments_x); + + SEGMENTS_PROCESSED.store(current_count, Ordering::Relaxed); + + // 檢查是否需要存檔 + let save_interval = SAVE_INTERVAL.load(Ordering::Relaxed); + if save_interval > 0 && current_count % save_interval == 0 { + // 存儲 .asr.json + if let (Ok(segments), Ok(path_opt), Ok(input_opt)) = ( + SEGMENTS_BUFFER.lock(), + OUTPUT_PATH.lock(), + INPUT_FILE.lock(), + ) { + if let (Some(ref path), Some(ref input)) = (&*path_opt, &*input_opt) { + let transcript = Transcript { + input_file: input.clone(), + language: lang.clone(), + segments: segments.clone(), + }; + if let Ok(json) = serde_json::to_string_pretty(&transcript) { + if std::fs::write(path, json).is_ok() { + SEGMENTS_SAVED.store(current_count, Ordering::Relaxed); + } + } + } + } + + // 存儲 .asrx.json + if let (Ok(segments_x), Ok(path_x_opt), Ok(input_opt)) = ( + SEGMENTS_X_BUFFER.lock(), + OUTPUT_X_PATH.lock(), + INPUT_FILE.lock(), + ) { + if let (Some(ref path_x), Some(ref input)) = (&*path_x_opt, &*input_opt) { + let transcript_x = TranscriptX { + input_file: input.clone(), + segments: segments_x.clone(), + }; + if let Ok(json) = serde_json::to_string_pretty(&transcript_x) { + if 
std::fs::write(path_x, json).is_ok() { + SEGMENTS_X_SAVED.store(current_count, Ordering::Relaxed); + } + } + } + } + } } + }); + + println!("\n═══════════════════════════════════════════════════════════"); + println!("🎙️ Transcribing (this may take a few minutes)..."); + println!("💡 Auto-saving every {} segments", args.save_interval); + println!("📋 Legend:"); + println!(" P = Processed"); + println!(" A = ASR saved"); + println!(" X = Lang detect saved"); + println!("═══════════════════════════════════════════════════════════\n"); + state + .full(params, &audio_data) + .context("Whisper inference failed")?; + + eprintln!("\n✅ Transcription complete!"); + + // Detect language and update global variable + let detected_lang = if let Some(lang) = &args.language { + lang.clone() + } else if let Ok(lang_id) = state.full_lang_id_from_state() { + let lang_name = get_language_name(lang_id); + println!( + "\n🌍 Auto-detected language: {} (ID: {})", + lang_name, lang_id + ); + lang_name.to_string() + } else { + "unknown".to_string() + }; + + { + let mut lang_lock = DETECTED_LANGUAGE.lock().unwrap(); + *lang_lock = detected_lang.clone(); } - if args.language.is_none() { - if let Ok(lang_id) = state.full_lang_id_from_state() { - let lang_name = get_language_name(lang_id); - println!("\n🌍 自動檢測語言:{} (ID: {})", lang_name, lang_id); - } + // Get final results from buffer and update all segments with detected language + let mut final_segments = SEGMENTS_BUFFER.lock().unwrap().clone(); + for segment in &mut final_segments { + segment.language = detected_lang.clone(); } + let transcript = Transcript { + input_file: args.input_file.clone(), + language: detected_lang.clone(), + segments: final_segments.clone(), + }; + let json = serde_json::to_string_pretty(&transcript)?; + std::fs::write(&output_path, json)?; + SEGMENTS_SAVED.store(final_segments.len() as u64, Ordering::Relaxed); + println!("\n✅ Transcript saved to: {:?}", output_path); + println!("📊 Total segments: {}", 
final_segments.len()); + + // Save final .asrx.json + let final_segments_x = SEGMENTS_X_BUFFER.lock().unwrap().clone(); + let transcript_x = TranscriptX { + input_file: args.input_file.clone(), + segments: final_segments_x.clone(), + }; + let json_x = serde_json::to_string_pretty(&transcript_x)?; + std::fs::write(&output_x_path, json_x)?; + println!("✅ Language detection file saved to: {:?}", output_x_path); + + // Print final statistics report + let music_total = MUSIC_COUNT.load(Ordering::Relaxed); + let empty_total = EMPTY_COUNT.load(Ordering::Relaxed); + let stats = LANG_STATS.lock().unwrap(); + + println!("\n📋 Final Statistics Report"); + println!("═══════════════════════════════"); + println!("📊 Total Segments: {}", final_segments.len()); + println!("🎵 Music Segments: {}", music_total); + println!("📭 Empty Segments: {}", empty_total); + println!( + "🗣️ Speech Segments: {}", + final_segments.len() - music_total as usize - empty_total as usize + ); + println!("\n🌐 Language Distribution:"); + let mut lang_stats_sorted: Vec<_> = stats.iter().collect(); + lang_stats_sorted.sort_by_key(|(lang, _)| *lang); + for (lang, count) in lang_stats_sorted { + println!(" {}: {}", lang, count); + } + println!("═══════════════════════════════"); + Ok(()) } fn get_language_name(lang_id: i32) -> &'static str { match lang_id { - 0 => "English", 1 => "Chinese", 2 => "German", 3 => "Spanish", - 4 => "Russian", 5 => "Korean", 6 => "French", 7 => "Japanese", - 8 => "Portuguese", 9 => "Turkish", 10 => "Polish", 11 => "Catalan", - 12 => "Dutch", 13 => "Arabic", 14 => "Swedish", 15 => "Italian", - 16 => "Indonesian", 17 => "Hindi", 18 => "Finnish", 19 => "Vietnamese", - 20 => "Hebrew", 21 => "Ukrainian", 22 => "Greek", 23 => "Malay", - 24 => "Czech", 25 => "Romanian", 26 => "Danish", 27 => "Hungarian", - 28 => "Tamil", 29 => "Norwegian", 30 => "Thai", 31 => "Urdu", - 32 => "Croatian", 33 => "Bulgarian", 34 => "Lithuanian", 35 => "Latin", - 36 => "Maori", 37 => "Malayalam", 38 => "Welsh", 
39 => "Slovak", - 40 => "Telugu", 41 => "Persian", 42 => "Latvian", 43 => "Bengali", - 44 => "Serbian", 45 => "Azerbaijani", 46 => "Slovenian", 47 => "Kannada", - 48 => "Estonian", 49 => "Macedonian", 50 => "Breton", 51 => "Basque", - 52 => "Icelandic", 53 => "Armenian", 54 => "Nepali", 55 => "Mongolian", - 56 => "Bosnian", 57 => "Kazakh", 58 => "Albanian", 59 => "Swahili", - 60 => "Galician", 61 => "Marathi", 62 => "Punjabi", 63 => "Sinhala", - 64 => "Khmer", 65 => "Shona", 66 => "Yoruba", 67 => "Somali", - 68 => "Afrikaans", 69 => "Occitan", 70 => "Georgian", 71 => "Belarusian", - 72 => "Tajik", 73 => "Sindhi", 74 => "Gujarati", 75 => "Amharic", - 76 => "Yiddish", 77 => "Lao", 78 => "Uzbek", 79 => "Faroese", - 80 => "Haitian Creole", 81 => "Pashto", 82 => "Turkmen", 83 => "Nynorsk", - 84 => "Maltese", 85 => "Sanskrit", 86 => "Luxembourgish", 87 => "Myanmar", - 88 => "Tibetan", 89 => "Tagalog", 90 => "Malagasy", 91 => "Assamese", - 92 => "Tatar", 93 => "Hawaiian", 94 => "Lingala", 95 => "Hausa", - 96 => "Bashkir", 97 => "Javanese", 98 => "Sundanese", 99 => "Cantonese", + 0 => "English", + 1 => "Chinese", + 2 => "German", + 3 => "Spanish", + 4 => "Russian", + 5 => "Korean", + 6 => "French", + 7 => "Japanese", + 8 => "Portuguese", + 9 => "Turkish", + 10 => "Polish", + 11 => "Catalan", + 12 => "Dutch", + 13 => "Arabic", + 14 => "Swedish", + 15 => "Italian", + 16 => "Indonesian", + 17 => "Hindi", + 18 => "Finnish", + 19 => "Vietnamese", + 20 => "Hebrew", + 21 => "Ukrainian", + 22 => "Greek", + 23 => "Malay", + 24 => "Czech", + 25 => "Romanian", + 26 => "Danish", + 27 => "Hungarian", + 28 => "Tamil", + 29 => "Norwegian", + 30 => "Thai", + 31 => "Urdu", + 32 => "Croatian", + 33 => "Bulgarian", + 34 => "Lithuanian", + 35 => "Latin", + 36 => "Maori", + 37 => "Malayalam", + 38 => "Welsh", + 39 => "Slovak", + 40 => "Telugu", + 41 => "Persian", + 42 => "Latvian", + 43 => "Bengali", + 44 => "Serbian", + 45 => "Azerbaijani", + 46 => "Slovenian", + 47 => "Kannada", + 48 => 
"Estonian", + 49 => "Macedonian", + 50 => "Breton", + 51 => "Basque", + 52 => "Icelandic", + 53 => "Armenian", + 54 => "Nepali", + 55 => "Mongolian", + 56 => "Bosnian", + 57 => "Kazakh", + 58 => "Albanian", + 59 => "Swahili", + 60 => "Galician", + 61 => "Marathi", + 62 => "Punjabi", + 63 => "Sinhala", + 64 => "Khmer", + 65 => "Shona", + 66 => "Yoruba", + 67 => "Somali", + 68 => "Afrikaans", + 69 => "Occitan", + 70 => "Georgian", + 71 => "Belarusian", + 72 => "Tajik", + 73 => "Sindhi", + 74 => "Gujarati", + 75 => "Amharic", + 76 => "Yiddish", + 77 => "Lao", + 78 => "Uzbek", + 79 => "Faroese", + 80 => "Haitian Creole", + 81 => "Pashto", + 82 => "Turkmen", + 83 => "Nynorsk", + 84 => "Maltese", + 85 => "Sanskrit", + 86 => "Luxembourgish", + 87 => "Myanmar", + 88 => "Tibetan", + 89 => "Tagalog", + 90 => "Malagasy", + 91 => "Assamese", + 92 => "Tatar", + 93 => "Hawaiian", + 94 => "Lingala", + 95 => "Hausa", + 96 => "Bashkir", + 97 => "Javanese", + 98 => "Sundanese", + 99 => "Cantonese", _ => "Unknown", } } @@ -320,12 +817,39 @@ fn format_time(seconds: f64) -> String { let h = total_secs / 3600; let m = (total_secs % 3600) / 60; let s = total_secs % 60; - format!("{:02}:{:02}:{:02}.{:03}", h, m, s, millis) + let frames = millis / 40; + format!("{:02}:{:02}:{:02}.{:03}(f{:02})", h, m, s, millis, frames) +} + +fn parse_time_to_seconds(time_str: &str) -> Option<f64> { + let parts: Vec<&str> = time_str.split(|c| c == ':' || c == '.').collect(); + if parts.len() >= 3 { + let h: f64 = parts[0].parse().ok()?; + let m: f64 = parts[1].parse().ok()?; + let s_part = parts[2]; + let (s, _frames) = if let Some(dot_pos) = s_part.find('(') { + let s: f64 = s_part[..dot_pos].parse().ok()?; + (s, 0) + } else { + let s: f64 = s_part.parse().ok()?; + (s, 0) + }; + let ms: f64 = if parts.len() > 3 { + parts[3].parse().ok()? 
+ } else { + 0.0 + }; + Some(h * 3600.0 + m * 60.0 + s + ms / 1000.0) + } else { + None + } } fn extract_audio_to_f32(input_path: &str) -> Result<Vec<f32>> { - let mut ictx = input(&input_path)?; - let stream_index = ictx.streams().best(Type::Audio) + let mut ictx = ffmpeg::format::input(&input_path)?; + let stream_index = ictx + .streams() + .best(ffmpeg::media::Type::Audio) .ok_or_else(|| anyhow::anyhow!("未找到音頻串流"))? .index(); let stream = ictx.stream(stream_index).expect("Stream should exist"); @@ -333,7 +857,7 @@ fn extract_audio_to_f32(input_path: &str) -> Result<Vec<f32>> { let codec_id = codec_params.id(); let codec_decoder = ffmpeg::codec::decoder::find(codec_id) .ok_or_else(|| anyhow::anyhow!("未找到對應的解碼器"))?; - let mut context = CodecContext::new_with_codec(codec_decoder); + let mut context = ffmpeg::codec::context::Context::new_with_codec(codec_decoder); context.set_parameters(codec_params)?; let mut decoder = context.decoder().audio()?; let out_format = ffmpeg::format::Sample::F32(ffmpeg::format::sample::Type::Packed); @@ -343,87 +867,164 @@ fn extract_audio_to_f32(input_path: &str) -> Result<Vec<f32>> { let mut decoded_samples = Vec::new(); let mut error_count = 0; for (stream, packet) in ictx.packets() { - if stream.index() != stream_index { continue; } - if let Err(e) = decoder.send_packet(&packet) { eprintln!("⚠️ 發送包失敗:{}", e); continue; } - let mut decoded_frame = AudioFrame::empty(); + if stream.index() != stream_index { + continue; + } + if let Err(e) = decoder.send_packet(&packet) { + eprintln!("⚠️ 發送包失敗:{}", e); + continue; + } + let mut decoded_frame = ffmpeg::frame::Audio::empty(); while decoder.receive_frame(&mut decoded_frame).is_ok() { let in_format = decoded_frame.format(); let mut in_channel_layout = decoded_frame.channel_layout(); let in_sample_rate = decoded_frame.rate(); let channels = decoded_frame.channels(); if channels == 0 || in_channel_layout.is_empty() { - let safe_layout = if channels == 1 { ffmpeg::channel_layout::ChannelLayout::MONO } else if channels > 
1 { ffmpeg::channel_layout::ChannelLayout::STEREO } else { ffmpeg::channel_layout::ChannelLayout::STEREO }; + let safe_layout = if channels == 1 { + ffmpeg::channel_layout::ChannelLayout::MONO + } else if channels > 1 { + ffmpeg::channel_layout::ChannelLayout::STEREO + } else { + ffmpeg::channel_layout::ChannelLayout::STEREO + }; in_channel_layout = safe_layout; } - let mut resampled_frame = AudioFrame::empty(); + let mut resampled_frame = ffmpeg::frame::Audio::empty(); if resampler.is_none() { eprintln!("ℹ️ 初始化重採樣器..."); - match ffmpeg::software::resampling::Context::get(in_format, in_channel_layout, in_sample_rate, out_format, out_channel_layout, out_sample_rate) { + match ffmpeg::software::resampling::Context::get( + in_format, + in_channel_layout, + in_sample_rate, + out_format, + out_channel_layout, + out_sample_rate, + ) { Ok(new_resampler) => { resampler = Some(new_resampler); if let Some(r) = resampler.as_mut() { if let Ok(_) = r.run(&decoded_frame, &mut resampled_frame) { - if resampled_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &resampled_frame); } + if resampled_frame.samples() > 0 { + let _ = append_frame_samples( + &mut decoded_samples, + &resampled_frame, + ); + } } } - }, - Err(init_err) => { eprintln!("❌ 無法初始化重採樣器:{}. 跳過此幀。", init_err); error_count += 1; continue; } + } + Err(init_err) => { + eprintln!("❌ 無法初始化重採樣器:{}. 
跳過此幀。", init_err); + error_count += 1; + continue; + } } } else { - let run_result = resampler.as_mut().unwrap().run(&decoded_frame, &mut resampled_frame); + let run_result = resampler + .as_mut() + .unwrap() + .run(&decoded_frame, &mut resampled_frame); match run_result { Ok(delay_opt) => { - if resampled_frame.samples() > 0 { if let Err(e) = append_frame_samples(&mut decoded_samples, &resampled_frame) { eprintln!("⚠️ 追加樣本失敗:{}", e); } } - if let Some(_delay) = delay_opt { - if let Some(r) = resampler.as_mut() { - let mut flush_frame = AudioFrame::empty(); - while let Ok(Some(_)) = r.flush(&mut flush_frame) { if flush_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &flush_frame); } } + if resampled_frame.samples() > 0 { + if let Err(e) = + append_frame_samples(&mut decoded_samples, &resampled_frame) + { + eprintln!("⚠️ 追加樣本失敗:{}", e); } } - }, + if let Some(_delay) = delay_opt { + if let Some(r) = resampler.as_mut() { + let mut flush_frame = ffmpeg::frame::Audio::empty(); + while let Ok(Some(_)) = r.flush(&mut flush_frame) { + if flush_frame.samples() > 0 { + let _ = append_frame_samples( + &mut decoded_samples, + &flush_frame, + ); + } + } + } + } + } Err(e) => { let err_msg = format!("{}", e); if err_msg.contains("Output changed") || err_msg.contains("Invalid") { eprintln!("\n⚠️ 檢測到音頻參數變化 ('{}'),重置重採樣器...", err_msg); - drop(resampler.take()); error_count += 1; - match ffmpeg::software::resampling::Context::get(in_format, in_channel_layout, in_sample_rate, out_format, out_channel_layout, out_sample_rate) { + drop(resampler.take()); + error_count += 1; + match ffmpeg::software::resampling::Context::get( + in_format, + in_channel_layout, + in_sample_rate, + out_format, + out_channel_layout, + out_sample_rate, + ) { Ok(new_resampler) => { resampler = Some(new_resampler); - let mut retry_frame = AudioFrame::empty(); + let mut retry_frame = ffmpeg::frame::Audio::empty(); if let Some(r) = resampler.as_mut() { - if let Ok(_) = 
r.run(&decoded_frame, &mut retry_frame) { if retry_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &retry_frame); } } + if let Ok(_) = r.run(&decoded_frame, &mut retry_frame) { + if retry_frame.samples() > 0 { + let _ = append_frame_samples( + &mut decoded_samples, + &retry_frame, + ); + } + } } - }, - Err(init_err) => { eprintln!("❌ 重置重採樣器失敗:{}. 跳過此幀。", init_err); } + } + Err(init_err) => { + eprintln!("❌ 重置重採樣器失敗:{}. 跳過此幀。", init_err); + } } - } else { eprintln!("❌ 嚴重錯誤:{}. 停止處理。", e); return Err(e).context("Audio resampling failed unrecoverably"); } + } else { + eprintln!("❌ 嚴重錯誤:{}. 停止處理。", e); + return Err(e).context("Audio resampling failed unrecoverably"); + } } } } } } - if error_count > 0 { eprintln!("⚠️ 總共跳過或重置了 {} 次音頻處理。", error_count); } + if error_count > 0 { + eprintln!("⚠️ 總共跳過或重置了 {} 次音頻處理。", error_count); + } decoder.send_eof().ok(); - let mut decoded_frame = AudioFrame::empty(); + let mut decoded_frame = ffmpeg::frame::Audio::empty(); while decoder.receive_frame(&mut decoded_frame).is_ok() { if let Some(r) = resampler.as_mut() { - let mut resampled_frame = AudioFrame::empty(); - if let Ok(_) = r.run(&decoded_frame, &mut resampled_frame) { if resampled_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &resampled_frame); } } + let mut resampled_frame = ffmpeg::frame::Audio::empty(); + if let Ok(_) = r.run(&decoded_frame, &mut resampled_frame) { + if resampled_frame.samples() > 0 { + let _ = append_frame_samples(&mut decoded_samples, &resampled_frame); + } + } } } if let Some(r) = resampler.as_mut() { - let mut flush_frame = AudioFrame::empty(); - while let Ok(Some(_)) = r.flush(&mut flush_frame) { if flush_frame.samples() > 0 { let _ = append_frame_samples(&mut decoded_samples, &flush_frame); } } + let mut flush_frame = ffmpeg::frame::Audio::empty(); + while let Ok(Some(_)) = r.flush(&mut flush_frame) { + if flush_frame.samples() > 0 { + let _ = append_frame_samples(&mut decoded_samples, &flush_frame); 
+ } + } } Ok(decoded_samples) } -fn append_frame_samples(buffer: &mut Vec<f32>, frame: &AudioFrame) -> Result<()> { - if frame.format() != ffmpeg::format::Sample::F32(ffmpeg::format::sample::Type::Packed) { return Err(anyhow::anyhow!("Unexpected audio sample format")); } +fn append_frame_samples(buffer: &mut Vec<f32>, frame: &ffmpeg::frame::Audio) -> Result<()> { + if frame.format() != ffmpeg::format::Sample::F32(ffmpeg::format::sample::Type::Packed) { + return Err(anyhow::anyhow!("Unexpected audio sample format")); + } let data = frame.data(0); let len = frame.samples(); let byte_len = len * 4; - if data.len() < byte_len { return Err(anyhow::anyhow!("Audio frame data size mismatch")); } + if data.len() < byte_len { + return Err(anyhow::anyhow!("Audio frame data size mismatch")); + } let slice = &data[0..byte_len]; let ptr = slice.as_ptr() as *const f32; let f32_slice = unsafe { std::slice::from_raw_parts(ptr, len) };