feat: Add per-segment language detection and dual output files
- Add whatlang for real-time language detection per segment
- Generate .asr.json (basic) and .asrx.json (with language labels)
- Add auto-save progress with configurable interval
- Add resume functionality for interrupted transcriptions
- Add Music/Empty detection and statistics
- Update progress display with unified format
- Add comprehensive Chinese README with usage documentation
This commit is contained in:
123
Cargo.lock
generated
123
Cargo.lock
generated
@@ -2,6 +2,18 @@
|
|||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
version = 4
|
version = 4
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ahash"
|
||||||
|
version = "0.8.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"once_cell",
|
||||||
|
"version_check",
|
||||||
|
"zerocopy",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aho-corasick"
|
name = "aho-corasick"
|
||||||
version = "1.1.4"
|
version = "1.1.4"
|
||||||
@@ -11,6 +23,12 @@ dependencies = [
|
|||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "allocator-api2"
|
||||||
|
version = "0.2.21"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anstream"
|
name = "anstream"
|
||||||
version = "0.6.21"
|
version = "0.6.21"
|
||||||
@@ -270,6 +288,16 @@ version = "0.3.3"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
|
checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hashbrown"
|
||||||
|
version = "0.14.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
||||||
|
dependencies = [
|
||||||
|
"ahash",
|
||||||
|
"allocator-api2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "heck"
|
name = "heck"
|
||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
@@ -315,6 +343,12 @@ dependencies = [
|
|||||||
"either",
|
"either",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "itoa"
|
||||||
|
version = "1.0.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lazy_static"
|
name = "lazy_static"
|
||||||
version = "1.5.0"
|
version = "1.5.0"
|
||||||
@@ -525,7 +559,11 @@ dependencies = [
|
|||||||
"anyhow",
|
"anyhow",
|
||||||
"clap",
|
"clap",
|
||||||
"ffmpeg-next",
|
"ffmpeg-next",
|
||||||
|
"lazy_static",
|
||||||
"ndarray",
|
"ndarray",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"whatlang",
|
||||||
"whisper-rs",
|
"whisper-rs",
|
||||||
"whisper-rs-sys",
|
"whisper-rs-sys",
|
||||||
]
|
]
|
||||||
@@ -555,6 +593,49 @@ dependencies = [
|
|||||||
"windows-sys 0.59.0",
|
"windows-sys 0.59.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||||
|
dependencies = [
|
||||||
|
"serde_core",
|
||||||
|
"serde_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_core"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
||||||
|
dependencies = [
|
||||||
|
"serde_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_derive"
|
||||||
|
version = "1.0.228"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_json"
|
||||||
|
version = "1.0.149"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
|
||||||
|
dependencies = [
|
||||||
|
"itoa",
|
||||||
|
"memchr",
|
||||||
|
"serde",
|
||||||
|
"serde_core",
|
||||||
|
"zmij",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "shlex"
|
name = "shlex"
|
||||||
version = "1.3.0"
|
version = "1.3.0"
|
||||||
@@ -596,6 +677,22 @@ version = "0.2.15"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "version_check"
|
||||||
|
version = "0.9.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "whatlang"
|
||||||
|
version = "0.16.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0"
|
||||||
|
dependencies = [
|
||||||
|
"hashbrown",
|
||||||
|
"once_cell",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "which"
|
name = "which"
|
||||||
version = "4.4.2"
|
version = "4.4.2"
|
||||||
@@ -716,3 +813,29 @@ name = "windows_x86_64_msvc"
|
|||||||
version = "0.52.6"
|
version = "0.52.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zerocopy"
|
||||||
|
version = "0.8.42"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3"
|
||||||
|
dependencies = [
|
||||||
|
"zerocopy-derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zerocopy-derive"
|
||||||
|
version = "0.8.42"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zmij"
|
||||||
|
version = "1.0.21"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
|
||||||
|
|||||||
@@ -22,6 +22,10 @@ anyhow = "1.0"
|
|||||||
|
|
||||||
# 可選:如果 whisper-rs 需要額外的數學運算支持
|
# 可選:如果 whisper-rs 需要額外的數學運算支持
|
||||||
ndarray = "0.15"
|
ndarray = "0.15"
|
||||||
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
|
serde_json = "1.0"
|
||||||
|
lazy_static = "1.4"
|
||||||
|
whatlang = "0.16"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
161
README.md
161
README.md
@@ -4,21 +4,25 @@
|
|||||||
|
|
||||||
## 功能特性
|
## 功能特性
|
||||||
|
|
||||||
- **高效音頻提取**:利用 FFmpeg 直接從影片或音頻檔案中提取並重採樣為 Whisper 所需的格式 (16kHz, Mono, f32)。
|
- **高效音頻提取**:利用 FFmpeg 直接從影片或音頻檔案中提取並重採樣為 Whisper 所需的格式 (16kHz, Mono, f32)
|
||||||
- **精準轉寫**:基於 OpenAI Whisper 模型,提供高精度的語音轉文字功能。
|
- **精準轉寫**:基於 OpenAI Whisper 模型,提供高精度的語音轉文字功能
|
||||||
- **時間碼支援**:自動產生精確到毫秒的逐字稿時間戳。
|
- **時間碼支援**:自動產生精確到毫秒的逐字稿時間戳,格式為 `HH:MM:SS.mmm(fXX)` (25fps)
|
||||||
- **語言自動檢測**:若未指定語言,可自動偵測輸入音頻的語言。
|
- **語言自動檢測**:若未指定語言,可自動偵測輸入音頻的語言
|
||||||
- **優化進度顯示**:在轉寫過程中提供即時的處理進度、耗時與預估剩餘時間。
|
- **優化進度顯示**:在轉寫過程中提供即時的處理進度、耗時與預估剩餘時間
|
||||||
|
- **自動存檔**:支援定期自動存檔,避免長時間轉寫因意外中斷而遺失進度
|
||||||
|
- **中斷續處理**:支援從上次中斷點繼續處理
|
||||||
|
- **語言分段檢測**:自動檢測每段文字的語言類型,生成帶語言標籤的逐字稿
|
||||||
|
- **特殊內容識別**:自動識別 Music (音樂) 和 Empty (空文字) 分段
|
||||||
|
|
||||||
## 前置需求
|
## 前置需求
|
||||||
|
|
||||||
在編譯與執行前,請確保系統已安裝以下環境:
|
在編譯與執行前,請確保系統已安裝以下環境:
|
||||||
|
|
||||||
1. **Rust 環境**:請安裝 [Rustup](https://rustup.rs/)。
|
1. **Rust 環境**:請安裝 [Rustup](https://rustup.rs/)
|
||||||
2. **FFmpeg**:系統必須安裝 FFmpeg 開發庫。
|
2. **FFmpeg**:系統必須安裝 FFmpeg 開發庫
|
||||||
- macOS: `brew install ffmpeg`
|
- macOS: `brew install ffmpeg`
|
||||||
- Ubuntu/Debian: `sudo apt install libavcodec-dev libavformat-dev libavutil-dev libswresample-dev`
|
- Ubuntu/Debian: `sudo apt install libavcodec-dev libavformat-dev libavutil-dev libswresample-dev`
|
||||||
3. **Whisper 模型**:請準備 Whisper 的 `.bin` 模型檔案 (例如 `ggml-base.bin`),並放置於 `models/` 目錄中。
|
3. **Whisper 模型**:請準備 Whisper 的 `.bin` 模型檔案,並放置於 `models/` 目錄中
|
||||||
|
|
||||||
## 編譯與安裝
|
## 編譯與安裝
|
||||||
|
|
||||||
@@ -33,7 +37,7 @@ cargo build --release
|
|||||||
|
|
||||||
## 使用方式
|
## 使用方式
|
||||||
|
|
||||||
執行程式時,需指定輸入檔案路徑及模型路徑:
|
### 基本使用
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./target/release/rust-scribe <input_file> --model models/<your-model.bin>
|
./target/release/rust-scribe <input_file> --model models/<your-model.bin>
|
||||||
@@ -41,33 +45,148 @@ cargo build --release
|
|||||||
|
|
||||||
### 參數說明
|
### 參數說明
|
||||||
|
|
||||||
- `<input_file>`: 欲轉寫的影片或音頻檔案路徑 (位置參數)。
|
| 參數 | 說明 | 必填 |
|
||||||
- `-m, --model <MODEL_PATH>`: Whisper 模型檔案路徑 (必填)。
|
|------|------|------|
|
||||||
- `-l, --language <LANG>`: 指定轉寫語言 (例如 `zh`, `en`)。若不指定,系統將自動偵測 (選填)。
|
| `<input_file>` | 欲轉寫的影片或音頻檔案路徑 | ✅ |
|
||||||
- `-v, --verbose`: 開啟詳細轉寫進度輸出 (選填)。
|
| `-m, --model <MODEL_PATH>` | Whisper 模型檔案路徑 | ✅ |
|
||||||
|
| `-l, --language <LANG>` | 指定轉寫語言 (例如 `zh`, `en`)。若不指定將自動偵測 | ❌ |
|
||||||
|
| `-v, --verbose` | 開啟詳細轉寫進度輸出 | ❌ |
|
||||||
|
| `--save-interval <N>` | 每 N 個分段自動存檔一次 (預設: 100) | ❌ |
|
||||||
|
| `--max-duration <秒>` | 限制處理的最大時長(用於測試) | ❌ |
|
||||||
|
|
||||||
**使用範例**:
|
### 使用範例
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 指定使用 base 模型,自動檢測語言
|
# 基本使用,自動檢測語言,每 10 個分段存檔
|
||||||
cargo run --release -- video.mp4 --model models/ggml-base.bin
|
cargo run --release -- video.mp4 --model models/ggml-base.bin --save-interval 10
|
||||||
|
|
||||||
# 指定使用中文進行轉寫
|
# 指定中文轉寫
|
||||||
cargo run --release -- lecture.mkv --model models/ggml-base.bin --language zh
|
cargo run --release -- lecture.mkv --model models/ggml-base.bin --language zh
|
||||||
|
|
||||||
|
# 限制處理前 60 秒(用於測試)
|
||||||
|
cargo run --release -- video.mp4 --model models/ggml-base.bin --max-duration 60
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## 輸出檔案
|
||||||
|
|
||||||
|
程式會生成兩個輸出檔案:
|
||||||
|
|
||||||
|
### 1. `.asr.json` - 基本逐字稿
|
||||||
|
包含基本的轉寫結果:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"input_file": "video.mp4",
|
||||||
|
"language": "English",
|
||||||
|
"segments": [
|
||||||
|
{
|
||||||
|
"start": "00:00:05.120(f03)",
|
||||||
|
"end": "00:00:08.560(f14)",
|
||||||
|
"text": "Hello everyone",
|
||||||
|
"language": "English"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. `.asrx.json` - 帶語言檢測的逐字稿
|
||||||
|
包含每段文字的語言檢測結果:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"input_file": "video.mp4",
|
||||||
|
"segments": [
|
||||||
|
{
|
||||||
|
"start": "00:00:05.120(f03)",
|
||||||
|
"end": "00:00:08.560(f14)",
|
||||||
|
"text": "Hello everyone",
|
||||||
|
"language": "English",
|
||||||
|
"confidence": 0.98
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"start": "00:00:10.000(f00)",
|
||||||
|
"end": "00:00:13.000(f00)",
|
||||||
|
"text": "[Music]",
|
||||||
|
"language": "Music",
|
||||||
|
"confidence": 1.0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## 進度顯示說明
|
||||||
|
|
||||||
|
轉寫過程中會顯示即時進度:
|
||||||
|
|
||||||
|
```
|
||||||
|
═══════════════════════════════════════════════════════════
|
||||||
|
🎙️ Transcribing (this may take a few minutes)...
|
||||||
|
💡 Auto-saving every 10 segments
|
||||||
|
📋 Legend:
|
||||||
|
P = Processed (已處理分段數)
|
||||||
|
A = ASR saved (.asr.json 已存檔數)
|
||||||
|
X = Lang detect saved (.asrx.json 已存檔數)
|
||||||
|
═══════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
🔄 Progress: [71:05/114:39] 62.0% | Elapsed: 03:04 | Remaining: 01:54 | P:1112 A:1110 X:1110 | English:500 Unknown:52 Music:58
|
||||||
|
```
|
||||||
|
|
||||||
|
### 進度列格式說明
|
||||||
|
|
||||||
|
- **時間進度**:`[已處理時間/總時長] 完成百分比`
|
||||||
|
- **時間統計**:`Elapsed: 已耗時 | Remaining: 預估剩餘時間`
|
||||||
|
- **分段統計**:`P:已處理數 A:ASR存檔數 X:語言檢測存檔數`
|
||||||
|
- **語言統計**:各語言類型的分段數量(依序顯示)
|
||||||
|
|
||||||
|
### 特殊類型標籤
|
||||||
|
|
||||||
|
- **Music**:音樂或背景音效段落
|
||||||
|
- **Empty**:空文字或純空白段落
|
||||||
|
- **Unknown**:語言識別置信度低於 0.5 的段落
|
||||||
|
|
||||||
|
## 中斷續處理
|
||||||
|
|
||||||
|
程式支援中斷後繼續處理:
|
||||||
|
|
||||||
|
1. 當程式偵測到已存在的輸出檔案時,會詢問是否繼續
|
||||||
|
2. 選擇 `[C] Continue` 會從上次中斷點繼續
|
||||||
|
3. 選擇 `[R] Restart` 會刪除舊檔案重新開始
|
||||||
|
|
||||||
|
兩個檔案(`.asr.json` 和 `.asrx.json`)會同步處理。
|
||||||
|
|
||||||
|
## 語言檢測說明
|
||||||
|
|
||||||
|
使用 [whatlang](https://github.com/greyblake/whatlang-rs) 函式庫進行語言檢測:
|
||||||
|
|
||||||
|
- **置信度 ≥ 0.5**:顯示檢測到的語言名稱
|
||||||
|
- **置信度 < 0.5**:標註為 "Unknown"
|
||||||
|
- **Music**:自動識別包含 [Music] 或 (music) 標記的段落
|
||||||
|
- **Empty**:空文字或純空白段落
|
||||||
|
|
||||||
## 專案結構
|
## 專案結構
|
||||||
|
|
||||||
```
|
```
|
||||||
rust-scribe/
|
rust-scribe/
|
||||||
├── src/
|
├── src/
|
||||||
│ └── main.rs # 核心邏輯 (音頻處理、Whisper 轉寫、CLI 介面)
|
│ └── main.rs # 核心邏輯
|
||||||
├── models/ # 存放 Whisper 模型檔案
|
├── models/ # Whisper 模型檔案
|
||||||
├── Cargo.toml # 專案依賴與配置
|
├── Cargo.toml # 專案配置
|
||||||
|
├── Cargo.lock # 依賴鎖定
|
||||||
|
├── README.md # 使用文檔
|
||||||
|
├── AGENTS.md # 開發者規範
|
||||||
└── .cargo/
|
└── .cargo/
|
||||||
└── config.toml # Cargo 編譯配置
|
└── config.toml # Cargo 配置
|
||||||
```
|
```
|
||||||
|
|
||||||
## 開發者規範
|
## 開發者規範
|
||||||
|
|
||||||
請參閱 `AGENTS.md` 了解詳細的代碼風格、編譯與測試指南。
|
請參閱 `AGENTS.md` 了解詳細的代碼風格、編譯與測試指南。
|
||||||
|
|
||||||
|
## 許可證
|
||||||
|
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
## 致謝
|
||||||
|
|
||||||
|
- [Whisper.cpp](https://github.com/ggerganov/whisper.cpp) - OpenAI Whisper 的 C++ 實現
|
||||||
|
- [FFmpeg](https://ffmpeg.org/) - 音頻處理函式庫
|
||||||
|
- [whisper-rs](https://github.com/tazz4843/whisper-rs) - Whisper.cpp 的 Rust 綁定
|
||||||
|
- [whatlang](https://github.com/greyblake/whatlang-rs) - 自然語言檢測函式庫
|
||||||
|
|||||||
873
src/main.rs
873
src/main.rs
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user