Spaces:

Thanh-Lam
/

Diarization_labeling

Running

App Files Files Community

Thanh-Lam committed 13 days ago

Commit

4193ca8

1 Parent(s): f7fe2af

Update project for Hugging Face Space deployment

Browse files

Files changed (10) hide show

.gitignore +18 -0
README.md +19 -2
__pycache__/app.cpython-312.pyc +0 -0
app.py +454 -17
requirements.txt +7 -0
src/__pycache__/models.cpython-312.pyc +0 -0
src/__pycache__/utils.cpython-312.pyc +0 -0
src/models.py +52 -13
src/utils.py +118 -2
test_gradio.py +11 -0

.gitignore CHANGED Viewed

	@@ -1 +1,19 @@
1	hugging_face_key.txt

 hugging_face_key.txt
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.so
+*.egg
+*.egg-info/
+dist/
+build/
+*.log
+.DS_Store
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+path_demo.txt

README.md CHANGED Viewed

@@ -1,3 +1,14 @@
 # Vietnamese_Diarization
 Kho mã mẫu diarization tiếng Việt dùng pyannote/speaker-diarization-community-1.
@@ -9,9 +20,15 @@ Kho mã mẫu diarization tiếng Việt dùng pyannote/speaker-diarization-comm
 - Hugging Face access token (dán vào hugging_face_key.txt hoặc đặt biến môi trường HUGGINGFACE_TOKEN/HUGGINGFACE_ACCESS_TOKEN)
 ## Cài đặt nhanh
-- Cài thư viện: `pip install pyannote.audio` hoặc `uv add pyannote.audio`
 - Đảm bảo ffmpeg đã có trong PATH
 ## Chạy mẫu
 - Diarization và in kết quả: `python infer.py path/to/audio.wav`
 - Lưu thêm RTTM: `python infer.py path/to/audio.wav --rttm outputs/audio.rttm`
@@ -25,7 +42,7 @@ segments = diarize_file("audio.wav", device="auto")
 ```
 ## Cấu trúc
-- app.py: API Python đơn giản
 - infer.py: CLI chạy diarization
 - src/models.py: Bao gói pipeline pyannote
 - src/utils.py: Hỗ trợ đọc token, định dạng kết quả

+---
+title: Diarization Labeling
+emoji: "\U0001F4E3"
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: "4.39.0"
+app_file: app.py
+pinned: false
+---
 # Vietnamese_Diarization
 Kho mã mẫu diarization tiếng Việt dùng pyannote/speaker-diarization-community-1.
 - Hugging Face access token (dán vào hugging_face_key.txt hoặc đặt biến môi trường HUGGINGFACE_TOKEN/HUGGINGFACE_ACCESS_TOKEN)
 ## Cài đặt nhanh
+- Cài thư viện: `pip install pyannote.audio gradio yt-dlp` hoặc `uv add pyannote.audio gradio yt-dlp`
 - Đảm bảo ffmpeg đã có trong PATH
+## Chạy Gradio
+- Lệnh: `python app.py`
+- Trình duyệt mở tại http://localhost:7860 (hoặc địa chỉ máy chủ nếu chạy từ xa)
+- Điền token nếu chưa đặt sẵn, tải file âm thanh hoặc dán URL YouTube/TikTok, chọn thiết bị rồi nhấn Chạy
+- Bảng kết quả hiển thị dạng phút:giây; có thể gán nhãn giới tính (nam/nữ), vùng miền (bắc/trung/nam) và transcription, sau đó bấm "Tách và tải" để nhận zip gồm các đoạn WAV và metadata.csv
 ## Chạy mẫu
 - Diarization và in kết quả: `python infer.py path/to/audio.wav`
 - Lưu thêm RTTM: `python infer.py path/to/audio.wav --rttm outputs/audio.rttm`
 ```
 ## Cấu trúc
+- app.py: API Python và giao diện Gradio
 - infer.py: CLI chạy diarization
 - src/models.py: Bao gói pipeline pyannote
 - src/utils.py: Hỗ trợ đọc token, định dạng kết quả

__pycache__/app.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-312.pyc and b/__pycache__/app.cpython-312.pyc differ

app.py CHANGED Viewed

@@ -1,9 +1,29 @@
 from __future__ import annotations
 from pathlib import Path
-from typing import List
 from src.models import DiarizationEngine, Segment
 def diarize_file(
@@ -17,20 +37,437 @@ def diarize_file(
     return engine.run(audio_path, show_progress=show_progress)
 if __name__ == "__main__":
-    # Ví dụ nhanh: python app.py audio.wav
-    import argparse
-    parser = argparse.ArgumentParser(description="Ví dụ chạy diarization qua hàm Python.")
-    parser.add_argument("audio", help="Đường dẫn tới file âm thanh")
-    parser.add_argument(
-        "--device",
-        choices=["auto", "cpu", "cuda"],
-        default="auto",
-        help="Thiết bị ưu tiên khi khởi tạo pipeline",
-    )
-    args = parser.parse_args()
-    segments = diarize_file(args.audio, device=args.device)
-    for idx, seg in enumerate(segments, start=1):
-        print(f"{idx:02d} | {seg.start:7.2f}s -> {seg.end:7.2f}s | speaker {seg.speaker}")

 from __future__ import annotations
+import functools
+import tempfile
 from pathlib import Path
+from typing import List, Any
+import shutil
+import csv
+import subprocess
+import zipfile
+import gradio as gr
 from src.models import DiarizationEngine, Segment
+from src.utils import (
+    export_segments_json,
+    format_segments_table,
+    seconds_to_mmss,
+    download_audio_from_url,
+)
# Cache-key placeholder meaning "no token typed in the UI".
DEFAULT_TOKEN_SENTINEL = "__FROM_FILE_OR_ENV__"

# Accepted spellings of each label mapped to the code used in segment ids.
GENDER_MAP = {
    "nam": "0",
    "male": "0",
    "nữ": "1",
    "nu": "1",
    "female": "1",
}
REGION_MAP = {
    "bắc": "0",
    "bac": "0",
    "north": "0",
    "trung": "1",
    "central": "1",
    "nam": "2",
    "south": "2",
}

# The allow-lists are exactly the spellings the maps know how to encode.
ALLOWED_GENDER = set(GENDER_MAP)
ALLOWED_REGION = set(REGION_MAP)
 def diarize_file(
     return engine.run(audio_path, show_progress=show_progress)
def _token_key(raw_token: str | None) -> str:
    """Return the engine-cache key for a token typed in the UI.

    A missing, empty or whitespace-only token maps to
    ``DEFAULT_TOKEN_SENTINEL``; anything else is returned stripped.
    """
    if not raw_token:
        return DEFAULT_TOKEN_SENTINEL
    stripped = raw_token.strip()
    return stripped or DEFAULT_TOKEN_SENTINEL
@functools.lru_cache(maxsize=2)
def _get_engine(token_key: str, device: str) -> DiarizationEngine:
    """Build a ``DiarizationEngine`` for this token key and device.

    Results are memoised (two most recent argument pairs) so repeated runs
    with the same settings reuse the engine. The sentinel key is passed to
    the engine as ``token=None``.
    """
    if token_key == DEFAULT_TOKEN_SENTINEL:
        return DiarizationEngine(token=None, device=device)
    return DiarizationEngine(token=token_key, device=device)
def _diarize_action(
    audio_path: str | None,
    hf_token: str | None,
    device: str,
    url: str | None = None,
):
    """Run diarization for the Gradio "run" button.

    Args:
        audio_path: Path of the uploaded audio file, if any.
        hf_token: Token typed in the UI; empty means "use file/env".
        device: Device choice forwarded to the engine.
        url: Optional media URL. When non-blank it takes precedence over
            ``audio_path``.

    Returns:
        A 7-tuple matching the click handler's outputs: text table, RTTM
        path, JSON path, DataFrame rows, segment dicts, audio state
        ``[prepared_path, prep_tmpdir, source_name, download_tmp]`` and the
        prepared audio path for playback. On missing input or failure the
        first item is a message and the rest are empty placeholders.
    """
    # A whitespace-only URL is the same as no URL; do not try to download it.
    url = url.strip() if url else ""
    if not audio_path and not url:
        empty_state = ["", "", "", ""]
        return "Vui lòng tải file âm thanh hoặc nhập URL.", None, None, [], [], empty_state, ""

    # Temp dirs created so far; removed again if a later step fails so that
    # failed runs do not accumulate downloads/conversions on disk.
    created_dirs: list[Path] = []
    try:
        download_tmp = None
        audio_input = audio_path
        if url:
            downloaded_path, download_tmp = download_audio_from_url(url)
            created_dirs.append(Path(download_tmp))
            audio_input = str(downloaded_path)

        engine = _get_engine(_token_key(hf_token), device)
        diarization, prepared_path, prep_tmpdir = engine.diarize(
            audio_input, show_progress=False, keep_audio=True
        )
        if prep_tmpdir:
            created_dirs.append(Path(prep_tmpdir))

        segments = engine.to_segments(diarization)
        dict_segments = [
            {"start": float(seg.start), "end": float(seg.end), "speaker": seg.speaker}
            for seg in segments
        ]
        table = format_segments_table(dict_segments)

        output_tmp = Path(tempfile.mkdtemp(prefix="diarization_out_"))
        created_dirs.append(output_tmp)
        rttm_path = engine.save_rttm(diarization, output_tmp / "output.rttm")
        json_path = export_segments_json(dict_segments, output_tmp / "segments.json")

        # Row order: start_mmss, end_mmss, speaker, gender, region, transcription.
        df_rows = [
            [
                seconds_to_mmss(seg["start"]),
                seconds_to_mmss(seg["end"]),
                seg["speaker"],
                "",
                "",
                "",
            ]
            for seg in dict_segments
        ]
        source_name = Path(audio_input).stem if audio_input else "unknown"
        audio_state = [
            str(prepared_path),
            str(prep_tmpdir) if prep_tmpdir else "",
            source_name,
            str(download_tmp) if download_tmp else "",
        ]
        return (
            table,
            str(rttm_path),
            str(json_path),
            df_rows,
            dict_segments,
            audio_state,
            str(prepared_path),
        )
    except Exception as exc:  # pragma: no cover - surface the error in the UI
        for leftover in created_dirs:
            shutil.rmtree(leftover, ignore_errors=True)
        empty_state = ["", "", "", ""]
        return f"Lỗi: {exc}", None, None, [], [], empty_state, ""
def _normalize_label(value: Any) -> str:
    """Return ``value`` as a trimmed, lower-cased, NFC-normalised string.

    ``None`` and float NaN (an empty cell in a pandas-backed table) both
    become ``""``. NFC normalisation makes Vietnamese text typed or pasted
    in decomposed form (for example ``u`` + horn + tilde) compare equal to
    the precomposed spellings used in the label allow-lists.
    """
    import unicodedata

    if value is None:
        return ""
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return ""
    return unicodedata.normalize("NFC", str(value).strip().lower())
def _table_to_rows(table_data: Any) -> list[list[Any]]:
    """Convert a table value into a list of rows.

    Accepted inputs:
      * ``None`` -> ``[]``
      * ``list`` -> returned as is; ``tuple`` -> converted with ``list()``
      * ``dict`` with a ``"data"`` list/tuple -> that data
      * an object exposing a non-callable ``.values`` that has ``.tolist()``
        (pandas DataFrame style)
      * an object exposing ``.tolist()`` itself (numpy ndarray style)

    Anything else, or a ``tolist()`` that fails or does not return a list,
    yields ``[]``.
    """
    if table_data is None:
        return []
    if isinstance(table_data, list):
        return table_data
    if isinstance(table_data, tuple):
        return list(table_data)
    if isinstance(table_data, dict):
        # dict.values is a bound method, so dicts must be handled before the
        # attribute probe below.
        data = table_data.get("data")
        return list(data) if isinstance(data, (list, tuple)) else []

    values = getattr(table_data, "values", None)
    source = values if values is not None and not callable(values) else table_data
    to_list = getattr(source, "tolist", None)
    if not callable(to_list):
        return []
    try:
        rows = to_list()
    except (TypeError, ValueError):
        return []
    return rows if isinstance(rows, list) else []
def _select_row_action(evt: gr.SelectData):
    """Translate a table selection event into (status text, row index).

    The row index is the first element of ``evt.index``. When the event or
    its index is empty, or the index is ``None``/negative, the result is the
    "nothing selected" text and ``-1``.
    """
    row_idx = -1
    if evt and evt.index:
        row_idx = evt.index[0]
    if row_idx is None or row_idx < 0:
        return "Chưa chọn hàng", -1
    # The status text is 1-based; the stored index stays 0-based.
    return f"Đang chọn hàng {row_idx + 1}", row_idx
def _apply_dropdown_action(
    table_rows: list[list[Any]] | None,
    selected_idx: int,
    gender_choice: str,
    region_choice: str,
    transcription_text: str,
):
    """Write the quick-edit values into the selected table row.

    Returns ``(rows, status)``. The rows come back unchanged when no valid
    row is selected or when a non-empty gender/region is not in its
    allow-list; otherwise a copy is returned with columns 3-5 (gender,
    region, transcription) of the selected row replaced.
    """
    current_rows = _table_to_rows(table_rows)
    if selected_idx is None or selected_idx < 0 or selected_idx >= len(current_rows):
        return current_rows, "Chọn một hàng trước."

    gender_val = _normalize_label(gender_choice)
    region_val = _normalize_label(region_choice)
    if gender_val and gender_val not in ALLOWED_GENDER:
        return current_rows, "Giới tính chỉ được chọn nam/nữ."
    if region_val and region_val not in ALLOWED_REGION:
        return current_rows, "Vùng miền chỉ được chọn bắc/trung/nam."

    updated = [list(existing) for existing in current_rows]
    target = updated[selected_idx]
    # Row order: start_mmss, end_mmss, speaker, gender, region, transcription.
    # Short rows are padded to exactly six cells before writing.
    if len(target) < 6:
        target = (target + [""] * 6)[:6]
        updated[selected_idx] = target
    target[3] = gender_val
    target[4] = region_val
    target[5] = transcription_text
    return updated, f"Đã áp dụng cho hàng {selected_idx + 1}."
def _import_archives_action(files: list[Any] | None, output_root: str = "outputs"):
    """Merge several previously exported ZIPs into one ``merged.zip``.

    Each archive is extracted into its own sub-folder of a fresh temp
    directory. Rows of every ``metadata.csv`` found are appended to
    ``metadata_all.csv`` with an extra ``source`` column naming the folder.

    Args:
        files: Uploaded items; each may be a path string, an object with a
            ``name`` attribute, or a dict with a ``name``/``path`` key.
        output_root: Unused; kept for backward compatibility.

    Returns:
        ``(status message, path of merged.zip)``, or ``(message, None)``
        when no files were given.
    """
    if not files:
        return "Chọn ít nhất một file ZIP.", None

    merged_root = Path(tempfile.mkdtemp(prefix="merged_zip_"))
    merged_data = merged_root / "merged"
    merged_data.mkdir(parents=True, exist_ok=True)
    meta_all = merged_data / "metadata_all.csv"
    appended = 0
    extracted = 0
    skipped = 0

    with meta_all.open("w", newline="", encoding="utf-8") as out_csv:
        writer = csv.writer(out_csv)
        writer.writerow(
            [
                "id",
                "file_name",
                "start_mmss",
                "end_mmss",
                "gender",
                "region",
                "transcription",
                "speaker",
                "duration_sec",
                "source",
            ]
        )
        for f in files:
            # Resolve the path before building a Path: Path(dict) raises.
            if isinstance(f, dict):
                raw_path = f.get("name") or f.get("path")
            else:
                raw_path = getattr(f, "name", f)
            if not raw_path:
                continue
            zip_path = Path(raw_path)
            if not zip_path.exists():
                continue

            # Same-named archives get distinct folders so that their
            # identically named segment files do not overwrite each other.
            label = zip_path.stem
            suffix = 1
            while (merged_data / label).exists():
                label = f"{zip_path.stem}_{suffix}"
                suffix += 1
            dest_dir = merged_data / label

            try:
                with zipfile.ZipFile(zip_path, "r") as zf:
                    dest_dir.mkdir(parents=True, exist_ok=True)
                    zf.extractall(dest_dir)
            except zipfile.BadZipFile:
                # One corrupt upload should not abort the whole merge.
                skipped += 1
                continue
            extracted += 1

            meta_csv = dest_dir / "metadata.csv"
            if not meta_csv.exists():
                continue
            # Parse with csv.reader: a transcription may contain commas or
            # newlines inside quoted fields, which a plain split(",") breaks.
            with meta_csv.open("r", newline="", encoding="utf-8") as src:
                reader = csv.reader(src)
                next(reader, None)  # skip header
                for row in reader:
                    if row and any(row):
                        writer.writerow(row + [label])
                        appended += 1

    merged_zip = shutil.make_archive(str(merged_data), "zip", merged_data)
    status = f"Đã gộp {extracted} ZIP, metadata_all.csv có thêm {appended} dòng. Tải merged.zip."
    if skipped:
        status += f" Bỏ qua {skipped} file ZIP không hợp lệ."
    return status, merged_zip
def _split_segments_action(
    table_rows: list[list[Any]] | None,
    segments_state: list[dict],
    audio_state: list[str],
):
    """Cut the prepared audio into one WAV per segment and zip the result.

    Args:
        table_rows: Current label table (start, end, speaker, gender,
            region, transcription per row), aligned by index with
            ``segments_state``.
        segments_state: Segment dicts with ``start``/``end``/``speaker``.
        audio_state: State list whose first item is the prepared audio path.

    Returns:
        ``(status message, zip path)``; the path is ``None`` on any error.

    All labels are validated before ffmpeg is started, so an invalid row no
    longer leaves a half-filled temp directory behind. The temp directory is
    also removed when splitting fails.
    """
    if not shutil.which("ffmpeg"):
        return "Cần cài ffmpeg để tách đoạn.", None
    if not segments_state:
        return "Chạy diarization trước.", None
    if not audio_state or len(audio_state) < 1 or not audio_state[0]:
        return "Thiếu thông tin file đã chuẩn hóa.", None
    prepared_path = Path(audio_state[0])

    gender_display = {"0": "Nam", "1": "Nữ"}
    region_display = {"0": "Bắc", "1": "Trung", "2": "Nam"}
    tmp_root: Path | None = None
    try:
        rows = _table_to_rows(table_rows)

        # Pass 1: validate every label up front (no side effects yet).
        labels = []
        for idx in range(len(segments_state)):
            row = rows[idx] if idx < len(rows) else []
            # Row order: start_mmss, end_mmss, speaker, gender, region, transcription.
            gender = _normalize_label(row[3] if len(row) > 3 else "")
            region = _normalize_label(row[4] if len(row) > 4 else "")
            transcription = row[5] if len(row) > 5 else ""
            if gender and gender not in ALLOWED_GENDER:
                return f"Lỗi: giới tính hàng {idx+1} phải là nam/nữ.", None
            if region and region not in ALLOWED_REGION:
                return f"Lỗi: vùng miền hàng {idx+1} phải là bắc/trung/nam.", None
            labels.append((gender, region, transcription))

        # Pass 2: cut the audio and write the metadata.
        tmp_root = Path(tempfile.mkdtemp(prefix="segments_"))
        output_dir = tmp_root / "data"
        output_dir.mkdir(parents=True, exist_ok=True)
        metadata_path = output_dir / "metadata.csv"
        with metadata_path.open("w", newline="", encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(
                [
                    "id",
                    "file_name",
                    "start_mmss",
                    "end_mmss",
                    "gender",
                    "region",
                    "transcription",
                    "speaker",
                    "duration_sec",
                ]
            )
            for idx, seg in enumerate(segments_state):
                gender, region, transcription = labels[idx]
                gender_code = GENDER_MAP.get(gender, "")
                region_code = REGION_MAP.get(region, "")
                # Unlabelled parts of the id are written as "x".
                seg_id = f"id_{gender_code or 'x'}_{region_code or 'x'}_{idx:03d}"
                gender_disp = gender_display.get(gender_code, gender)
                region_disp = region_display.get(region_code, region)
                start = float(seg["start"])
                end = float(seg["end"])
                duration = max(end - start, 0.0)
                out_file = output_dir / f"{seg_id}.wav"
                cmd = [
                    "ffmpeg",
                    "-y",
                    "-i",
                    str(prepared_path),
                    "-ss",
                    f"{start:.3f}",
                    "-to",
                    f"{end:.3f}",
                    "-ac",
                    "1",
                    "-ar",
                    "16000",
                    "-vn",
                    "-f",
                    "wav",
                    str(out_file),
                ]
                result = subprocess.run(cmd, capture_output=True, text=True)
                if result.returncode != 0:
                    stderr = result.stderr.strip()
                    raise RuntimeError(f"ffmpeg lỗi khi tách đoạn {idx}: {stderr}")
                writer.writerow(
                    [
                        seg_id,
                        out_file.name,
                        seconds_to_mmss(start),
                        seconds_to_mmss(end),
                        gender_disp,
                        region_disp,
                        transcription,
                        seg.get("speaker", ""),
                        duration,
                    ]
                )
        archive = shutil.make_archive(str(output_dir), "zip", output_dir)
        return f"Tách {len(segments_state)} đoạn thành công. Tải zip bên dưới.", archive
    except Exception as exc:  # pragma: no cover
        if tmp_root is not None:
            shutil.rmtree(tmp_root, ignore_errors=True)
        return f"Lỗi khi tách: {exc}", None
def build_interface() -> gr.Blocks:
    """Build the Gradio Blocks UI and wire its event handlers.

    Layout: audio/URL/token/device inputs with a run button, an editable
    segment table with quick-label controls, a split-and-download section,
    a ZIP merge section, and the text/RTTM/JSON outputs.

    Returns:
        The assembled ``gr.Blocks`` app (not launched).
    """
    with gr.Blocks(title="Vietnamese Diarization", analytics_enabled=False) as demo:
        gr.Markdown(
            """
### Diarization tiếng Việt với pyannote
- Tải file âm thanh, điền Hugging Face access token (hoặc để trống nếu đã đặt trong môi trường/file).
- Chọn thiết bị chạy, nhấn Chạy. Kết quả hiển thị dạng bảng và file RTTM/JSON tải về.
"""
        )
        segments_state = gr.State([])
        # Same 4-item list shape that _diarize_action stores here
        # (prepared path, prep tmpdir, source name, download tmpdir).
        audio_state = gr.State(["", "", "", ""])
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(label="Tải file audio (tùy chọn)", type="filepath")
                playback = gr.Audio(
                    label="Audio đã chuyển đổi/đang dùng",
                    type="filepath",
                    interactive=False,
                )
            with gr.Column():
                url_input = gr.Textbox(
                    label="URL YouTube/TikTok (tùy chọn)",
                    placeholder="Dán link video nếu không tải file",
                )
                token_input = gr.Textbox(
                    label="Hugging Face access token (tùy chọn)",
                    type="password",
                    placeholder="Để trống nếu đã cấu hình môi trường hoặc hugging_face_key.txt",
                )
                device_input = gr.Dropdown(
                    choices=["auto", "cpu", "cuda"],
                    value="auto",
                    label="Thiết bị",
                )
                run_btn = gr.Button("Chạy diarization")
        gr.Markdown(
            """
#### Gán nhãn và tách đoạn
- Chọn các ô gender (nam/nữ) và region (bắc/trung/nam) bằng dropdown trong bảng, transcription nhập tay.
- Nhấn "Tách và tải" để tải zip gồm các đoạn WAV và metadata.csv (không lưu lại trên server).
"""
        )
        segment_df = gr.DataFrame(
            headers=[
                "start_mmss",
                "end_mmss",
                "speaker",
                "gender",
                "region",
                "transcription",
            ],
            datatype=["str", "str", "str", "str", "str", "str"],
            interactive=True,
            row_count=(0, "dynamic"),
        )
        gender_dropdown = gr.Dropdown(choices=["", "nam", "nữ"], value="", label="Giới tính chọn nhanh")
        region_dropdown = gr.Dropdown(choices=["", "bắc", "trung", "nam"], value="", label="Vùng miền chọn nhanh")
        transcription_input = gr.Textbox(label="Transcription (áp dụng nhanh)", lines=1, placeholder="Nhập lời thoại")
        selection_info = gr.Textbox(label="Hàng đang chọn", interactive=False, value="Chưa chọn hàng")
        split_btn = gr.Button("Tách và tải")
        split_status = gr.Textbox(label="Trạng thái tách", lines=2)
        zip_file = gr.File(label="Tải ZIP các đoạn")
        gr.Markdown(
            """
#### Nhập ZIP đã tách (gộp nhiều ZIP thành một)
- Tải lên nhiều file ZIP đã tải về trước đó, công cụ sẽ gộp lại và tạo một merged.zip kèm metadata_all.csv.
"""
        )
        import_files = gr.File(label="Chọn nhiều ZIP", file_count="multiple", file_types=[".zip"])
        import_btn = gr.Button("Nhập ZIP vào thư mục chung")
        import_status = gr.Textbox(label="Trạng thái nhập ZIP", lines=2)
        result_box = gr.Textbox(label="Bảng phân đoạn", lines=12)
        rttm_file = gr.File(label="Tải RTTM")
        json_file = gr.File(label="Tải JSON")
        selected_row = gr.State(-1)

        run_btn.click(
            fn=_diarize_action,
            inputs=[audio_input, token_input, device_input, url_input],
            outputs=[result_box, rttm_file, json_file, segment_df, segments_state, audio_state, playback],
        )
        segment_df.select(
            fn=_select_row_action,
            inputs=None,
            outputs=[selection_info, selected_row],
        )
        # All three quick-edit controls trigger the same handler with the
        # same inputs/outputs, so register them in one loop.
        for quick_control in (gender_dropdown, region_dropdown, transcription_input):
            quick_control.change(
                fn=_apply_dropdown_action,
                inputs=[segment_df, selected_row, gender_dropdown, region_dropdown, transcription_input],
                outputs=[segment_df, selection_info],
            )
        split_btn.click(
            fn=_split_segments_action,
            inputs=[segment_df, segments_state, audio_state],
            outputs=[split_status, zip_file],
        )
        # The merge result reuses the same download slot as the split result.
        import_btn.click(
            fn=_import_archives_action,
            inputs=[import_files],
            outputs=[import_status, zip_file],
        )
    return demo
 if __name__ == "__main__":
+    import sys
+    print("=" * 60, file=sys.stderr)
+    print("Khởi tạo Vietnamese Diarization App...", file=sys.stderr)
+    print("=" * 60, file=sys.stderr)
+    try:
+        demo = build_interface()
+        print("Interface đã được khởi tạo thành công!", file=sys.stderr)
+        demo.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+        )
+    except Exception as e:
+        print(f"LỖI khi khởi động app: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+pyannote.audio
+torch==2.1.2
+torchaudio==2.1.2
+gradio==4.39.0
+huggingface_hub==0.23.4
+yt-dlp
+numpy<2.0

src/__pycache__/models.cpython-312.pyc CHANGED Viewed

Binary files a/src/__pycache__/models.cpython-312.pyc and b/src/__pycache__/models.cpython-312.pyc differ

src/__pycache__/utils.cpython-312.pyc CHANGED Viewed

Binary files a/src/__pycache__/utils.cpython-312.pyc and b/src/__pycache__/utils.cpython-312.pyc differ

src/models.py CHANGED Viewed

@@ -2,13 +2,14 @@ from __future__ import annotations
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Iterable, List
 import torch
 from pyannote.audio import Pipeline
 from pyannote.audio.pipelines.utils.hook import ProgressHook
-from .utils import ensure_audio_path, read_hf_token
 @dataclass
@@ -27,10 +28,27 @@ class DiarizationEngine:
         token: str | None = None,
         key_path: str | Path = "hugging_face_key.txt",
         device: str = "auto",
     ) -> None:
         self.device = self._resolve_device(device)
         auth_token = read_hf_token(token, key_path)
-        self.pipeline = Pipeline.from_pretrained(model_id, token=auth_token)
         self.pipeline.to(self.device)
     @staticmethod
@@ -45,17 +63,37 @@ class DiarizationEngine:
             return torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
         raise ValueError("Giá trị device hợp lệ: auto, cpu, cuda.")
-    def diarize(self, audio_path: str | Path, show_progress: bool = True):
         audio_path = ensure_audio_path(audio_path)
-        if show_progress:
-            with ProgressHook() as hook:
-                return self.pipeline(str(audio_path), hook=hook)
-        return self.pipeline(str(audio_path))
     @staticmethod
-    def to_segments(diarization) -> List[Segment]:
         segments: List[Segment] = []
-        for segment, _, speaker in diarization.itertracks(yield_label=True):
             segments.append(
                 Segment(
                     start=float(segment.start),
@@ -65,11 +103,12 @@ class DiarizationEngine:
             )
         return segments
-    @staticmethod
-    def save_rttm(diarization, output_path: str | Path) -> Path:
         path = Path(output_path)
         path.parent.mkdir(parents=True, exist_ok=True)
-        diarization.write_rttm(path)
         return path
     def run(self, audio_path: str | Path, show_progress: bool = True) -> List[Segment]:

 from dataclasses import dataclass
 from pathlib import Path
+from typing import Iterable, List, Any, Dict, Optional
+import shutil
 import torch
 from pyannote.audio import Pipeline
 from pyannote.audio.pipelines.utils.hook import ProgressHook
+from .utils import ensure_audio_path, read_hf_token, convert_to_wav_16k
 @dataclass
         token: str | None = None,
         key_path: str | Path = "hugging_face_key.txt",
         device: str = "auto",
+        segmentation_params: Optional[Dict[str, float]] = None,
+        clustering_params: Optional[Dict[str, float]] = None,
     ) -> None:
         self.device = self._resolve_device(device)
         auth_token = read_hf_token(token, key_path)
+        pipeline = Pipeline.from_pretrained(model_id, token=auth_token)
+        params = pipeline.parameters()
+        # Giảm phân mảnh: chỉ cập nhật các khóa thực sự tồn tại để tránh lỗi.
+        seg_cfg = params.get("segmentation")
+        if seg_cfg:
+            default_seg = {"min_duration_on": 1.0, "min_duration_off": 0.8}
+            for k, v in default_seg.items():
+                if k in seg_cfg:
+                    seg_cfg[k] = v
+            if segmentation_params:
+                for k, v in segmentation_params.items():
+                    if k in seg_cfg:
+                        seg_cfg[k] = v
+        if clustering_params and "clustering" in params:
+            params["clustering"].update(clustering_params)
+        self.pipeline = pipeline.instantiate(params)
         self.pipeline.to(self.device)
     @staticmethod
             return torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
         raise ValueError("Giá trị device hợp lệ: auto, cpu, cuda.")
+    def diarize(
+        self, audio_path: str | Path, show_progress: bool = True, keep_audio: bool = False
+    ):
         audio_path = ensure_audio_path(audio_path)
+        prepared_path, tmpdir = convert_to_wav_16k(audio_path)
+        try:
+            if show_progress:
+                with ProgressHook() as hook:
+                    result = self.pipeline(str(prepared_path), hook=hook)
+            else:
+                result = self.pipeline(str(prepared_path))
+            if keep_audio:
+                return result, prepared_path, tmpdir
+            return result
+        finally:
+            if tmpdir and not keep_audio:
+                shutil.rmtree(tmpdir, ignore_errors=True)
     @staticmethod
+    def _get_annotation(diarization: Any):
+        """Hỗ trợ cả dạng trả về cũ (Annotation) và mới (có speaker_diarization)."""
+        if hasattr(diarization, "itertracks"):
+            return diarization
+        if hasattr(diarization, "speaker_diarization"):
+            return diarization.speaker_diarization
+        raise TypeError("Output pipeline không có Annotation hoặc speaker_diarization.")
+    def to_segments(self, diarization: Any) -> List[Segment]:
+        annotation = self._get_annotation(diarization)
         segments: List[Segment] = []
+        for segment, _, speaker in annotation.itertracks(yield_label=True):
             segments.append(
                 Segment(
                     start=float(segment.start),
             )
         return segments
+    def save_rttm(self, diarization: Any, output_path: str | Path) -> Path:
+        annotation = self._get_annotation(diarization)
         path = Path(output_path)
         path.parent.mkdir(parents=True, exist_ok=True)
+        with path.open("w", encoding="utf-8") as f:
+            annotation.write_rttm(f)
         return path
     def run(self, audio_path: str | Path, show_progress: bool = True) -> List[Segment]:

src/utils.py CHANGED Viewed

@@ -2,8 +2,11 @@ from __future__ import annotations
 import json
 import os
 from pathlib import Path
-from typing import Iterable, List
 def read_hf_token(token: str | None = None, key_path: str | Path = "hugging_face_key.txt") -> str:
@@ -46,6 +49,12 @@ def export_segments_json(segments: Iterable[dict], output_path: str | Path) -> P
     return path
 def format_segments_table(segments: Iterable[dict]) -> str:
     """Trả về chuỗi bảng đơn giản để in ra terminal."""
     lines = []
@@ -53,5 +62,112 @@ def format_segments_table(segments: Iterable[dict]) -> str:
         start = seg.get("start", 0.0)
         end = seg.get("end", 0.0)
         speaker = seg.get("speaker", "unknown")
-        lines.append(f"{idx:02d} | {start:7.2f}s -> {end:7.2f}s | speaker {speaker}")
     return "\n".join(lines)

 import json
 import os
+import shutil
+import subprocess
+import tempfile
 from pathlib import Path
+from typing import Iterable, List, Tuple
 def read_hf_token(token: str | None = None, key_path: str | Path = "hugging_face_key.txt") -> str:
     return path
def seconds_to_mmss(seconds: float) -> str:
    """Format a duration in seconds as ``MM:SS``.

    The value is rounded to the nearest whole second with ``round``.
    Negative input is clamped to ``00:00`` instead of producing strings
    such as ``-1:55``. Minutes are not wrapped at 60, so one hour renders
    as ``60:00``.
    """
    total_seconds = max(int(round(seconds)), 0)
    minutes, sec = divmod(total_seconds, 60)
    return f"{minutes:02d}:{sec:02d}"
 def format_segments_table(segments: Iterable[dict]) -> str:
     """Trả về chuỗi bảng đơn giản để in ra terminal."""
     lines = []
         start = seg.get("start", 0.0)
         end = seg.get("end", 0.0)
         speaker = seg.get("speaker", "unknown")
+        lines.append(
+            f"{idx:02d} | {seconds_to_mmss(start)} -> {seconds_to_mmss(end)} | speaker {speaker}"
+        )
     return "\n".join(lines)
def merge_adjacent_segments(
    segments: list[dict],
    max_gap: float = 0.5,
    min_duration: float = 1.0,
) -> list[dict]:
    """Merge consecutive same-speaker segments and drop short results.

    Segments are ordered by ``start``. A segment is folded into the previous
    group when it has the same speaker and starts at most ``max_gap``
    seconds after that group's end. Groups shorter than ``min_duration``
    seconds are then discarded. The input dicts are copied, not mutated.
    """
    if not segments:
        return []

    ordered = sorted(segments, key=lambda s: s.get("start", 0.0))
    groups: list[dict] = [ordered[0].copy()]
    for candidate in ordered[1:]:
        last = groups[-1]
        same_speaker = candidate.get("speaker") == last.get("speaker")
        if same_speaker and candidate.get("start", 0.0) - last.get("end", 0.0) <= max_gap:
            last["end"] = max(last.get("end", 0.0), candidate.get("end", 0.0))
        else:
            groups.append(candidate.copy())

    return [
        group
        for group in groups
        if group.get("end", 0.0) - group.get("start", 0.0) >= min_duration
    ]
def convert_to_wav_16k(audio_path: Path) -> Tuple[Path, Path | None]:
    """Prepare ``audio_path`` as mono 16 kHz WAV inside a fresh temp directory.

    When ffmpeg is not on PATH the file is copied unchanged (with spaces in
    its name replaced by underscores) instead of being converted.

    Args:
        audio_path: Source audio file; a ``str`` path is accepted too.

    Returns:
        ``(path to use for inference, temp directory the caller must remove)``.

    Raises:
        RuntimeError: If ffmpeg exits non-zero or produces no output file.

    The temp directory is removed before any error propagates.
    """
    audio_path = Path(audio_path)
    safe_stem = audio_path.stem.replace(" ", "_")
    tmpdir = Path(tempfile.mkdtemp(prefix="diarization_audio_"))
    try:
        if not shutil.which("ffmpeg"):
            # No ffmpeg: copy the file into the temp dir under a space-free name.
            output = tmpdir / audio_path.name.replace(" ", "_")
            shutil.copy2(audio_path, output)
            return output, tmpdir

        output = tmpdir / f"{safe_stem}_16k.wav"
        cmd = [
            "ffmpeg",
            "-y",
            "-i",
            str(audio_path),
            "-ac",
            "1",
            "-ar",
            "16000",
            "-vn",
            "-f",
            "wav",
            str(output),
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            stderr = result.stderr.strip()
            raise RuntimeError(f"ffmpeg convert thất bại: {stderr}")
        if not output.exists():
            raise RuntimeError("ffmpeg không tạo được file WAV.")
        return output, tmpdir
    except BaseException:
        # The caller never receives tmpdir on failure, so clean it up here.
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
def download_audio_from_url(url: str) -> Tuple[Path, Path]:
    """Download the audio track of a media URL as WAV using yt-dlp.

    Args:
        url: An http(s) link to the media page.

    Returns:
        ``(path of the WAV file, temp directory containing it)``.

    Raises:
        RuntimeError: If the URL is not http/https, yt-dlp or ffmpeg is
            missing, the download fails, or no WAV file is produced.

    The URL comes from a web form, so it is validated and passed after
    ``--`` to stop yt-dlp from reading it as a command-line option. The temp
    directory is removed before any error propagates.
    """
    from urllib.parse import urlparse

    url = (url or "").strip()
    try:
        scheme = urlparse(url).scheme.lower()
    except ValueError:
        scheme = ""
    if scheme not in ("http", "https"):
        raise RuntimeError("URL không hợp lệ: chỉ hỗ trợ liên kết http/https.")

    if not shutil.which("yt-dlp"):
        raise RuntimeError("Cần cài yt-dlp để tải liên kết (pip install yt-dlp).")
    if not shutil.which("ffmpeg"):
        raise RuntimeError("Cần cài ffmpeg để chuyển đổi audio.")

    tmpdir = Path(tempfile.mkdtemp(prefix="download_media_"))
    try:
        out_tmpl = tmpdir / "%(title)s.%(ext)s"
        cmd = [
            "yt-dlp",
            "-x",
            "--audio-format",
            "wav",
            "--audio-quality",
            "0",
            # Only one file is returned, so do not pull a whole playlist
            # when the link points at a video inside one.
            "--no-playlist",
            "-o",
            str(out_tmpl),
            "--",
            url,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"Tải audio thất bại: {result.stderr.strip()}")
        # Sorted so the choice is deterministic if several files appear.
        wav_files = sorted(tmpdir.glob("*.wav"))
        if not wav_files:
            raise RuntimeError("Không tìm thấy file WAV sau khi tải.")
        return wav_files[0], tmpdir
    except BaseException:
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise

test_gradio.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import gradio as gr
def greet(name):
    """Return the greeting ``Hello <name>!`` for the given name."""
    return "Hello {}!".format(name)
# Minimal smoke-test app: one text input echoed back through greet().
demo = gr.Interface(fn=greet, inputs="text", outputs="text")
if __name__ == "__main__":
    print("Starting Gradio server...")
    # launch() blocks while the server is running when executed as a script,
    # so the line after it is reached only once the server has shut down.
    demo.launch(server_name="0.0.0.0", server_port=7860)
    print("Server stopped.")