Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion scripts/fetch-caption-model.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ import { fileURLToPath } from "node:url";

const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..");
const OUT = path.join(ROOT, "caption-assets");
const MODEL_ID = "Xenova/whisper-tiny";
// whisper-small: tiny's transcription quality (esp. with language autodetect on
// non-English audio) was too unreliable to ship as the only captioning path.
const MODEL_ID = "Xenova/whisper-small";
const HF_BASE = `https://huggingface.co/${MODEL_ID}/resolve/main`;

// Small config/tokenizer/preprocessor files plus the quantized ONNX the ASR pipeline loads by
Expand Down
58 changes: 57 additions & 1 deletion src/components/video-editor/VideoEditor.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,45 @@ function getVideoDurationMs(sourcePath: string): Promise<number> {

const CAPTION_WORD_CHOICES = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] as const;

// Spoken-language choices offered in the auto-captions dialog.
//
// Values are Whisper language names (transformers.js); labels are native names so
// they need no translation. Forcing the language skips Whisper's detection pass.
//
// `value` is what the language <Select> stores and what is forwarded as the
// `language` transcription option; `label` is the text rendered for each item.
// `as const` keeps every entry readonly with literal-typed fields.
const CAPTION_LANGUAGES = [
{ value: "english", label: "English" },
{ value: "portuguese", label: "Português" },
{ value: "spanish", label: "Español" },
{ value: "french", label: "Français" },
{ value: "italian", label: "Italiano" },
{ value: "german", label: "Deutsch" },
{ value: "japanese", label: "日本語" },
{ value: "korean", label: "한국어" },
{ value: "russian", label: "Русский" },
{ value: "turkish", label: "Türkçe" },
{ value: "vietnamese", label: "Tiếng Việt" },
{ value: "chinese", label: "中文" },
{ value: "arabic", label: "العربية" },
{ value: "hindi", label: "हिन्दी" },
] as const;

/**
 * UI locale tag → Whisper language name used as the default spoken language
 * for auto-captions. Keys are the locale tags the app ships translations for.
 */
const LOCALE_TO_CAPTION_LANGUAGE: Record<string, string> = {
  en: "english",
  "pt-BR": "portuguese",
  es: "spanish",
  fr: "french",
  it: "italian",
  "ja-JP": "japanese",
  "ko-KR": "korean",
  ru: "russian",
  tr: "turkish",
  vi: "vietnamese",
  "zh-CN": "chinese",
  "zh-TW": "chinese",
  ar: "arabic",
};

/**
 * Returns the lowercased primary language subtag of a locale tag
 * (e.g. "pt-BR" → "pt", "zh_TW" → "zh", "en" → "en").
 */
function primaryLanguageSubtag(locale: string): string {
  // Accept both "-" (BCP 47) and "_" (POSIX-style) separators.
  const [primary = ""] = locale.split(/[-_]/, 1);
  return primary.toLowerCase();
}

/**
 * Primary language subtag → Whisper language name, derived from
 * `LOCALE_TO_CAPTION_LANGUAGE`. When several table keys share a primary
 * subtag, the first one listed wins.
 */
const CAPTION_LANGUAGE_BY_PRIMARY_SUBTAG: ReadonlyMap<string, string> = (() => {
  const byPrimary = new Map<string, string>();
  for (const [tag, language] of Object.entries(LOCALE_TO_CAPTION_LANGUAGE)) {
    const primary = primaryLanguageSubtag(tag);
    if (!byPrimary.has(primary)) {
      byPrimary.set(primary, language);
    }
  }
  return byPrimary;
})();

/**
 * Picks the default caption (Whisper) language for a UI locale.
 *
 * Resolution order:
 *  1. exact match on the locale tag;
 *  2. match on the primary language subtag, case-insensitively, so regional
 *     variants such as "pt-PT", "en-US" or "ja" still resolve;
 *  3. "english" when nothing matches.
 *
 * @param locale UI locale tag, e.g. "pt-BR" or "fr".
 * @returns A Whisper language name; never empty.
 */
function captionLanguageForLocale(locale: string): string {
  // Own-property check so inherited keys ("constructor", "toString", …) can
  // never leak a non-string value out of the plain-object table.
  const exact = Object.prototype.hasOwnProperty.call(LOCALE_TO_CAPTION_LANGUAGE, locale)
    ? LOCALE_TO_CAPTION_LANGUAGE[locale]
    : undefined;
  return (
    exact ?? CAPTION_LANGUAGE_BY_PRIMARY_SUBTAG.get(primaryLanguageSubtag(locale)) ?? "english"
  );
}

export default function VideoEditor() {
const {
state: editorState,
Expand Down Expand Up @@ -353,6 +392,7 @@ export default function VideoEditor() {
const effectiveShowCursor = showCursor && hasEditableCursorRecording;
const showCursorSettings = hasEditableCursorRecording;
const { locale, setLocale, t: rawT } = useI18n();
const [captionLanguage, setCaptionLanguage] = useState(() => captionLanguageForLocale(locale));
const t = useScopedT("editor");
const ts = useScopedT("settings");
const availableLocales = getAvailableLocales();
Expand Down Expand Up @@ -2606,6 +2646,7 @@ export default function VideoEditor() {
const trimRegionsForTranscribe = shiftTrimRegionsMsForCaptionBuffer(trimRegions, trimMs);

const transcribeOptions = {
language: captionLanguage,
onStatus: (phase: "model" | "transcribe") => {
if (phase === "model") {
toast.loading(t("autoCaptions.loadingModel"), {
Expand Down Expand Up @@ -2700,7 +2741,7 @@ export default function VideoEditor() {
setIsAutoCaptioning(false);
}
},
[videoPath, trimRegions, pushState, t],
[videoPath, trimRegions, pushState, t, captionLanguage],
);

const handleSaveDiagnostic = useCallback(async () => {
Expand Down Expand Up @@ -2780,6 +2821,21 @@ export default function VideoEditor() {
<DialogDescription>{t("autoCaptions.dialogDescription")}</DialogDescription>
</DialogHeader>
<div className="grid gap-4 py-2">
<div className="grid gap-2">
<Label htmlFor="caption-language">{t("autoCaptions.language")}</Label>
<Select value={captionLanguage} onValueChange={setCaptionLanguage}>
<SelectTrigger id="caption-language" className="h-9">
<SelectValue />
</SelectTrigger>
<SelectContent>
{CAPTION_LANGUAGES.map((lang) => (
<SelectItem key={lang.value} value={lang.value}>
{lang.label}
</SelectItem>
))}
</SelectContent>
</Select>
</div>
<div className="grid gap-2">
<Label htmlFor="caption-min-words">{t("autoCaptions.minWords")}</Label>
<Select
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/ar/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"noneHeard": "لم يتم الكشف عن أي كلام.",
"noAudio": "لا يحتوي هذا الفيديو على صوت صالح للنسخ.",
"failed": "تعذّر توليد التسميات.",
"truncated": "تم نسخ الدقائق الأولى فقط: {{minutes}} دقيقة."
"truncated": "تم نسخ الدقائق الأولى فقط: {{minutes}} دقيقة.",
"language": "Spoken language"
},
"emptyState": {
"title": "لا يوجد مشروع مفتوح",
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/en/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"noneHeard": "No speech was detected.",
"noAudio": "This video has no usable audio to transcribe.",
"failed": "Could not generate captions.",
"truncated": "Only the first {{minutes}} minutes were transcribed."
"truncated": "Only the first {{minutes}} minutes were transcribed.",
"language": "Spoken language"
},
"emptyState": {
"title": "No project open",
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/es/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"noneHeard": "No se detectó voz.",
"noAudio": "Este video no tiene audio utilizable para transcribir.",
"failed": "No se pudieron generar los subtítulos.",
"truncated": "Solo se transcribieron los primeros {{minutes}} minutos."
"truncated": "Solo se transcribieron los primeros {{minutes}} minutos.",
"language": "Spoken language"
},
"emptyState": {
"title": "No hay proyecto abierto",
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/fr/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"noneHeard": "Aucune parole n'a été détectée.",
"noAudio": "Cette vidéo ne contient pas d'audio exploitable pour la transcription.",
"failed": "Impossible de générer les sous-titres.",
"truncated": "Seules les {{minutes}} premières minutes ont été transcrites."
"truncated": "Seules les {{minutes}} premières minutes ont été transcrites.",
"language": "Spoken language"
},
"emptyState": {
"title": "Aucun projet ouvert",
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/it/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@
"noneHeard": "Nessun parlato rilevato.",
"noAudio": "Questo video non contiene audio utilizzabile per la trascrizione.",
"failed": "Impossibile generare i sottotitoli.",
"truncated": "Sono stati trascritti solo i primi {{minutes}} minuti."
"truncated": "Sono stati trascritti solo i primi {{minutes}} minuti.",
"language": "Spoken language"
},
"loadingEditor": "Loading editor...",
"emptyState": {
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/ja-JP/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"noneHeard": "音声が検出されませんでした。",
"noAudio": "この動画には書き起こしに使える音声がありません。",
"failed": "キャプションを生成できませんでした。",
"truncated": "最初の {{minutes}} 分のみが書き起こされました。"
"truncated": "最初の {{minutes}} 分のみが書き起こされました。",
"language": "Spoken language"
},
"emptyState": {
"title": "プロジェクトが開かれていません",
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/ko-KR/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"noneHeard": "음성이 감지되지 않았습니다.",
"noAudio": "이 동영상에는 전사에 사용할 수 있는 음성이 없습니다.",
"failed": "자막을 생성할 수 없습니다.",
"truncated": "처음 {{minutes}}분만 전사되었습니다."
"truncated": "처음 {{minutes}}분만 전사되었습니다.",
"language": "Spoken language"
},
"emptyState": {
"title": "열린 프로젝트 없음",
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/pt-BR/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@
"noneHeard": "Nenhuma fala foi detectada.",
"noAudio": "Este vídeo não tem áudio utilizável para transcrição.",
"failed": "Não foi possível gerar as legendas.",
"truncated": "Apenas os primeiros {{minutes}} minutos foram transcritos."
"truncated": "Apenas os primeiros {{minutes}} minutos foram transcritos.",
"language": "Spoken language"
},
"loadingEditor": "Loading editor...",
"emptyState": {
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/ru/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"noneHeard": "Речь не обнаружена.",
"noAudio": "В этом видео нет звука, пригодного для расшифровки.",
"failed": "Не удалось создать субтитры.",
"truncated": "Расшифрованы только первые {{minutes}} мин."
"truncated": "Расшифрованы только первые {{minutes}} мин.",
"language": "Spoken language"
},
"emptyState": {
"title": "Нет открытых проектов",
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/tr/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"noneHeard": "Konuşma algılanmadı.",
"noAudio": "Bu videoda yazıya dökülebilecek kullanılabilir bir ses yok.",
"failed": "Altyazılar oluşturulamadı.",
"truncated": "Yalnızca ilk {{minutes}} dakika yazıya döküldü."
"truncated": "Yalnızca ilk {{minutes}} dakika yazıya döküldü.",
"language": "Spoken language"
},
"emptyState": {
"title": "Açık proje yok",
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/vi/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"noneHeard": "Không phát hiện thấy lời nói.",
"noAudio": "Video này không có âm thanh dùng được để chuyển thành văn bản.",
"failed": "Không thể tạo phụ đề.",
"truncated": "Chỉ {{minutes}} phút đầu tiên được chuyển thành văn bản."
"truncated": "Chỉ {{minutes}} phút đầu tiên được chuyển thành văn bản.",
"language": "Spoken language"
},
"emptyState": {
"title": "Không có dự án nào được mở",
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/zh-CN/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"noneHeard": "未检测到语音。",
"noAudio": "此视频没有可用于转写的音频。",
"failed": "无法生成字幕。",
"truncated": "仅转写了最前 {{minutes}} 分钟。"
"truncated": "仅转写了最前 {{minutes}} 分钟。",
"language": "Spoken language"
},
"emptyState": {
"title": "未打开任何项目",
Expand Down
3 changes: 2 additions & 1 deletion src/i18n/locales/zh-TW/editor.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@
"noneHeard": "未偵測到語音。",
"noAudio": "此影片沒有可用於轉寫的音訊。",
"failed": "無法產生字幕。",
"truncated": "僅轉寫了最前 {{minutes}} 分鐘。"
"truncated": "僅轉寫了最前 {{minutes}} 分鐘。",
"language": "Spoken language"
},
"emptyState": {
"title": "未開啟任何專案",
Expand Down
5 changes: 5 additions & 0 deletions src/lib/captioning/transcribe.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ export interface TranscribeWorkerRequest {
useLocalModels: boolean;
/** Base URL of bundled resources (packaged: resourcesPath file:// URL); used when `useLocalModels`. */
assetBaseUrl?: string;
/** Whisper language name (e.g. "portuguese"); skips autodetection when set. */
language?: string;
}

/** Messages the transcription worker posts back to the renderer. */
Expand All @@ -47,6 +49,8 @@ export function transcribeMono16kToSegments(
trimRegions?: TrimRegion[];
onStatus?: (phase: "model" | "transcribe") => void;
signal?: AbortSignal;
/** Whisper language name (e.g. "portuguese"); skips autodetection when set. */
language?: string;
},
): Promise<TranscribeMono16kResult> {
if (options?.signal?.aborted) {
Expand Down Expand Up @@ -100,6 +104,7 @@ export function transcribeMono16kToSegments(
trimRegions: options?.trimRegions ?? [],
useLocalModels,
assetBaseUrl,
language: options?.language,
};
worker.postMessage(request);
});
Expand Down
11 changes: 7 additions & 4 deletions src/lib/captioning/transcribe.worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,20 @@ async function loadTranscriber(opts: {
// Dev (http://localhost): fetch from the remote CDN, which works there.
env.allowLocalModels = false;
}
// Default tiny weights only: the `output_attentions` revision regresses inference in
// some environments (empty chunks, thrown errors) while phrase mode works on this model.
// Default weights only: the `output_attentions` revision regresses inference in
// some environments (empty chunks, thrown errors) while phrase mode works.
// whisper-small over tiny: tiny's accuracy (especially on non-English audio)
// was too unreliable; small matches the fork's proven extract-subtitles setup.
const transcriber = (await pipeline(
"automatic-speech-recognition",
"Xenova/whisper-tiny",
"Xenova/whisper-small",
)) as unknown as TranscriberFn;
return transcriber;
});
}

self.onmessage = async (event: MessageEvent<TranscribeWorkerRequest>) => {
const { samples, trimRegions, useLocalModels, assetBaseUrl } = event.data;
const { samples, trimRegions, useLocalModels, assetBaseUrl, language } = event.data;
try {
post({ type: "status", phase: "model" });
const transcriber = await loadTranscriber({ useLocalModels, assetBaseUrl });
Expand All @@ -84,6 +86,7 @@ self.onmessage = async (event: MessageEvent<TranscribeWorkerRequest>) => {
transcriber,
samples,
trimRegions ?? [],
language,
);

post({ type: "result", segments, granularity });
Expand Down
9 changes: 8 additions & 1 deletion src/lib/captioning/transcribeCore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,15 +127,19 @@ function segmentsFromTranscriberChunks(
/**
 * Runs the ASR transcriber once over a single slice of audio samples.
 *
 * @param transcriber Callable ASR pipeline; invoked with the samples and an options object.
 * @param samples Audio samples for this slice; duration is derived as `length / 16_000`,
 *   i.e. the samples are treated as 16,000 per second.
 * @param opts `forceFullSequences` is forwarded as `force_full_sequences`;
 *   `timestampMode` selects word-level (`"word"`) or default (`true`) timestamps;
 *   `language`, when set, is forwarded together with `task: "transcribe"`.
 * @returns Whatever the transcriber returns, untyped.
 */
async function runTranscriberOnSlice(
transcriber: TranscriberFn,
samples: Float32Array,
// NOTE(review): two `opts` parameter lines follow; the first looks like the
// pre-change signature carried over from a diff view — confirm and keep only the second.
opts: { forceFullSequences: boolean; timestampMode: "word" | "phrase" },
opts: { forceFullSequences: boolean; timestampMode: "word" | "phrase"; language?: string },
): Promise<unknown> {
const durationSec = samples.length / 16_000;
// Only chunk long clips; short-audio chunking regressed some Whisper.js runs (empty chunks).
const chunking = durationSec > 30 ? { chunk_length_s: 30, stride_length_s: 5 } : {};
// Forcing the language skips Whisper's detection pass, which is the dominant
// failure mode on non-English audio (a misdetect degrades the whole transcript).
// An unset or empty `opts.language` contributes no keys, leaving the transcriber's defaults in place.
const language = opts.language ? { language: opts.language, task: "transcribe" } : {};
return transcriber(samples, {
// "word" requests word-level timestamps; any other mode passes `true`.
return_timestamps: opts.timestampMode === "word" ? "word" : true,
force_full_sequences: opts.forceFullSequences,
...chunking,
...language,
});
}

Expand Down Expand Up @@ -185,6 +189,7 @@ export async function runTranscription(
transcriber: TranscriberFn,
samples: Float32Array,
trims: TrimRegion[],
language?: string,
): Promise<TranscribeMono16kResult> {
const transcribeOne = async (
ignoreTrims: boolean,
Expand All @@ -198,6 +203,7 @@ export async function runTranscription(
const result = await runTranscriberOnSlice(transcriber, slice, {
forceFullSequences,
timestampMode,
language,
});
return segmentsFromTranscriberChunks(
extractChunksFromAsrResult(result),
Expand All @@ -223,6 +229,7 @@ export async function runTranscription(
const result = await runTranscriberOnSlice(transcriber, slice, {
forceFullSequences,
timestampMode,
language,
});
const tOff = offset / 16_000;
all.push(
Expand Down
Loading