diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 32211545c..04bff26aa 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -39,7 +39,7 @@ jobs: strategy: fail-fast: false matrix: - test_suite: [ types-package, client, api-1, api-2, api-3, api-4, api-5, cli-plugin, lint, external-plugins ] + test_suite: [ types-package, client, api-1, api-2, api-3, api-4, api-5, transcription, cli-plugin, lint, external-plugins ] env: PGUSER: peertube diff --git a/.gitignore b/.gitignore index 55707fb80..6865442eb 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ node_modules *npm-debug.log yarn-error.log +*-ci.log .yarn # Testing diff --git a/apps/peertube-runner/tsconfig.json b/apps/peertube-runner/tsconfig.json index 03660b0eb..776e1ab5c 100644 --- a/apps/peertube-runner/tsconfig.json +++ b/apps/peertube-runner/tsconfig.json @@ -11,6 +11,7 @@ { "path": "../../packages/ffmpeg" }, { "path": "../../packages/models" }, { "path": "../../packages/node-utils" }, - { "path": "../../packages/server-commands" } + { "path": "../../packages/server-commands" }, + { "path": "../../packages/transcription" }, ] } diff --git a/package.json b/package.json index cbabdfc35..e04151785 100644 --- a/package.json +++ b/package.json @@ -25,6 +25,7 @@ ], "scripts": { "benchmark-server": "tsx --conditions=peertube:tsx ./scripts/benchmark.ts", + "benchmark-transcription": "tsx --conditions=peertube:tsx --tsconfig ./packages/transcription/tsconfig.json ./packages/transcription/src/benchmark.ts", "build:client": "bash ./scripts/build/client.sh", "build:embed": "bash ./scripts/build/embed.sh", "build:peertube-cli": "bash ./scripts/build/peertube-cli.sh", diff --git a/packages/core-utils/src/common/date.ts b/packages/core-utils/src/common/date.ts index bddd5da23..ffb959868 100644 --- a/packages/core-utils/src/common/date.ts +++ b/packages/core-utils/src/common/date.ts @@ -125,7 +125,7 @@ function secondsToTime (options: { else if (minutes >= 1) time += formatNumber(minutes) + minuteSymbol else if (format === 'full') time += '00' + minuteSymbol - seconds %= 60 + seconds = Math.round(seconds) % 60 if (seconds >= 1 && seconds < 10 && format === 'full') time += '0' + seconds + secondsSymbol else if (seconds >= 1) time += formatNumber(seconds) + secondsSymbol else if (format === 'full') time += '00' @@ -133,6 +133,14 @@ function secondsToTime (options: { return time } +function millisecondsToTime (options: { + seconds: number + format: 'short' | 'full' | 'locale-string' // default 'short' + symbol?: string +} | number) { + return secondsToTime(typeof options === 'number' ? options / 1000 : { ...options, seconds: options.seconds / 1000 }) +} + // --------------------------------------------------------------------------- export { @@ -143,7 +151,8 @@ export { isLastMonth, isLastWeek, timeToInt, - secondsToTime + secondsToTime, + millisecondsToTime } // --------------------------------------------------------------------------- diff --git a/packages/jiwer/README.md b/packages/jiwer/README.md new file mode 100644 index 000000000..663c399bf --- /dev/null +++ b/packages/jiwer/README.md @@ -0,0 +1,37 @@ +JiWER +===== +__JiWER__ CLI NodeJs wrapper. 
+
+> *JiWER is a python tool for computing the word-error-rate of ASR systems.*
+> https://jitsi.github.io/jiwer/cli/
+
+__JiWER__ serves as a reference implementation to calculate error rates between two text files:
+- WER (Word Error Rate)
+- CER (Character Error Rate)
+
+Build
+-----
+
+```sh
+npm run build
+```
+
+Usage
+-----
+```typescript
+import { JiwerClI } from '@peertube/peertube-jiwer'
+
+const jiwerCLI = new JiwerClI('./reference.txt', './hypothesis.txt')
+
+// WER as a percentage, ex: 0.03 -> 3%
+console.log(await jiwerCLI.wer())
+
+// CER as a percentage, ex: 0.01 -> 1%
+console.log(await jiwerCLI.cer())
+
+// Detailed comparison report
+console.log(await jiwerCLI.alignment())
+```
+
+Resources
+---------
+- https://jitsi.github.io/jiwer/
+- https://github.com/rapidfuzz/RapidFuzz
diff --git a/packages/jiwer/package.json b/packages/jiwer/package.json
new file mode 100644
index 000000000..b01476956
--- /dev/null
+++ b/packages/jiwer/package.json
@@ -0,0 +1,20 @@
+{
+  "name": "@peertube/peertube-jiwer",
+  "private": true,
+  "version": "0.0.0",
+  "main": "dist/index.js",
+  "files": [ "dist" ],
+  "exports": {
+    "types": "./dist/index.d.ts",
+    "peertube:tsx": "./src/index.ts",
+    "default": "./dist/index.js"
+  },
+  "type": "module",
+  "devDependencies": {},
+  "scripts": {
+    "preinstall": "pip install -r requirements.txt",
+    "build": "tsc",
+    "watch": "tsc -w"
+  },
+  "dependencies": {}
+}
diff --git a/packages/jiwer/requirements.txt b/packages/jiwer/requirements.txt
new file mode 100644
index 000000000..53d9fb34b
--- /dev/null
+++ b/packages/jiwer/requirements.txt
@@ -0,0 +1 @@
+jiwer==3.0.4
diff --git a/packages/jiwer/src/index.ts b/packages/jiwer/src/index.ts
new file mode 100644
index 000000000..586fd0843
--- /dev/null
+++ b/packages/jiwer/src/index.ts
@@ -0,0 +1 @@
+export * from './jiwer-cli.js'
diff --git a/packages/jiwer/src/jiwer-cli.ts b/packages/jiwer/src/jiwer-cli.ts
new file mode 100644
index 000000000..8ce87f2b1
--- /dev/null
+++ b/packages/jiwer/src/jiwer-cli.ts
@@ -0,0 +1,69 @@
+import { $ } from 'execa'
+
+export class JiwerClI {
+  referenceFilePath: string
+  hypothesisFilePath: string
+
+  constructor (referenceFilePath: string, hypothesisFilePath: string) {
+    this.referenceFilePath = referenceFilePath
+    this.hypothesisFilePath = hypothesisFilePath
+  }
+
+  /**
+   * @param referenceFilePath Path to new-line delimited text file of reference sentences.
+   * @param hypothesisFilePath Path to new-line delimited text file of hypothesis sentences.
+   * @param args Additional jiwer CLI flags, appended verbatim (e.g. '--cer', '-g')
+   */
+  static buildArgs (referenceFilePath: string, hypothesisFilePath: string, ...args: (string | false)[]) {
+    return [
+      '--reference',
+      referenceFilePath,
+      '--hypothesis',
+      hypothesisFilePath,
+      // Drop falsy flags such as the `global && '-g'` pattern used below: execa rejects non-string values
+      ...args.filter((arg): arg is string => arg !== false)
+    ]
+  }
+
+  buildArgs (...args: (string | false)[]) {
+    return JiwerClI.buildArgs(this.referenceFilePath, this.hypothesisFilePath, ...args)
+  }
+
+  /**
+   * WER: Word Error Rate as a percentage, ex: 0.03 -> 3%
+   */
+  static async wer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
+    const { stdout: wer } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, global && '-g')}`
+
+    return Number(wer)
+  }
+
+  async wer (global = true) {
+    return await JiwerClI.wer(this.referenceFilePath, this.hypothesisFilePath, global)
+  }
+
+  /**
+   * CER: Character Error Rate as a percentage, ex: 0.01 -> 1%
+   */
+  static async cer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
+    const { stdout: cer } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--cer', global && '-g')}`
+
+    return Number(cer)
+  }
+
+  async cer (global = true) {
+    return await JiwerClI.cer(this.referenceFilePath, this.hypothesisFilePath, global)
+  }
+
+  /**
+   * Print alignment of each sentence.
+   */
+  static async alignment (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<string> {
+    const { stdout: alignment } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--align', global && '-g')}`
+
+    return alignment
+  }
+
+  async alignment (global = true) {
+    return await JiwerClI.alignment(this.referenceFilePath, this.hypothesisFilePath, global)
+  }
+}
diff --git a/packages/jiwer/tsconfig.json b/packages/jiwer/tsconfig.json
new file mode 100644
index 000000000..58fa2330b
--- /dev/null
+++ b/packages/jiwer/tsconfig.json
@@ -0,0 +1,8 @@
+{
+  "extends": "../../tsconfig.base.json",
+  "compilerOptions": {
+    "outDir": "./dist",
+    "rootDir": "src",
+    "tsBuildInfoFile": "./dist/.tsbuildinfo"
+  }
+}
diff --git a/packages/node-utils/src/uuid.ts b/packages/node-utils/src/uuid.ts
index f158ec487..68110eb0e 100644
--- a/packages/node-utils/src/uuid.ts
+++ b/packages/node-utils/src/uuid.ts
@@ -1,4 +1,4 @@
-import short from 'short-uuid'
+import short, { SUUID } from 'short-uuid'
 
 const translator = short()
 
@@ -6,6 +6,10 @@ function buildUUID () {
   return short.uuid()
 }
 
+function buildSUUID (): SUUID {
+  return short.generate()
+}
+
 function uuidToShort (uuid: string) {
   if (!uuid) return uuid
 
@@ -26,7 +30,10 @@ function isShortUUID (value: string) {
 
 export {
   buildUUID,
+  buildSUUID,
   uuidToShort,
   shortToUUID,
   isShortUUID
 }
+
+export type { SUUID }
diff --git a/packages/server-commands/src/requests/requests.ts b/packages/server-commands/src/requests/requests.ts
index 9e77ff282..49b7f9ce6 100644
--- a/packages/server-commands/src/requests/requests.ts
+++ b/packages/server-commands/src/requests/requests.ts
@@ -59,6 +59,15 @@ export function makeRawRequest (options: {
   return makeGetRequest(reqOptions)
 }
 
+export const makeFileRequest = (url: string) => {
+  return makeRawRequest({
+    url,
+    responseType: 'arraybuffer',
+    redirects: 1,
+    expectedStatus: HttpStatusCode.OK_200
+  })
+}
+
 export function makeGetRequest (options: CommonRequestParams & {
   query?: any
   rawQuery?: string
diff --git a/packages/tests/fixtures/transcription/hello_world.zip b/packages/tests/fixtures/transcription/hello_world.zip
new file mode 100644
index 000000000..c99f72a38
Binary files /dev/null and b/packages/tests/fixtures/transcription/hello_world.zip differ
diff --git a/packages/tests/fixtures/transcription/videos/README.md b/packages/tests/fixtures/transcription/videos/README.md new file mode 100644 index 000000000..b892a7553 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/README.md @@ -0,0 +1,16 @@ +đŸ‡«đŸ‡· DRANE Occitanie - Communiquer lors d'une classe transplantĂ©e +[./communiquer-lors-dune-classe-transplantee.mp4](communiquer-lors-dune-classe-transplantee.mp4) +> https://podeduc.apps.education.fr/numerique-educatif/video/21893-communiquer-lors-dune-classe-transplantee/ +> +> CC BY-NC-SA 4.0 Deed +> Attribution-NonCommercial-ShareAlike 4.0 International + +đŸ‡«đŸ‡· [Accompagner la victime d'une dĂ©rive sectaire ou d'une emprise mentale](https://www.fun-mooc.fr/fr/cours/accompagner-la-victime-de-derive-sectaire/) +> Centre Contre les Manipulations Mentales (CCMM) +> [CC BY-NC-ND 4.0 Deed](https://creativecommons.org/licenses/by-nc-nd/4.0/) +> Attribution-NonCommercial-NoDerivs 4.0 International + +đŸ‡ș🇾 [The Last Man On Earth (1964)](https://archive.org/details/TheLastManOnEarthHD) +> PDM 1.0 Deed +> Public Domain Mark 1.0 Universal +> https://creativecommons.org/publicdomain/mark/1.0/ diff --git a/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.mp4 b/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.mp4 new file mode 100644 index 000000000..4ef5b6ee1 Binary files /dev/null and b/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.mp4 differ diff --git a/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.txt b/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.txt new file mode 100644 index 000000000..1c700efa1 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.txt @@ -0,0 +1,10 @@ +Communiquer lors d'une classe transplantĂ©e. Utiliser les photos prises lors de cette classe pour raconter quotidiennement le sĂ©jour vĂ©cu. +C'est le scĂ©nario pĂ©dagogique prĂ©sentĂ© par Monsieur Navoli, professeur en cycle 3 sur une Ă©cole Ă©lĂ©mentaire de Montpellier. +La premiĂšre application utilisĂ©e sera la mĂ©diathĂšque. L'enseignant va alors transfĂ©rer les diffĂ©rentes photos rĂ©alisĂ©es lors de la classe transplantĂ©e. +Dans un dossier spĂ©cifique pour que les Ă©lĂšves puissent le retrouver plus facilement. Il tĂ©lĂ©verse donc ses photos dans le dossier, dans l'ENT, dans la mĂ©diathĂšque de la classe. +Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser. +Les Ă©lĂšves par la suite utiliseront le blog, Ă  partir de leurs notes, il pourront, seul ou Ă  2 par poste rĂ©diger un article dans leur ENT. +Ils illustreront ces articles Ă  l'aide des photos et documents numĂ©riques mis en accĂšs libre dans l'ENT. +Pour ce faire, il pourront utiliser l'Ă©diteur avancĂ© qui les renverra directement dans la mĂ©diathĂšque de la classe, oĂč ils pourront retrouver le dossier crĂ©Ă© par leur enseignant. +Une fois leur article terminĂ©, les Ă©lĂšves soumettront celui-ci au professeur qui pourra soit l'annoter pour correction ou le publier. +Ensuite, il pourront lire et commenter ceux de leurs camarades, ou rĂ©pondre aux commentaires de la veille. 
diff --git a/packages/tests/fixtures/transcription/videos/derive_sectaire.mp4 b/packages/tests/fixtures/transcription/videos/derive_sectaire.mp4 new file mode 100644 index 000000000..f6969a62b Binary files /dev/null and b/packages/tests/fixtures/transcription/videos/derive_sectaire.mp4 differ diff --git a/packages/tests/fixtures/transcription/videos/derive_sectaire.srt b/packages/tests/fixtures/transcription/videos/derive_sectaire.srt new file mode 100644 index 000000000..d7d14d2e4 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/derive_sectaire.srt @@ -0,0 +1,165 @@ +ï»ż +1 +00:00:03,640 --> 00:00:05,640 +-Bonjour et bienvenue sur FUN MOOC. + +2 +00:00:05,960 --> 00:00:09,000 +Notre MOOC "Comment parler +Ă  une victime d'emprise mentale + +3 +00:00:09,320 --> 00:00:10,400 +ou de dĂ©rive sectaire" + +4 +00:00:10,720 --> 00:00:13,840 +s'adresse Ă  tout professionnel +du domaine de la santĂ©, + +5 +00:00:14,160 --> 00:00:15,920 +de l'associatif, du juridique, + +6 +00:00:16,240 --> 00:00:18,800 +qui pourra ĂȘtre en contact +avec une victime de telles dĂ©rives. + +7 +00:00:21,720 --> 00:00:23,840 +Il sera composĂ© de 14 leçons vidĂ©o + +8 +00:00:24,160 --> 00:00:26,040 +d'une dizaine de minutes + +9 +00:00:26,360 --> 00:00:28,600 +divisĂ©es en quatre blocs. + +10 +00:00:31,800 --> 00:00:34,960 +Le premier bloc vous informera +de ce que sont exactement + +11 +00:00:35,280 --> 00:00:37,720 +l'emprise mentale +et une dĂ©rive sectaire. + +12 +00:00:38,040 --> 00:00:42,440 +-Ça consiste toujours +en une forme de manipulation + +13 +00:00:43,520 --> 00:00:47,320 +qui conduit Ă  une dĂ©pendance, +Ă  une sorte de cercle vicieux, + +14 +00:00:47,640 --> 00:00:51,200 +oĂč les personnes ne parviennent pas +Ă  se dĂ©sengager d'un processus + +15 +00:00:51,520 --> 00:00:54,120 +qui les conduit +soit Ă  donner de l'argent, + +16 +00:00:54,440 --> 00:00:56,160 +Ă  se livrer Ă  des actes + +17 +00:00:56,480 --> 00:00:58,480 +qu'en rĂ©alitĂ© +ils n'auraient pas acceptĂ©s, + +18 +00:00:58,800 --> 00:01:02,160 +ou, tout simplement, Ă  accepter +de participer Ă  une organisation + +19 +00:01:02,480 --> 00:01:03,760 +dont ils ne partagent pas + +20 +00:01:04,080 --> 00:01:06,040 +toutes les mĂ©thodes +ou tous les points de vue. + +21 +00:01:06,360 --> 00:01:10,080 +-Le deuxiĂšme bloc vous informera +des bonnes techniques d'Ă©coute + +22 +00:01:10,400 --> 00:01:12,680 +d'une personne +ayant vĂ©cu de tels traumatismes. + +23 +00:01:13,000 --> 00:01:14,760 +-C'est un sujet actuel + +24 +00:01:15,080 --> 00:01:17,320 +parce que ce phĂ©nomĂšne +est en croissance. + +25 +00:01:17,640 --> 00:01:20,000 +Il y a une augmentation trĂšs importante, +un doublement, + +26 +00:01:20,320 --> 00:01:21,400 +en l'espace de quelques annĂ©es, + +27 +00:01:21,720 --> 00:01:22,960 +en moins de 10 ans. + +28 +00:01:27,200 --> 00:01:31,000 +-Le bloc 3, lui, +sera conçu par nos juristes + +29 +00:01:31,320 --> 00:01:34,080 +pour vous indiquer +quelles sont les grandes infractions + +30 +00:01:34,400 --> 00:01:36,960 +en lien avec l'emprise mentale, + +31 +00:01:37,280 --> 00:01:39,120 +et surtout, pouvoir faire +une analyse perspicace + +32 +00:01:39,440 --> 00:01:41,640 +d'une situation individuelle. + +33 +00:01:43,760 --> 00:01:46,960 +Enfin, le bloc 4 vous assistera + +34 +00:01:47,280 --> 00:01:50,320 +pour savoir comment aiguiller +une victime + +35 +00:01:50,640 --> 00:01:52,400 +vers les bons professionnels. + +36 +00:01:53,160 --> 00:01:54,040 +Bonne formation. 
+ diff --git a/packages/tests/fixtures/transcription/videos/derive_sectaire.txt b/packages/tests/fixtures/transcription/videos/derive_sectaire.txt new file mode 100644 index 000000000..4f85cd324 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/derive_sectaire.txt @@ -0,0 +1,11 @@ +ï»ż-Bonjour et bienvenue sur FUN MOOC. +Notre MOOC "Comment parler Ă  une victime d'emprise mentale ou de dĂ©rive sectaire" s'adresse Ă  tout professionnel du domaine de la santĂ©, de l'associatif, du juridique, qui pourra ĂȘtre en contact avec une victime de telles dĂ©rives. +Il sera composĂ© de 14 leçons vidĂ©o d'une dizaine de minutes divisĂ©es en quatre blocs. +Le premier bloc vous informera de ce que sont exactement l'emprise mentale et une dĂ©rive sectaire. +-Ça consiste toujours en une forme de manipulation qui conduit Ă  une dĂ©pendance, Ă  une sorte de cercle vicieux, oĂč les personnes ne parviennent pas Ă  se dĂ©sengager d'un processus qui les conduit soit Ă  donner de l'argent, Ă  se livrer Ă  des actes qu'en rĂ©alitĂ© ils n'auraient pas acceptĂ©s, ou, tout simplement, Ă  accepter de participer Ă  une organisation dont ils ne partagent pas toutes les mĂ©thodes ou tous les points de vue. +-Le deuxiĂšme bloc vous informera des bonnes techniques d'Ă©coute d'une personne ayant vĂ©cu de tels traumatismes. +-C'est un sujet actuel parce que ce phĂ©nomĂšne est en croissance. +Il y a une augmentation trĂšs importante, un doublement, en l'espace de quelques annĂ©es, en moins de 10 ans. +-Le bloc 3, lui, sera conçu par nos juristes pour vous indiquer quelles sont les grandes infractions en lien avec l'emprise mentale, et surtout, pouvoir faire une analyse perspicace d'une situation individuelle. +Enfin, le bloc 4 vous assistera pour savoir comment aiguiller une victime vers les bons professionnels. +Bonne formation. \ No newline at end of file diff --git a/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.mp4 b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.mp4 new file mode 100644 index 000000000..45ef4325e Binary files /dev/null and b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.mp4 differ diff --git a/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.srt b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.srt new file mode 100644 index 000000000..d1ceebe10 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.srt @@ -0,0 +1,17 @@ +1 +00:00:00,000 --> 00:00:01,940 +December, 1965. + +2 +00:00:03,460 --> 00:00:06,660 +Is that all it has been since +I inherited the world? + +3 +00:00:07,020 --> 00:00:08,900 +Only three years. + +4 +00:00:09,940 --> 00:00:11,760 +Seems like a hundred million. + diff --git a/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.txt b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.txt new file mode 100644 index 000000000..2a8ed1a11 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.txt @@ -0,0 +1,5 @@ +December, 1965. +Is that all it has been since +I inherited the world? +Only three years. +It seems like a hundred million. 
diff --git a/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.vtt b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.vtt new file mode 100644 index 000000000..62888b29c --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.vtt @@ -0,0 +1,14 @@ +WEBVTT + +00:00.000 --> 00:01.940 +December, 1965. + +00:03.460 --> 00:06.660 +Is that all it has been since I inherited the world? + +00:07.020 --> 00:08.900 +Only three years. + +00:09.940 --> 00:11.760 +Seems like a hundred million. + diff --git a/packages/tests/src/core-utils/date.ts b/packages/tests/src/core-utils/date.ts new file mode 100644 index 000000000..90600187e --- /dev/null +++ b/packages/tests/src/core-utils/date.ts @@ -0,0 +1,29 @@ +import { millisecondsToTime, secondsToTime } from '@peertube/peertube-core-utils' +import { expect } from 'chai' + +describe('Seconds to time', function () { + it('Outputs a human readable time', function () { + expect(secondsToTime(61.1335)).to.equals('1m1s') + }) + + it('Rounds the number of seconds to the nearest integer', function () { + expect(secondsToTime(61.4)).to.equals('1m1s') + expect(secondsToTime(61.6)).to.equals('1m2s') + expect(secondsToTime(61.51)).to.equals('1m2s') + }) +}) + +describe('Milliseconds to time', function () { + it('Outputs a human readable time', function () { + expect(millisecondsToTime(60_000)).to.equals('1m') + }) + + it('Rounds the number of seconds to the nearest integer', function () { + expect(millisecondsToTime(60_100)).to.equals('1m') + expect(millisecondsToTime(60_501)).to.equals('1m1s') + }) + + it('Time inferior to 500ms appears as empty string', function () { + expect(millisecondsToTime(499)).to.equals('') + }) +}) diff --git a/packages/tests/src/jiwer/jiwer-cli.spec.ts b/packages/tests/src/jiwer/jiwer-cli.spec.ts new file mode 100644 index 000000000..ccc2ab26b --- /dev/null +++ b/packages/tests/src/jiwer/jiwer-cli.spec.ts @@ -0,0 +1,48 @@ +/* eslint-disable max-len */ +import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils' +import { join } from 'path' +import { mkdir, rm, writeFile } from 'node:fs/promises' +import { expect } from 'chai' +import { JiwerClI } from '@peertube/peertube-jiwer' + +describe('Jiwer CLI', function () { + const transcriptDirectory = buildAbsoluteFixturePath('transcription/transcript-evaluator') + const referenceTranscriptFilePath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.txt') + const hypothesis = join(transcriptDirectory, 'openai.txt') + const jiwerCLI = new JiwerClI(referenceTranscriptFilePath, hypothesis) + + before(async function () { + await mkdir(transcriptDirectory, { recursive: true }) + await writeFile(join(transcriptDirectory, 'openai.txt'), `Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le sĂ©jour vĂ©cu. +C'est le scĂ©nario P-Dagujic prĂ©sentĂ© par monsieur Navoli, professeur ainsi que le 3 sur une Ă©cole alimentaire de Montpellier. +La premiĂšre application a utilisĂ© ce ralame dĂ©atec. L'enseignant va alors transfĂ©rer les diffĂ©rentes photos rĂ©alisĂ©s lors de la classe transplante. +Dans un dossier, spĂ©cifique pour que les Ă©lĂšves puissent le retrouver plus facilement. Il tĂ©lĂ©verse donc ses photos dans le dossier, dans le ventĂ©, dans la mĂ©diatĂšque de la classe. +Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser. 
+Les Ă©lĂšves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rĂ©digeant un article d'un reintĂ©. +Ils illustront ses articles Ă  l'aide des photos de que mon numĂ©rique mise Ă  n'accĂ©lier dans le ventĂ©. +Pour se faire, il pourront utiliser les diteurs avancĂ©s qui les renvĂšrent directement dans la mĂ©diatĂšque de la classe oĂč il pourront retrouver le dossier crĂ©Ă© par leurs enseignants. +Une fois leur article terminĂ©e, les Ă©lĂšves soumĂ©tront se lui-ci au professeur qui pourra soit la notĂ© pour correction ou le public. +Ensuite, il pourront lire et commenter ce de leurs camarades ou rĂ©pondre aux commentaires de la veille. +`) + }) + + it(`returns coherent wer`, async function () { + const wer = await jiwerCLI.wer() + expect(wer).to.be.below(30 / 100) + expect(wer).to.be.greaterThan(0 / 100) + }) + + it(`returns coherent cer`, async function () { + const cer = await jiwerCLI.cer() + expect(cer).to.be.below(10 / 100) + expect(cer).to.be.greaterThan(9 / 100) + }) + + it(`print alignment`, async function () { + console.log(await jiwerCLI.alignment()) + }) + + after(async function () { + await rm(transcriptDirectory, { recursive: true, force: true }) + }) +}) diff --git a/packages/tests/src/shared/fixture-urls.ts b/packages/tests/src/shared/fixture-urls.ts index 56638e13b..b4c659e76 100644 --- a/packages/tests/src/shared/fixture-urls.ts +++ b/packages/tests/src/shared/fixture-urls.ts @@ -29,5 +29,7 @@ export const FIXTURE_URLS = { chatersVideo: 'https://download.cpy.re/peertube/video_chapters.mp4', - file4K: 'https://download.cpy.re/peertube/4k_file.txt' + file4K: 'https://download.cpy.re/peertube/4k_file.txt', + + transcriptionModels: 'https://download.cpy.re/peertube/transcription-models.zip' } diff --git a/packages/tests/src/transcription/levenshtein-distance.spec.ts b/packages/tests/src/transcription/levenshtein-distance.spec.ts new file mode 100644 index 000000000..d4c502a02 --- /dev/null +++ b/packages/tests/src/transcription/levenshtein-distance.spec.ts @@ -0,0 +1,18 @@ +import { expect } from 'chai' +import { levenshteinDistance } from '@peertube/peertube-transcription' + +describe('Levenshtein distance', function () { + it(`equals 1 when there is only one character difference`, function () { + expect(levenshteinDistance('abcd', 'abce')).equals(1) + }) + + it(`may calculate a distance on a txt subtitle content `, function () { + expect(levenshteinDistance(`December, 1965. +Is that all it has been since +I inherited the world? +Only three years. +Seems like a hundred million. + +`, 'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.')).equals(13) + }) +}) diff --git a/packages/tests/src/transcription/subtitle.spec.ts b/packages/tests/src/transcription/subtitle.spec.ts new file mode 100644 index 000000000..4a0ee4d95 --- /dev/null +++ b/packages/tests/src/transcription/subtitle.spec.ts @@ -0,0 +1,33 @@ +import { srtToTxt } from '@peertube/peertube-transcription' +import { expect } from 'chai' + +describe('srt to txt', function () { + it(`Transforms the content of a srt subtitle to a pure text version`, function () { + const txt = srtToTxt(`1 +00:00:00,000 --> 00:00:01,940 +December, 1965. + +2 +00:00:03,460 --> 00:00:06,660 +Is that all it has been since +I inherited the world? + +3 +00:00:07,020 --> 00:00:08,900 +Only three years. + +4 +00:00:09,940 --> 00:00:11,760 +Seems like a hundred million. + +`) + + expect(txt).equals(`December, 1965. 
+Is that all it has been since +I inherited the world? +Only three years. +Seems like a hundred million. + +`) + }) +}) diff --git a/packages/tests/src/transcription/transcriber-factory.spec.ts b/packages/tests/src/transcription/transcriber-factory.spec.ts new file mode 100644 index 000000000..cd9d9f29f --- /dev/null +++ b/packages/tests/src/transcription/transcriber-factory.spec.ts @@ -0,0 +1,17 @@ +import { transcriberFactory } from '@peertube/peertube-transcription' + +describe('Transcriber factory', function () { + const transcribers = [ + 'openai-whisper', + 'whisper-ctranslate2', + 'whisper-timestamped' + ] + + describe('Should be able to create a transcriber for each available transcription engine', function () { + transcribers.forEach(function (transcriberName) { + it(`Should be able to create a(n) ${transcriberName} transcriber`, function () { + transcriberFactory.createFromEngineName(transcriberName) + }) + }) + }) +}) diff --git a/packages/tests/src/transcription/transcript/transcript-file-evaluator.spec.ts b/packages/tests/src/transcription/transcript/transcript-file-evaluator.spec.ts new file mode 100644 index 000000000..93df638db --- /dev/null +++ b/packages/tests/src/transcription/transcript/transcript-file-evaluator.spec.ts @@ -0,0 +1,67 @@ +/* eslint-disable @typescript-eslint/no-unused-expressions, no-new, max-len */ +import { TranscriptFile, TranscriptFileEvaluator } from '@peertube/peertube-transcription' +import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils' +import { join } from 'node:path' +import { mkdir, rm } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { expect } from 'chai' + +describe('Transcript File Evaluator', function () { + const transcriptDirectory = join(tmpdir(), 'peertube-transcription', 'transcript-file-evaluator') + const referenceTranscriptFilePath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.txt') + + before(async function () { + await mkdir(transcriptDirectory, { recursive: true }) + }) + + it(`may not compare files in another format than txt`, async function () { + const vttReference = await TranscriptFile.write({ + path: join(transcriptDirectory, 'reference.vtt'), + format: 'vtt', + content: '' + }) + const vttHypothesis = await TranscriptFile.write({ + path: join(transcriptDirectory, 'hypothesis.vtt'), + format: 'vtt', + content: '' + }) + expect(() => new TranscriptFileEvaluator(vttReference, vttHypothesis)).to.throw('Can only evaluate txt transcript file') + }) + + it(`evaluation must return coherent wer & cer`, async function () { + const reference = new TranscriptFile({ + path: referenceTranscriptFilePath, + language: 'fr', + format: 'txt' + }) + const hypothesis = await TranscriptFile.write({ + path: join(transcriptDirectory, 'openai.txt'), + content: `Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le sĂ©jour vĂ©cu. +C'est le scĂ©nario P-Dagujic prĂ©sentĂ© par monsieur Navoli, professeur ainsi que le 3 sur une Ă©cole alimentaire de Montpellier. +La premiĂšre application a utilisĂ© ce ralame dĂ©atec. L'enseignant va alors transfĂ©rer les diffĂ©rentes photos rĂ©alisĂ©s lors de la classe transplante. +Dans un dossier, spĂ©cifique pour que les Ă©lĂšves puissent le retrouver plus facilement. Il tĂ©lĂ©verse donc ses photos dans le dossier, dans le ventĂ©, dans la mĂ©diatĂšque de la classe. 
+Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
+Les Ă©lĂšves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rĂ©digeant un article d'un reintĂ©.
+Ils illustront ses articles Ă  l'aide des photos de que mon numĂ©rique mise Ă  n'accĂ©lier dans le ventĂ©.
+Pour se faire, il pourront utiliser les diteurs avancĂ©s qui les renvĂšrent directement dans la mĂ©diatĂšque de la classe oĂč il pourront retrouver le dossier crĂ©Ă© par leurs enseignants.
+Une fois leur article terminĂ©e, les Ă©lĂšves soumĂ©tront se lui-ci au professeur qui pourra soit la notĂ© pour correction ou le public.
+Ensuite, il pourront lire et commenter ce de leurs camarades ou rĂ©pondre aux commentaires de la veille.
+`,
+      format: 'txt',
+      language: 'fr'
+    })
+    const evaluator = new TranscriptFileEvaluator(reference, hypothesis)
+    const wer = await evaluator.wer()
+    expect(wer).to.be.greaterThan(0 / 100)
+    expect(wer).to.be.below(30 / 100)
+
+    const cer = await evaluator.cer()
+    expect(cer).to.be.greaterThan(9 / 100)
+    expect(cer).to.be.below(10 / 100)
+    console.log(await evaluator.alignment())
+  })
+
+  after(async function () {
+    await rm(transcriptDirectory, { recursive: true, force: true })
+  })
+})
diff --git a/packages/tests/src/transcription/transcript/transcript-file.spec.ts b/packages/tests/src/transcription/transcript/transcript-file.spec.ts
new file mode 100644
index 000000000..112c246b3
--- /dev/null
+++ b/packages/tests/src/transcription/transcript/transcript-file.spec.ts
@@ -0,0 +1,44 @@
+/* eslint-disable @typescript-eslint/no-unused-expressions */
+import { expect } from 'chai'
+import { join } from 'node:path'
+import { mkdir, rm } from 'node:fs/promises'
+import { TranscriptFile } from '@peertube/peertube-transcription'
+import { tmpdir } from 'node:os'
+import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
+
+describe('Transcript File', function () {
+  const transcriptFileDirectory = join(tmpdir(), 'peertube-transcription', 'transcript-file')
+  before(async function () {
+    await mkdir(transcriptFileDirectory, { recursive: true })
+  })
+
+  it(`may create a new transcript file from scratch`, async function () {
+    const transcript1 = await TranscriptFile.write({
+      path: join(transcriptFileDirectory, 'test1.txt'),
+      content: 'test2',
+      format: 'txt'
+    })
+    const transcript2 = await TranscriptFile.write({
+      path: join(transcriptFileDirectory, 'test2.txt'),
+      content: 'test2',
+      format: 'txt'
+    })
+
+    expect(await transcript1.equals(transcript2)).to.be.true
+  })
+
+  it(`may create a txt transcript file object from a transcript without providing the format explicitly`, function () {
+    TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.srt'), 'en')
+    TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.txt'), 'en')
+  })
+
+  it(`fails when loading a file which is obviously not a transcript`, function () {
+    expect(() => TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4'), 'en'))
+      .to.throw(`Couldn't guess transcript format from extension "mp4". Valid formats are: txt, vtt, srt.`)
+  })
+
+  after(async function () {
+    await rm(transcriptFileDirectory, { recursive: true, force: true })
+  })
+})
diff --git a/packages/tests/src/transcription/transcription-run.spec.ts b/packages/tests/src/transcription/transcription-run.spec.ts
new file mode 100644
index 000000000..d877fbded
--- /dev/null
+++ b/packages/tests/src/transcription/transcription-run.spec.ts
@@ -0,0 +1 @@
+describe('Transcription run', function () {})
diff --git a/packages/tests/src/transcription/utils.spec.ts b/packages/tests/src/transcription/utils.spec.ts
new file mode 100644
index 000000000..487b9aeda
--- /dev/null
+++ b/packages/tests/src/transcription/utils.spec.ts
@@ -0,0 +1,44 @@
+import { cp, lstat, mkdir, rm } from 'node:fs/promises'
+import { join } from 'node:path'
+import { tmpdir } from 'node:os'
+import { expect } from 'chai'
+import { downloadFile, unzip } from '@peertube/peertube-transcription'
+import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
+
+describe('downloadFile', function () {
+  const testDirectory = join(tmpdir(), 'peertube-transcription', 'utils')
+  before(async function () {
+    await mkdir(testDirectory, { recursive: true })
+  })
+
+  it(`Downloads a file and writes it to disk`, async function () {
+    const filePath = await downloadFile('https://download.cpy.re/peertube/4k_file.txt', testDirectory)
+
+    expect(await lstat(filePath).then(stats => stats.isFile())).equals(true)
+  })
+
+  after(async function () {
+    await rm(testDirectory, { recursive: true, force: true })
+  })
+})
+
+describe('unzip', function () {
+  const zipFixtureFileName = 'hello_world.zip'
+  const zipFixtureFilePath = buildAbsoluteFixturePath(`transcription/${zipFixtureFileName}`)
+  const testDirectory = join(tmpdir(), 'peertube-transcription', 'utils')
+  before(async function () {
+    await mkdir(testDirectory, { recursive: true })
+  })
+
+  it(`Extracts a zip archive to a directory`, async function () {
+    const zipFilePath = join(testDirectory, zipFixtureFileName)
+    await cp(zipFixtureFilePath, zipFilePath)
+    const unzippedDirectory = await unzip(zipFilePath)
+
+    expect(await lstat(unzippedDirectory).then(stats => stats.isDirectory())).equals(true)
+  })
+
+  after(async function () {
+    await rm(testDirectory, { recursive: true, force: true })
+  })
+})
diff --git a/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts b/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts
new file mode 100644
index 000000000..d3f3f9ca7
--- /dev/null
+++ b/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts
@@ -0,0 +1,125 @@
+/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
+import { expect, config } from 'chai'
+import { createLogger } from 'winston'
+import { join } from 'node:path'
+import { mkdir, rm } from 'node:fs/promises'
+import { tmpdir } from 'node:os'
+import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
+import {
+  downloadFile,
+  levenshteinDistance,
+  OpenaiTranscriber,
+  TranscriptFile,
+  TranscriptFileEvaluator,
+  TranscriptionModel,
+  unzip,
+  WhisperBuiltinModel
+} from '@peertube/peertube-transcription'
+import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js'
+
+config.truncateThreshold = 0
+
+describe('Open AI Whisper transcriber', function () {
+  const tmpDirectory = join(tmpdir(), 'peertube-transcription')
+  const transcriptDirectory = join(tmpDirectory, 'transcriber', 'openai')
+  const modelsDirectory = join(tmpDirectory, 'models')
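+  // assumption: the transcription-models.zip fixture downloaded in before() unzips into this directory, providing the tiny.pt used below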
+ const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4') + const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4') + const referenceTranscriptFile = new TranscriptFile({ + path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'), + language: 'fr', + format: 'txt' + }) + const transcriber = new OpenaiTranscriber( + { + name: 'openai-whisper', + requirements: [], + type: 'binary', + binary: 'whisper', + supportedModelFormats: [ 'PyTorch' ], + languageDetection: true + }, + createLogger(), + transcriptDirectory + ) + + before(async function () { + this.timeout(1 * 1000 * 60) + await mkdir(transcriptDirectory, { recursive: true }) + await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory)) + }) + + it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () { + this.timeout(3 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' }) + + expect(transcript.format).to.equals('vtt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in the `srt` format', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' }) + + expect(transcript.format).to.equals('srt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in the `txt` format', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' }) + + expect(transcript.format).to.equals('txt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + expect(levenshteinDistance( + (await transcript.read()).toString(), + 'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.' 
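+      // levenshteinDistance counts single-character edits, so below(3) tolerates at most two characters of drift from this sentence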
+ )).to.be.below(3) + }) + + it('May transcribe a media file using a local PyTorch model', async function () { + this.timeout(2 * 1000 * 60) + await transcriber.transcribe({ + mediaFilePath: shortVideoPath, + model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')), + language: 'en' + }) + }) + + it('May transcribe a media file in french', async function () { + this.timeout(3 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt' }) + + expect(transcript.format).to.equals('txt') + expect(transcript.language).to.equals('fr') + expect(await transcript.read()).not.to.be.empty + }) + + it('Guesses the video language if not provided', async function () { + this.timeout(3 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath }) + + expect(transcript.language).to.equals('fr') + }) + + it('May transcribe a media file in french with small model', async function () { + this.timeout(6 * 1000 * 60) + const transcript = await transcriber.transcribe({ + mediaFilePath: frVideoPath, + language: 'fr', + format: 'txt', + model: new WhisperBuiltinModel('small') + }) + + expect(transcript.language).to.equals('fr') + + const transcriptFileEvaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcript) + const cer = await transcriptFileEvaluator.cer() + expect(cer).to.be.below(6 / 100) + }) + + after(async function () { + await rm(transcriptDirectory, { recursive: true, force: true }) + }) +}) diff --git a/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts b/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts new file mode 100644 index 000000000..bf4adee9e --- /dev/null +++ b/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts @@ -0,0 +1,133 @@ +/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */ +import { expect, config } from 'chai' +import { createLogger } from 'winston' +import { join } from 'node:path' +import { mkdir, rm } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils' +import { + OpenaiTranscriber, + WhisperTimestampedTranscriber, + TranscriptFileEvaluator, + TranscriptionModel, + WhisperTranscribeArgs, + levenshteinDistance, downloadFile, unzip +} from '@peertube/peertube-transcription' +import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js' + +config.truncateThreshold = 0 + +describe('Linto timestamped Whisper transcriber', function () { + const tmpDirectory = join(tmpdir(), 'peertube-transcription') + const transcriptDirectory = join(tmpDirectory, 'transcriber', 'timestamped') + const modelsDirectory = join(tmpDirectory, 'models') + const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4') + const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4') + const transcriber = new WhisperTimestampedTranscriber( + { + name: 'whisper-timestamped', + requirements: [], + type: 'binary', + binary: 'whisper_timestamped', + supportedModelFormats: [ 'PyTorch' ], + languageDetection: true + }, + createLogger(), + transcriptDirectory + ) + + before(async function () { + this.timeout(1 * 1000 * 60) + await mkdir(transcriptDirectory, { recursive: true }) + await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory)) + }) + + it('Should transcribe a media file and provide a valid path to a 
transcript file in `vtt` format by default', async function () { + this.timeout(1 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' }) + + expect(transcript.format).to.equals('vtt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in the `srt` format with a ms precision', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' }) + + expect(transcript.format).to.equals('srt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in `txt` format', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' }) + + expect(transcript.format).to.equals('txt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + expect(levenshteinDistance( + (await transcript.read()).toString(), + 'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.' + )).to.be.below(10) + }) + + it('May transcribe a media file using a local PyTorch model file', async function () { + this.timeout(2 * 1000 * 60) + await transcriber.transcribe({ + mediaFilePath: shortVideoPath, + model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')), + language: 'en' + }) + }) + + it('May transcribe a media file in french', async function () { + this.timeout(2 * 1000 * 60) + const transcript = await transcriber.transcribe({ + mediaFilePath: frVideoPath, + language: 'fr', + format: 'txt' + }) + + expect(transcript.format).to.equals('txt') + expect(transcript.language).to.equals('fr') + expect(await transcript.read()).not.to.be.empty + }) + + it('Guesses the video language if not provided', async function () { + this.timeout(2 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath }) + expect(transcript.language).to.equals('fr') + }) + + it('Should produce a text transcript similar to openai-whisper implementation', async function () { + this.timeout(11 * 1000 * 60) + const transcribeArgs: WhisperTranscribeArgs = { + mediaFilePath: frVideoPath, + model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')), + language: 'fr', + format: 'txt' + } + const transcript = await transcriber.transcribe(transcribeArgs) + + const openaiTranscriber = new OpenaiTranscriber( + { + name: 'openai-whisper', + requirements: [], + type: 'binary', + binary: 'whisper', + supportedModelFormats: [ 'PyTorch' ] + }, + createLogger(), + join(transcriptDirectory, 'openai-whisper') + ) + const openaiTranscript = await openaiTranscriber.transcribe(transcribeArgs) + + const transcriptFileEvaluator = new TranscriptFileEvaluator(openaiTranscript, transcript) + expect(await transcriptFileEvaluator.wer()).to.be.below(25 / 100) + expect(await transcriptFileEvaluator.cer()).to.be.below(15 / 100) + }) + + after(async function () { + await rm(transcriptDirectory, { recursive: true, force: true }) + }) +}) diff --git a/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts b/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts new file mode 100644 index 000000000..3a9984e87 --- /dev/null +++ b/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts @@ 
-0,0 +1,137 @@ +/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */ +import { expect, config } from 'chai' +import { createLogger } from 'winston' +import { join } from 'node:path' +import { mkdir, rm } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils' +import { + Ctranslate2Transcriber, downloadFile, + levenshteinDistance, + OpenaiTranscriber, + TranscriptFile, + TranscriptFileEvaluator, + TranscriptionModel, unzip, + WhisperTranscribeArgs +} from '@peertube/peertube-transcription' +import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js' + +config.truncateThreshold = 0 + +describe('Whisper CTranslate2 transcriber', function () { + const tmpDirectory = join(tmpdir(), 'peertube-transcription') + const transcriptDirectory = join(tmpDirectory, 'transcriber', 'ctranslate2') + const modelsDirectory = join(tmpDirectory, 'models') + const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4') + const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4') + const transcriber = new Ctranslate2Transcriber( + { + name: 'anyNameShouldBeFineReally', + requirements: [], + type: 'binary', + binary: 'whisper-ctranslate2', + supportedModelFormats: [], + languageDetection: true + }, + createLogger(), + transcriptDirectory + ) + + before(async function () { + this.timeout(1 * 1000 * 60) + await mkdir(transcriptDirectory, { recursive: true }) + await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory)) + }) + + it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' }) + + expect(transcript.format).to.equals('vtt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in the `srt` format', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' }) + + expect(transcript.format).to.equals('srt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in the `txt` format', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' }) + expect(await transcript.equals(new TranscriptFile({ + path: join(transcriptDirectory, 'the_last_man_on_earth.txt'), + format: 'txt', + language: 'en' + }))).to.be.true + + expect(transcript.format).to.equals('txt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + expect(levenshteinDistance( + (await transcript.read()).toString(), + 'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.' 
+    )).to.be.below(5)
+  })
+
+  it('May transcribe a media file using a local CTranslate2 model', async function () {
+    this.timeout(2 * 1000 * 60)
+    const transcript = await transcriber.transcribe({
+      mediaFilePath: shortVideoPath,
+      model: await TranscriptionModel.fromPath(join(modelsDirectory, 'faster-whisper-tiny')),
+      language: 'en',
+      format: 'txt'
+    })
+
+    expect(transcript.format).to.equals('txt')
+    expect(transcript.language).to.equals('en')
+    expect(await transcript.read()).not.to.be.empty
+  })
+
+  it('May transcribe a media file in french', async function () {
+    this.timeout(5 * 1000 * 60)
+    const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt' })
+
+    expect(transcript.format).to.equals('txt')
+    expect(transcript.language).to.equals('fr')
+    expect(await transcript.read()).not.to.be.empty
+  })
+
+  it('Guesses the video language if not provided', async function () {
+    this.timeout(2 * 1000 * 60)
+    const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath })
+    expect(transcript.language).to.equals('fr')
+  })
+
+  it('Should produce a text transcript similar to openai-whisper implementation', async function () {
+    this.timeout(10 * 1000 * 60)
+    const transcribeArgs: WhisperTranscribeArgs = {
+      mediaFilePath: frVideoPath,
+      language: 'fr',
+      format: 'txt'
+    }
+    const transcript = await transcriber.transcribe(transcribeArgs)
+    const openaiTranscriber = new OpenaiTranscriber(
+      {
+        name: 'openai-whisper',
+        requirements: [],
+        type: 'binary',
+        binary: 'whisper',
+        supportedModelFormats: [ 'PyTorch' ]
+      },
+      createLogger(),
+      join(transcriptDirectory, 'openai-whisper')
+    )
+    const openaiTranscript = await openaiTranscriber.transcribe(transcribeArgs)
+
+    const transcriptFileEvaluator = new TranscriptFileEvaluator(openaiTranscript, transcript)
+    expect(await transcriptFileEvaluator.wer()).to.be.below(20 / 100)
+    expect(await transcriptFileEvaluator.cer()).to.be.below(10 / 100)
+  })
+
+  after(async function () {
+    await rm(transcriptDirectory, { recursive: true, force: true })
+  })
+})
diff --git a/packages/tests/tsconfig.json b/packages/tests/tsconfig.json
index fc3490da4..5b8e2a6c4 100644
--- a/packages/tests/tsconfig.json
+++ b/packages/tests/tsconfig.json
@@ -6,16 +6,20 @@
     "tsBuildInfoFile": "./dist/.tsbuildinfo",
     "paths": {
       "@tests/*": [ "./src/*" ],
-      "@server/*": [ "../../server/core/*" ]
+      "@server/*": [ "../../server/core/*" ],
+      "@peertube/peertube-transcription": [ "../transcription" ],
+      "@peertube/peertube-jiwer": [ "../jiwer" ]
     }
   },
   "references": [
     { "path": "../core-utils" },
     { "path": "../ffmpeg" },
+    { "path": "../jiwer" },
     { "path": "../models" },
     { "path": "../node-utils" },
     { "path": "../typescript-utils" },
     { "path": "../server-commands" },
+    { "path": "../transcription" },
     { "path": "../../server/tsconfig.lib.json" }
   ],
   "include": [
diff --git a/packages/transcription/README.md b/packages/transcription/README.md
new file mode 100644
index 000000000..b16ff75d4
--- /dev/null
+++ b/packages/transcription/README.md
@@ -0,0 +1,99 @@
+# Transcription
+
+Video **transcription** consists of transcribing the audio content of a video to text.
+> This process might be called __Automatic Speech Recognition__ or __Speech to Text__ in a more general context.
+
+This package provides a common API to multiple transcription backends, currently:
+- `openai-whisper` CLI
+- `faster-whisper` (*via* the `whisper-ctranslate2` CLI)
+- `whisper-timestamped`
+
+> Potential candidates could be: whisper-cpp, vosk, ...
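+
+As a sketch of what this common API means in practice, the same `transcribe()` call works for
+every registered engine (the media file path below is illustrative; the full API is described
+in the Usage section):
+
+```typescript
+import { transcriberFactory } from '@peertube/peertube-transcription'
+
+for (const engineName of [ 'openai-whisper', 'whisper-ctranslate2', 'whisper-timestamped' ]) {
+  // Every engine yields a transcriber exposing the same transcribe() method
+  const transcriber = transcriberFactory.createFromEngineName(engineName)
+
+  const transcriptFile = await transcriber.transcribe({
+    mediaFilePath: './myVideo.mp4',
+    language: 'en',
+    format: 'vtt'
+  })
+
+  console.log(`${engineName} wrote ${transcriptFile.path}`)
+}
+```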
+## Requirements
+- Python
+- PIP
+
+And at least one of the following transcription backends:
+- Python:
+  - `openai-whisper`
+  - `whisper-ctranslate2>=0.4.3`
+  - `whisper-timestamped>=1.15.4`
+
+And to run the transcript evaluation tests:
+- Python:
+  - `jiwer>=3.0.4`
+
+## Usage
+
+Create a transcriber manually:
+```typescript
+import { OpenaiTranscriber } from '@peertube/peertube-transcription'
+
+(async () => {
+  // create a transcriber powered by the OpenAI Whisper CLI
+  const transcriber = new OpenaiTranscriber({
+    name: 'openai-whisper',
+    binary: 'whisper',
+    languageDetection: true
+  });
+
+  const transcriptFile = await transcriber.transcribe({
+    mediaFilePath: './myVideo.mp4',
+    format: 'txt'
+  });
+
+  console.log(transcriptFile.path);
+  console.log(await transcriptFile.read());
+})();
+```
+
+Using a local model file:
+
+```typescript
+import { TranscriptionModel } from '@peertube/peertube-transcription'
+
+const transcriptFile = await transcriber.transcribe({
+  mediaFilePath: './myVideo.mp4',
+  model: await TranscriptionModel.fromPath('./models/large.pt'),
+  format: 'txt'
+});
+```
+
+You may use the builtin factory if you're happy with the default configuration:
+```typescript
+import { transcriberFactory } from '@peertube/peertube-transcription'
+
+transcriberFactory.createFromEngineName('openai-whisper')
+```
+> For further usage, see [../tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts](../tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts)
+
+## Benchmark
+
+A benchmark of the available __transcribers__ may be run with:
+```sh
+npm run benchmark
+```
+```
+┌────────────────────────┬───────────────────────┬───────────────────────┬──────────┬────────┬───────────────────────┐
+│ (index)                │ WER                   │ CER                   │ duration │ model  │ engine                │
+├────────────────────────┌───────────────────────┌───────────────────────┌──────────┌────────┌────────────────────────
+│ 5yZGBYqojXe7nuhq1TuHvz │ '28.39506172839506%'  │ '9.62457337883959%'   │ '41s'    │ 'tiny' │ 'openai-whisper'      │
+│ x6qREJ2AkTU4e5YmvfivQN │ '29.75206611570248%'  │ '10.46195652173913%'  │ '15s'    │ 'tiny' │ 'whisper-ctranslate2' │
+│ qbt6BekKMVzxq4KCSLCzt3 │ '31.020408163265305%' │ '10.784982935153584%' │ '20s'    │ 'tiny' │ 'whisper-timestamped' │
+└────────────────────────┮───────────────────────┮───────────────────────┮──────────┮────────┮───────────────────────┘
+```
+
+The benchmark may be run with multiple builtin model sizes:
+```sh
+MODELS=tiny,small,large npm run benchmark
+```
+
+## Lexicon
+- ONNX: Open Neural Network eXchange. A specification; the ONNX Runtime runs these models.
+- GPTs: Generative Pre-Trained Transformers +- LLM: Large Language Models +- NLP: Natural Language Processing +- MLP: Multilayer Perceptron +- ASR: Automatic Speech Recognition +- WER: Word Error Rate +- CER: Character Error Rate diff --git a/packages/transcription/package.json b/packages/transcription/package.json new file mode 100644 index 000000000..9138de4dc --- /dev/null +++ b/packages/transcription/package.json @@ -0,0 +1,21 @@ +{ + "name": "@peertube/peertube-transcription", + "private": true, + "version": "0.0.0", + "main": "dist/index.js", + "files": [ "dist" ], + "exports": { + "types": "./dist/index.d.ts", + "peertube:tsx": "./src/index.ts", + "default": "./dist/index.js" + }, + "type": "module", + "devDependencies": {}, + "scripts": { + "preinstall": "pip install -r requirements.txt", + "build": "tsc", + "watch": "tsc -w", + "benchmark": "tsx --conditions=peertube:tsx --tsconfig ./tsconfig.json ./src/benchmark.ts" + }, + "dependencies": {} +} diff --git a/packages/transcription/requirements.txt b/packages/transcription/requirements.txt new file mode 100644 index 000000000..bc457025c --- /dev/null +++ b/packages/transcription/requirements.txt @@ -0,0 +1,3 @@ +openai-whisper==20231117 +whisper-ctranslate2==0.4.4 +whisper-timestamped==1.15.4 diff --git a/packages/transcription/src/abstract-transcriber.ts b/packages/transcription/src/abstract-transcriber.ts new file mode 100644 index 000000000..b9a0f66e5 --- /dev/null +++ b/packages/transcription/src/abstract-transcriber.ts @@ -0,0 +1,69 @@ +import { createLogger, Logger } from 'winston' +import { join } from 'node:path' +import { PerformanceObserver } from 'node:perf_hooks' +import { buildSUUID, SUUID, root } from '@peertube/peertube-node-utils' +import { TranscriptionEngine } from './transcription-engine.js' +import { TranscriptionModel } from './transcription-model.js' +import { TranscriptionRun } from './transcription-run.js' +import { TranscriptFile, TranscriptFormat } from './transcript/index.js' + +export interface TranscribeArgs { + mediaFilePath: string + model: TranscriptionModel + language?: string + format?: TranscriptFormat + runId?: SUUID +} + +export abstract class AbstractTranscriber { + public static DEFAULT_TRANSCRIPT_DIRECTORY = join(root(), 'dist', 'transcripts') + + engine: TranscriptionEngine + logger: Logger + transcriptDirectory: string + performanceObserver?: PerformanceObserver + run?: TranscriptionRun + + constructor ( + engine: TranscriptionEngine, + logger: Logger = createLogger(), + transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY, + performanceObserver?: PerformanceObserver + ) { + this.engine = engine + this.logger = logger + this.transcriptDirectory = transcriptDirectory + this.performanceObserver = performanceObserver + } + + createRun (uuid: SUUID = buildSUUID()) { + this.run = new TranscriptionRun(this.logger, uuid) + } + + startRun () { + this.run.start() + } + + stopRun () { + this.run.stop() + delete this.run + } + + assertLanguageDetectionAvailable (language?: string) { + if (!this.engine.languageDetection && !language) { + throw new Error(`Language detection isn't available in ${this.engine.name}. 
A language must be provided explicitly.`)
+    }
+  }
+
+  supports (model: TranscriptionModel) {
+    return model.format === 'PyTorch'
+  }
+
+  abstract transcribe ({
+    mediaFilePath,
+    model,
+    language,
+    format = 'vtt',
+    runId = buildSUUID()
+  }: TranscribeArgs): Promise<TranscriptFile>
+}
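For readers skimming the diff: a concrete backend only has to implement `transcribe()` and can lean on the run helpers above. A minimal sketch, assuming an in-package file next to `abstract-transcriber.ts`; the `NoopTranscriber` name and its engine are hypothetical and not part of this PR:

```typescript
import { join } from 'node:path'
import { AbstractTranscriber, TranscribeArgs } from './abstract-transcriber.js'
import { TranscriptFile } from './transcript/index.js'

// Hypothetical transcriber used only to illustrate the contract:
// bracket the actual work with createRun()/startRun()/stopRun() so a
// PerformanceObserver (as in benchmark.ts below) can measure it.
class NoopTranscriber extends AbstractTranscriber {
  async transcribe ({ mediaFilePath, language, format = 'txt', runId }: TranscribeArgs): Promise<TranscriptFile> {
    this.assertLanguageDetectionAvailable(language)

    this.createRun(runId)
    this.startRun()
    // ... invoke this.engine.binary against mediaFilePath here ...
    this.stopRun()

    // TranscriptFile's constructor stat()s the path, so the file must exist by now
    return new TranscriptFile({
      path: join(this.transcriptDirectory, `out.${format}`),
      language: language ?? 'en',
      format
    })
  }
}
```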
diff --git a/packages/transcription/src/benchmark.ts b/packages/transcription/src/benchmark.ts
new file mode 100644
index 000000000..4cac16449
--- /dev/null
+++ b/packages/transcription/src/benchmark.ts
@@ -0,0 +1,139 @@
+import { createLogger, transports, format } from 'winston'
+import { join } from 'node:path'
+import { performance, PerformanceObserver } from 'node:perf_hooks'
+import { tmpdir } from 'node:os'
+import { rm, mkdir } from 'node:fs/promises'
+import { buildAbsoluteFixturePath, buildSUUID, SUUID } from '@peertube/peertube-node-utils'
+import {
+  transcriberFactory,
+  TranscriptFile,
+  TranscriptFileEvaluator,
+  TranscriptionEngine,
+  TranscriptionModel
+} from '@peertube/peertube-transcription'
+import { millisecondsToTime } from '@peertube/peertube-core-utils'
+
+interface BenchmarkResult {
+  uuid: SUUID
+  WER?: number
+  CER?: number
+  duration?: number
+  engine?: TranscriptionEngine
+  model?: string
+}
+
+type Benchmark = Record<string, BenchmarkResult>
+
+const benchmarkReducer = (benchmark: Benchmark = {}, benchmarkResult: BenchmarkResult) => ({
+  ...benchmark,
+  [benchmarkResult.uuid]: {
+    ...benchmark[benchmarkResult.uuid],
+    ...benchmarkResult
+  }
+})
+
+const groupBenchmarkResultsByModel = (benchmarkResults: Record<string, BenchmarkResult>) => (benchmarksGroupedByModel, uuid) => ({
+  ...benchmarksGroupedByModel,
+  [benchmarkResults[uuid].model]: {
+    ...benchmarksGroupedByModel[benchmarkResults[uuid].model],
+    [uuid]: formatBenchmarkResult(benchmarkResults[uuid])
+  }
+})
+
+interface FormattedBenchmarkResult {
+  WER?: string
+  CER?: string
+  duration?: string
+  model?: string
+  engine?: string
+}
+
+const formatBenchmarkResult = ({ WER, CER, duration, engine, model }: Partial<BenchmarkResult>): FormattedBenchmarkResult => ({
+  WER: WER ? `${WER * 100}%` : undefined,
+  CER: CER ? `${CER * 100}%` : undefined,
+  duration: duration ? millisecondsToTime(duration) : undefined,
+  model,
+  engine: engine.name
+})
+
+void (async () => {
+  const logger = createLogger()
+  logger.add(new transports.Console({ format: format.printf(log => log.message) }))
+  const transcribers = [
+    'openai-whisper',
+    'whisper-ctranslate2',
+    'whisper-timestamped'
+  ]
+  const models = process.env.MODELS
+    ? process.env.MODELS.trim().split(',').map(modelName => modelName.trim()).filter(modelName => modelName)
+    : [ 'tiny' ]
+
+  const transcriptDirectory = join(tmpdir(), 'peertube-transcription', 'benchmark')
+  const mediaFilePath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
+  const referenceTranscriptFile = new TranscriptFile({
+    path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'),
+    language: 'fr',
+    format: 'txt'
+  })
+
+  let benchmarkResults: Record<string, BenchmarkResult> = {}
+
+  // before
+  await mkdir(transcriptDirectory, { recursive: true })
+  const performanceObserver = new PerformanceObserver((items) => {
+    items
+      .getEntries()
+      .forEach((entry) => {
+        benchmarkResults = benchmarkReducer(benchmarkResults, {
+          uuid: entry.name as SUUID,
+          duration: entry.duration
+        })
+      })
+  })
+  performanceObserver.observe({ type: 'measure' })
+
+  // benchmark
+  logger.info(`Running transcribers benchmark with the following models: ${models.join(', ')}`)
+  for (const transcriberName of transcribers) {
+    logger.info(`Create "${transcriberName}" transcriber for the benchmark...`)
+
+    const transcriber = transcriberFactory.createFromEngineName(
+      transcriberName,
+      createLogger(),
+      transcriptDirectory
+    )
+
+    for (const modelName of models) {
+      logger.info(`Run benchmark with "${modelName}" model:`)
+      const model = new TranscriptionModel(modelName)
+      const uuid = buildSUUID()
+      const transcriptFile = await transcriber.transcribe({
+        mediaFilePath,
+        model,
+        language: 'fr',
+        format: 'txt',
+        runId: uuid
+      })
+      const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile)
+      await new Promise(resolve => setTimeout(resolve, 1))
+
+      benchmarkResults = benchmarkReducer(benchmarkResults, {
+        uuid,
+        engine: transcriber.engine,
+        WER: await evaluator.wer(),
+        CER: await evaluator.cer(),
+        model: model.name
+      })
+    }
+  }
+
+  // display
+  const benchmarkResultsGroupedByModel = Object
+    .keys(benchmarkResults)
+    .reduce(groupBenchmarkResultsByModel(benchmarkResults), {})
+  Object.values(benchmarkResultsGroupedByModel).forEach(benchmark => console.table(benchmark))
+
+  // after
+  await rm(transcriptDirectory, { recursive: true, force: true })
+  performance.clearMarks()
+})()
diff --git a/packages/transcription/src/index.ts b/packages/transcription/src/index.ts
new file mode 100644
index 000000000..44eaa0799
--- /dev/null
+++ b/packages/transcription/src/index.ts
@@ -0,0 +1,13 @@
+import { TranscriberFactory } from './transcriber-factory.js'
+import { engines } from './whisper/index.js'
+
+export * from './transcript/index.js'
+export * from './levenshtein.js'
+export * from './subtitle.js'
+export * from './transcription-engine.js'
+export * from './transcription-model.js'
+export * from './transcription-run.js'
+export * from './utils.js'
+export * from './whisper/index.js'
+
+export const transcriberFactory = new TranscriberFactory(engines)
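The entry point above wires the whisper engine definitions into a ready-made factory. A quick sketch of what consumers get, based only on the exports shown here and on `transcriber-factory.ts` below:

```typescript
import { transcriberFactory, engines } from '@peertube/peertube-transcription'

// The engine catalogue is plain data; the factory only implements some of them
console.log(engines.map(({ name }) => name))
// [ 'whisper-cpp', 'openai-whisper', 'whisper-ctranslate2', 'whisper-timestamped' ]

// whisper-cpp is declared but has no transcriber yet, so the factory
// would throw 'Unimplemented engine' for it
const transcriber = transcriberFactory.createFromEngineName('whisper-ctranslate2')
```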
diff --git a/packages/transcription/src/levenshtein.ts b/packages/transcription/src/levenshtein.ts
new file mode 100644
index 000000000..364cae061
--- /dev/null
+++ b/packages/transcription/src/levenshtein.ts
@@ -0,0 +1,101 @@
+function min (d0: number, d1: number, d2: number, bx: number, ay: number) {
+  return d0 < d1 || d2 < d1
+    ? d0 > d2
+      ? d2 + 1
+      : d0 + 1
+    : bx === ay
+      ? d1
+      : d1 + 1
+}
+
+/**
+ * @see https://github.com/gustf/js-levenshtein
+ */
+export function levenshteinDistance (a: string, b: string): number {
+  if (a === b) {
+    return 0
+  }
+
+  if (a.length > b.length) {
+    const tmp = a
+    a = b
+    b = tmp
+  }
+
+  let la = a.length
+  let lb = b.length
+
+  while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) {
+    la--
+    lb--
+  }
+
+  let offset = 0
+
+  while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) {
+    offset++
+  }
+
+  la -= offset
+  lb -= offset
+
+  if (la === 0 || lb < 3) {
+    return lb
+  }
+
+  let x = 0
+  let y: number
+  let d0: number
+  let d1: number
+  let d2: number
+  let d3: number
+  let dd: number
+  let dy: number
+  let ay: number
+  let bx0: number
+  let bx1: number
+  let bx2: number
+  let bx3: number
+
+  const vector: number[] = []
+
+  for (y = 0; y < la; y++) {
+    vector.push(y + 1)
+    vector.push(a.charCodeAt(offset + y))
+  }
+
+  const len = vector.length - 1
+
+  for (; x < lb - 3;) {
+    bx0 = b.charCodeAt(offset + (d0 = x))
+    bx1 = b.charCodeAt(offset + (d1 = x + 1))
+    bx2 = b.charCodeAt(offset + (d2 = x + 2))
+    bx3 = b.charCodeAt(offset + (d3 = x + 3))
+    dd = (x += 4)
+    for (y = 0; y < len; y += 2) {
+      dy = vector[y]
+      ay = vector[y + 1]
+      d0 = min(dy, d0, d1, bx0, ay)
+      d1 = min(d0, d1, d2, bx1, ay)
+      d2 = min(d1, d2, d3, bx2, ay)
+      dd = min(d2, d3, dd, bx3, ay)
+      vector[y] = dd
+      d3 = d2
+      d2 = d1
+      d1 = d0
+      d0 = dy
+    }
+  }
+
+  for (; x < lb;) {
+    bx0 = b.charCodeAt(offset + (d0 = x))
+    dd = ++x
+    for (y = 0; y < len; y += 2) {
+      dy = vector[y]
+      vector[y] = dd = min(dy, d0, dd, bx0, vector[y + 1])
+      d0 = dy
+    }
+  }
+
+  return dd
+}
diff --git a/packages/transcription/src/subtitle.ts b/packages/transcription/src/subtitle.ts
new file mode 100644
index 000000000..94b080ec3
--- /dev/null
+++ b/packages/transcription/src/subtitle.ts
@@ -0,0 +1 @@
+export const srtToTxt = (srtContent: string) => srtContent.replace(/^\n*\d+\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/gm, '')
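Both helpers above are pure functions, so their behaviour is easy to pin down with a tiny example (values worked out by hand):

```typescript
import { levenshteinDistance, srtToTxt } from '@peertube/peertube-transcription'

// 'kitten' -> 'sitting' takes 3 single-character edits
console.log(levenshteinDistance('kitten', 'sitting')) // 3

// srtToTxt strips SRT cue numbers and timestamps, keeping only the text
const srt = '1\n00:00:00,000 --> 00:00:02,000\nHello world\n'
console.log(srtToTxt(srt)) // 'Hello world\n'
```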
diff --git a/packages/transcription/src/transcriber-factory.ts b/packages/transcription/src/transcriber-factory.ts
new file mode 100644
index 000000000..cb22d617c
--- /dev/null
+++ b/packages/transcription/src/transcriber-factory.ts
@@ -0,0 +1,49 @@
+import { Logger, createLogger } from 'winston'
+import { TranscriptionEngine } from './transcription-engine.js'
+import {
+  Ctranslate2Transcriber,
+  OpenaiTranscriber, WhisperTimestampedTranscriber
+} from './whisper/index.js'
+import { AbstractTranscriber } from './abstract-transcriber.js'
+
+export class TranscriberFactory {
+  engines: TranscriptionEngine[]
+
+  constructor (engines: TranscriptionEngine[]) {
+    this.engines = engines
+  }
+
+  createFromEngineName (
+    engineName: string,
+    logger: Logger = createLogger(),
+    transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY
+  ) {
+    const engine = this.getEngineByName(engineName)
+
+    const transcriberArgs: ConstructorParameters<typeof AbstractTranscriber> = [
+      engine,
+      logger,
+      transcriptDirectory
+    ]
+
+    switch (engineName) {
+      case 'openai-whisper':
+        return new OpenaiTranscriber(...transcriberArgs)
+      case 'whisper-ctranslate2':
+        return new Ctranslate2Transcriber(...transcriberArgs)
+      case 'whisper-timestamped':
+        return new WhisperTimestampedTranscriber(...transcriberArgs)
+      default:
+        throw new Error(`Unimplemented engine ${engineName}`)
+    }
+  }
+
+  getEngineByName (engineName: string) {
+    const engine = this.engines.find(({ name }) => name === engineName)
+    if (!engine) {
+      throw new Error(`Unknown engine ${engineName}`)
+    }
+
+    return engine
+  }
+}
diff --git a/packages/transcription/src/transcript/index.ts b/packages/transcription/src/transcript/index.ts
new file mode 100644
index 000000000..bd76d1a86
--- /dev/null
+++ b/packages/transcription/src/transcript/index.ts
@@ -0,0 +1,3 @@
+export * from './transcript-file.js'
+export * from './transcript-file-evaluator.js'
+export * from './transcript-file-interface.js'
diff --git a/packages/transcription/src/transcript/transcript-file-evaluator-interface.ts b/packages/transcription/src/transcript/transcript-file-evaluator-interface.ts
new file mode 100644
index 000000000..cf51b6551
--- /dev/null
+++ b/packages/transcription/src/transcript/transcript-file-evaluator-interface.ts
@@ -0,0 +1,12 @@
+export interface TranscriptFileEvaluation {
+  wer: number
+  cer: number
+  alignment: string
+}
+
+export interface TranscriptFileEvaluatorInterface {
+  wer(): Promise<number>
+  cer(): Promise<number>
+  alignment(): Promise<string>
+  evaluate(): Promise<TranscriptFileEvaluation>
+}
diff --git a/packages/transcription/src/transcript/transcript-file-evaluator.ts b/packages/transcription/src/transcript/transcript-file-evaluator.ts
new file mode 100644
index 000000000..7ae411ee4
--- /dev/null
+++ b/packages/transcription/src/transcript/transcript-file-evaluator.ts
@@ -0,0 +1,46 @@
+import assert from 'node:assert'
+import { JiwerClI } from '@peertube/peertube-jiwer'
+import { TranscriptFileEvaluatorInterface } from './transcript-file-evaluator-interface.js'
+import { TranscriptFileInterface } from './index.js'
+
+export class TranscriptFileEvaluator implements TranscriptFileEvaluatorInterface {
+  referenceTranscriptFile: TranscriptFileInterface
+  hypothesisTranscriptFile: TranscriptFileInterface
+  jiwerCLI: JiwerClI
+
+  constructor (referenceTranscriptFile: TranscriptFileInterface, hypothesisTranscriptFile: TranscriptFileInterface) {
+    assert(referenceTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')
+    assert(hypothesisTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')
+
+    this.referenceTranscriptFile = referenceTranscriptFile
+    this.hypothesisTranscriptFile = hypothesisTranscriptFile
+
+    this.jiwerCLI = new JiwerClI(this.referenceTranscriptFile.path, this.hypothesisTranscriptFile.path)
+  }
+
+  /**
+   * WER: Word Error Rate
+   */
+  wer () {
+    return this.jiwerCLI.wer()
+  }
+
+  /**
+   * CER: Character Error Rate
+   */
+  cer () {
+    return this.jiwerCLI.cer()
+  }
+
+  alignment () {
+    return this.jiwerCLI.alignment()
+  }
+
+  async evaluate () {
+    return {
+      wer: await this.wer(),
+      cer: await this.cer(),
+      alignment: await this.alignment()
+    }
+  }
+}
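A minimal sketch of evaluating one transcript against a reference, assuming `./reference.txt` and `./hypothesis.txt` exist; since JiWER is shelled out to underneath, the Python requirements must be installed:

```typescript
import { TranscriptFile, TranscriptFileEvaluator } from '@peertube/peertube-transcription'

// Both sides must be plain text: the evaluator constructor asserts format === 'txt'
const reference = TranscriptFile.fromPath('./reference.txt', 'fr')
const hypothesis = TranscriptFile.fromPath('./hypothesis.txt', 'fr')

const { wer, cer, alignment } = await new TranscriptFileEvaluator(reference, hypothesis).evaluate()

console.log(`WER: ${wer * 100}%, CER: ${cer * 100}%`)
console.log(alignment)
```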
diff --git a/packages/transcription/src/transcript/transcript-file-interface.ts b/packages/transcription/src/transcript/transcript-file-interface.ts
new file mode 100644
index 000000000..d30b6913f
--- /dev/null
+++ b/packages/transcription/src/transcript/transcript-file-interface.ts
@@ -0,0 +1,3 @@
+export type TranscriptFormat = 'txt' | 'vtt' | 'srt' | 'json'
+
+export type TranscriptFileInterface = { path: string, language?: string, format: TranscriptFormat }
diff --git a/packages/transcription/src/transcript/transcript-file.ts b/packages/transcription/src/transcript/transcript-file.ts
new file mode 100644
index 000000000..36af9fa8f
--- /dev/null
+++ b/packages/transcription/src/transcript/transcript-file.ts
@@ -0,0 +1,88 @@
+import { statSync } from 'node:fs'
+import { readFile, writeFile } from 'node:fs/promises'
+import { extname } from 'node:path'
+import assert from 'node:assert'
+import { TranscriptFileInterface, TranscriptFormat } from './transcript-file-interface.js'
+import { TranscriptFileEvaluator } from './transcript-file-evaluator.js'
+import { srtToTxt } from '../subtitle.js'
+import { levenshteinDistance } from '../levenshtein.js'
+
+export class TranscriptFile implements TranscriptFileInterface {
+  path: string
+  language: string
+  format: TranscriptFormat = 'vtt'
+
+  constructor ({ path, language, format = 'vtt' }: { path: string, language: string, format?: TranscriptFormat }) {
+    statSync(path)
+
+    this.path = path
+    this.language = language
+    this.format = format
+  }
+
+  /**
+   * Asynchronously reads the entire contents of a transcript file.
+   * @see https://nodejs.org/docs/latest-v18.x/api/fs.html#filehandlereadfileoptions for options
+   */
+  async read (options: Parameters<typeof readFile>[1] = 'utf8') {
+    return await readFile(this.path, options)
+  }
+
+  static fromPath (path: string, language = 'en') {
+    const format = extname(path).substring(1)
+
+    const guessableFormats = [ 'txt', 'vtt', 'srt' ]
+    assert(
+      guessableFormats.includes(format),
+      `Couldn't guess transcript format from extension "${format}". Valid formats are: ${guessableFormats.join(', ')}.`)
+
+    return new TranscriptFile({ path, language, format: format as TranscriptFormat })
+  }
+
+  /**
+   * Write a transcript file to disk.
+   */
+  static async write ({
+    path,
+    content,
+    language = 'en',
+    format = 'vtt'
+  }: { path: string, content: string, language?: string, format?: TranscriptFormat }): Promise<TranscriptFile> {
+    await writeFile(path, content)
+
+    return new TranscriptFile({ path, language, format })
+  }
+
+  async equals (transcript: TranscriptFile, caseSensitive: boolean = true) {
+    if (this.language !== transcript.language) {
+      return false
+    }
+
+    const content = await this.read()
+    const transcriptContent = await transcript.read()
+
+    if (!caseSensitive) {
+      return String(content).toLowerCase() === String(transcriptContent).toLowerCase()
+    }
+
+    return content === transcriptContent
+  }
+
+  cer (transcript: TranscriptFile) {
+    return (new TranscriptFileEvaluator(this, transcript)).cer()
+  }
+
+  async evaluate (transcript: TranscriptFile) {
+    const evaluator = new TranscriptFileEvaluator(this, transcript)
+
+    return evaluator.evaluate()
+  }
+
+  async readAsTxt () {
+    return srtToTxt(String(await this.read()))
+  }
+
+  async distance (transcript: TranscriptFile) {
+    return levenshteinDistance(await this.readAsTxt(), await transcript.readAsTxt())
+  }
+}
diff --git a/packages/transcription/src/transcription-engine.ts b/packages/transcription/src/transcription-engine.ts
new file mode 100644
index 000000000..3174e3980
--- /dev/null
+++ b/packages/transcription/src/transcription-engine.ts
@@ -0,0 +1,23 @@
+import { ModelFormat } from './transcription-model.js'
+
+/**
+ * The engine, or framework.
+ */
+export class TranscriptionEngine {
+  name: string
+  description?: string
+  language?: string
+  requirements: string[]
+  type: 'binary' | 'bindings' | 'ws'
+  binary: string
+  license?: string
+  forgeURL?: string
+  supportedModelFormats: ModelFormat[]
+  languageDetection?: true
+  // There could be a default model.
+ // There could be a list of default models + + constructor (parameters: TranscriptionEngine) { + Object.assign(this, parameters) + } +} diff --git a/packages/transcription/src/transcription-model.ts b/packages/transcription/src/transcription-model.ts new file mode 100644 index 000000000..01f3bdd4a --- /dev/null +++ b/packages/transcription/src/transcription-model.ts @@ -0,0 +1,34 @@ +import assert from 'node:assert' +import { stat } from 'node:fs/promises' +import { parse } from 'node:path' + +export type ModelFormat = 'PyTorch' | 'GGML' | 'ONNX' | 'CTranslate2' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark + +export class TranscriptionModel { + name: string + format?: ModelFormat + path?: string + + // # - hparams + // # - Number of dimensions (int) + // # - Name length (int) + // # - Dimensions (int[n_dims]) + // # - Name (char[name_length]) + // # - Data (float[n_dims]) + + // # - mel filters + // # - tokenizer vocab + // # - model variables + + constructor (name: string, path?: string, format?: ModelFormat) { + this.name = name + this.path = path + this.format = format + } + + static async fromPath (path: string) { + assert(await stat(path), `${path} doesn't exist.`) + + return new TranscriptionModel(parse(path).name, path) + } +} diff --git a/packages/transcription/src/transcription-run.ts b/packages/transcription/src/transcription-run.ts new file mode 100644 index 000000000..608afbb6a --- /dev/null +++ b/packages/transcription/src/transcription-run.ts @@ -0,0 +1,41 @@ +import { buildSUUID, SUUID } from '@peertube/peertube-node-utils' +import { createLogger, Logger } from 'winston' + +export class TranscriptionRun { + uuid: SUUID + logger: Logger + + constructor (logger = createLogger(), uuid: SUUID = buildSUUID()) { + this.uuid = uuid + this.logger = logger + } + + get runId () { + return this.uuid + } + + start () { + performance.mark(this.getStartPerformanceMarkName()) + } + + stop () { + try { + performance.mark(this.getEndPerformanceMarkName()) + performance.measure( + this.runId, + this.getStartPerformanceMarkName(), + this.getEndPerformanceMarkName() + ) + } catch (e) { + this.logger.log({ level: 'error', message: e }) + } + } + + getStartPerformanceMarkName () { + return `${this.runId}-started` + } + + getEndPerformanceMarkName () { + return `${this.runId}-ended` + } +} diff --git a/packages/transcription/src/utils.ts b/packages/transcription/src/utils.ts new file mode 100644 index 000000000..735f11f13 --- /dev/null +++ b/packages/transcription/src/utils.ts @@ -0,0 +1,32 @@ +import { join, parse } from 'node:path' +import { createWriteStream } from 'node:fs' +import { lstat, unlink } from 'node:fs/promises' +import assert from 'node:assert' +import { $ } from 'execa' +import { makeFileRequest } from '@peertube/peertube-server-commands' + +export const downloadFile = async (url: string, targetDirectory: string) => { + const { base } = parse(url) + const filePath = join(targetDirectory, base) + + const fileStream = createWriteStream(filePath) + const stream = makeFileRequest(url).pipe(fileStream) + + return await new Promise((resolve: (filePath: string) => void, reject) => { + stream.on('finish', () => resolve(filePath)) + stream.on('error', async e => { + fileStream.close() + await unlink(filePath) + reject(e.message) + }) + }) +} + +export const unzip = async (zipFilePath: string) => { + assert(await lstat(zipFilePath).then(stats => stats.isFile()), `${zipFilePath} isn't a file.`) + const { dir, name } = parse(zipFilePath) + + await $`unzip -o ${zipFilePath} 
-d ${dir}`
+
+  return join(dir, name)
+}
diff --git a/packages/transcription/src/whisper/README.md b/packages/transcription/src/whisper/README.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/packages/transcription/src/whisper/engines.ts b/packages/transcription/src/whisper/engines.ts
new file mode 100644
index 000000000..711b098b7
--- /dev/null
+++ b/packages/transcription/src/whisper/engines.ts
@@ -0,0 +1,51 @@
+import { TranscriptionEngine } from '../transcription-engine.js'
+
+export const engines: TranscriptionEngine[] = [
+  {
+    name: 'whisper-cpp',
+    description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
+    type: 'binary',
+    binary: 'main',
+    language: 'cpp',
+    requirements: [],
+    forgeURL: 'https://github.com/ggerganov/whisper.cpp',
+    license: 'MIT',
+    supportedModelFormats: [ 'ONNX' ]
+  },
+  {
+    name: 'openai-whisper',
+    description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
+    requirements: [ 'python', 'pyTorch', 'ffmpeg' ],
+    language: 'python',
+    type: 'binary',
+    binary: 'whisper',
+    forgeURL: 'https://github.com/openai/whisper',
+    license: 'MIT',
+    supportedModelFormats: [ 'PyTorch' ],
+    languageDetection: true
+  },
+  {
+    name: 'whisper-ctranslate2',
+    description: '',
+    requirements: [ 'python' ],
+    language: 'python',
+    type: 'binary',
+    binary: 'whisper-ctranslate2',
+    forgeURL: 'https://github.com/openai/whisper',
+    license: 'MIT',
+    supportedModelFormats: [ 'CTranslate2' ],
+    languageDetection: true
+  },
+  {
+    name: 'whisper-timestamped',
+    description: '',
+    requirements: [ 'python' ],
+    language: 'python',
+    type: 'binary',
+    binary: 'whisper_timestamped',
+    forgeURL: 'https://github.com/openai/whisper',
+    license: 'MIT',
+    supportedModelFormats: [ 'CTranslate2' ],
+    languageDetection: true
+  }
+]
diff --git a/packages/transcription/src/whisper/index.ts b/packages/transcription/src/whisper/index.ts
new file mode 100644
index 000000000..ee9cae725
--- /dev/null
+++ b/packages/transcription/src/whisper/index.ts
@@ -0,0 +1,3 @@
+export * from './transcriber/index.js'
+export * from './engines.js'
+export * from './whisper-builtin-model.js'
diff --git a/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts
new file mode 100644
index 000000000..01b9739a3
--- /dev/null
+++ b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts
@@ -0,0 +1,49 @@
+import { $ } from 'execa'
+import { buildSUUID } from '@peertube/peertube-node-utils'
+import { lstat } from 'node:fs/promises'
+import { OpenaiTranscriber, WhisperTranscribeArgs } from './openai-transcriber.js'
+import { TranscriptFile } from '../../transcript/index.js'
+import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
+import assert from 'node:assert'
+
+export class Ctranslate2Transcriber extends OpenaiTranscriber {
+  async transcribe ({
+    mediaFilePath,
+    model = new WhisperBuiltinModel('tiny'),
+    language,
+    format = 'vtt',
+    runId = buildSUUID()
+  }: WhisperTranscribeArgs): Promise<TranscriptFile> {
+    this.assertLanguageDetectionAvailable(language)
+
+    const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })
+
+    if (model.path) {
+      assert(await lstat(model.path).then(stats => stats.isDirectory()), 'Model path must be a path to a directory.')
+    }
+
+    const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ]
+    const languageArgs = language ? [ '--language', language ] : []
+
+    this.createRun(runId)
+    this.startRun()
+    await $$`${this.engine.binary} ${[
+      mediaFilePath,
+      ...modelArgs,
+      '--word_timestamps',
+      'True',
+      '--output_format',
+      'all',
+      '--output_dir',
+      this.transcriptDirectory,
+      ...languageArgs
+    ]}`
+    this.stopRun()
+
+    return new TranscriptFile({
+      language: language || await this.getDetectedLanguage(mediaFilePath),
+      path: this.getTranscriptFilePath(mediaFilePath, format),
+      format
+    })
+  }
+}
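Worth noting in the diff above: when a model path is given, whisper-ctranslate2 expects a CTranslate2 model directory rather than a PyTorch `.pt` file, hence the `isDirectory()` assertion and the `--model_directory` flag. A usage sketch; the `./models/whisper-tiny-ct2` path is hypothetical, e.g. a directory produced by a converter such as `ct2-transformers-converter`:

```typescript
import { transcriberFactory, TranscriptionModel } from '@peertube/peertube-transcription'

const transcriber = transcriberFactory.createFromEngineName('whisper-ctranslate2')

// Hypothetical directory holding a converted CTranslate2 model
const model = await TranscriptionModel.fromPath('./models/whisper-tiny-ct2')

const transcriptFile = await transcriber.transcribe({
  mediaFilePath: './myVideo.mp4',
  model,
  language: 'fr',
  format: 'vtt'
})
```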
diff --git a/packages/transcription/src/whisper/transcriber/index.ts b/packages/transcription/src/whisper/transcriber/index.ts
new file mode 100644
index 000000000..950c39b07
--- /dev/null
+++ b/packages/transcription/src/whisper/transcriber/index.ts
@@ -0,0 +1,3 @@
+export * from './ctranslate2-transcriber.js'
+export * from './openai-transcriber.js'
+export * from './timestamped-transcriber.js'
diff --git a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts
new file mode 100644
index 000000000..5d9a7ce85
--- /dev/null
+++ b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts
@@ -0,0 +1,62 @@
+import { join } from 'path'
+import { $ } from 'execa'
+import { buildSUUID } from '@peertube/peertube-node-utils'
+import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
+import { AbstractTranscriber, TranscribeArgs } from '../../abstract-transcriber.js'
+import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
+import { TranscriptionModel } from '../../transcription-model.js'
+import { readFile } from 'node:fs/promises'
+import { parse } from 'node:path'
+
+export type WhisperTranscribeArgs = Omit<TranscribeArgs, 'model'> & { model?: TranscriptionModel }
+
+export class OpenaiTranscriber extends AbstractTranscriber {
+  async transcribe ({
+    mediaFilePath,
+    model = new WhisperBuiltinModel('tiny'),
+    language,
+    format = 'vtt',
+    runId = buildSUUID()
+  }: WhisperTranscribeArgs): Promise<TranscriptFile> {
+    this.assertLanguageDetectionAvailable(language)
+
+    const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })
+    const languageArgs = language ? [ '--language', language ] : []
+
+    this.createRun(runId)
+    this.startRun()
+    await $$`${this.engine.binary} ${[
+      mediaFilePath,
+      '--word_timestamps',
+      'True',
+      '--model',
+      model?.path || model.name,
+      '--output_format',
+      'all',
+      '--output_dir',
+      this.transcriptDirectory,
+      ...languageArgs
+    ]}`
+    this.stopRun()
+
+    return new TranscriptFile({
+      language: language || await this.getDetectedLanguage(mediaFilePath),
+      path: this.getTranscriptFilePath(mediaFilePath, format),
+      format
+    })
+  }
+
+  async getDetectedLanguage (mediaFilePath: string) {
+    const { language } = await this.readJsonTranscriptFile(mediaFilePath)
+
+    return language
+  }
+
+  async readJsonTranscriptFile (mediaFilePath: string) {
+    return JSON.parse(await readFile(this.getTranscriptFilePath(mediaFilePath, 'json'), 'utf8'))
+  }
+
+  getTranscriptFilePath (mediaFilePath: string, format: TranscriptFormat) {
+    return join(this.transcriptDirectory, `${parse(mediaFilePath).name}.${format}`)
+  }
+}
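Because the CLI is invoked with `--output_format all`, every format (including `json`) lands in `transcriptDirectory`, and `getDetectedLanguage()` simply reads the detected language back from that JSON sibling. A sketched usage, reusing the `transcriber` from the README example above:

```typescript
// Omitting `language` is allowed here since the engine declares languageDetection: true
const transcriptFile = await transcriber.transcribe({
  mediaFilePath: './myVideo.mp4',
  format: 'vtt'
})

// Filled from the language recorded in myVideo.json next to the other outputs
console.log(transcriptFile.language)
```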
diff --git a/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts
new file mode 100644
index 000000000..fcdd33eb1
--- /dev/null
+++ b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts
@@ -0,0 +1,55 @@
+import { $ } from 'execa'
+import { buildSUUID } from '@peertube/peertube-node-utils'
+import assert from 'node:assert'
+import { join, parse } from 'node:path'
+import { existsSync } from 'node:fs'
+import { rename } from 'node:fs/promises'
+import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
+import { OpenaiTranscriber, WhisperTranscribeArgs } from './openai-transcriber.js'
+import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
+
+export class WhisperTimestampedTranscriber extends OpenaiTranscriber {
+  async transcribe ({
+    mediaFilePath,
+    model = new WhisperBuiltinModel('tiny'),
+    language,
+    format = 'vtt',
+    runId = buildSUUID()
+  }: WhisperTranscribeArgs): Promise<TranscriptFile> {
+    this.assertLanguageDetectionAvailable(language)
+
+    const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })
+    const languageArgs = language ? [ '--language', language ] : []
+
+    this.createRun(runId)
+    this.startRun()
+    await $$`${this.engine.binary} ${[
+      mediaFilePath,
+      '--model',
+      model?.path || model.name,
+      '--output_format',
+      'all',
+      '--output_dir',
+      this.transcriptDirectory,
+      ...languageArgs
+    ]}`
+    this.stopRun()
+
+    const internalTranscriptPath = this.getTranscriptFilePath(mediaFilePath, format, false)
+    const transcriptPath = join(this.transcriptDirectory, `${parse(mediaFilePath).name}.${format}`)
+    // Whisper timestamped outputs files keeping the media file extension by default, e.g. video.mp4.vtt
+    // @see https://github.com/linto-ai/whisper-timestamped/issues/189
+    assert(existsSync(internalTranscriptPath), `${internalTranscriptPath} file doesn't exist.`)
+    await rename(internalTranscriptPath, transcriptPath)
+
+    return new TranscriptFile({
+      language: language || await this.getDetectedLanguage(mediaFilePath),
+      path: transcriptPath,
+      format
+    })
+  }
+
+  getTranscriptFilePath (mediaFilePath: string, format: TranscriptFormat, words = true) {
+    return join(this.transcriptDirectory, `${parse(mediaFilePath).base}${words ? 
'.words' : ''}.${format}`) + } +} diff --git a/packages/transcription/src/whisper/whisper-builtin-model.ts b/packages/transcription/src/whisper/whisper-builtin-model.ts new file mode 100644 index 000000000..32981ad20 --- /dev/null +++ b/packages/transcription/src/whisper/whisper-builtin-model.ts @@ -0,0 +1,11 @@ +import { TranscriptionModel } from '../transcription-model.js' + +export type WhisperBuiltinModelName = 'tiny' | 'base' | 'small' | 'medium' | 'large' | 'large-v2' | 'large-v3' + +export class WhisperBuiltinModel extends TranscriptionModel { + + // eslint-disable-next-line @typescript-eslint/no-useless-constructor + constructor (name: WhisperBuiltinModelName) { + super(name) + } +} diff --git a/packages/transcription/tsconfig.json b/packages/transcription/tsconfig.json new file mode 100644 index 000000000..94971d65a --- /dev/null +++ b/packages/transcription/tsconfig.json @@ -0,0 +1,15 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "outDir": "./dist", + "rootDir": "src", + "tsBuildInfoFile": "./dist/.tsbuildinfo" + }, + "references": [ + { "path": "../models" }, + { "path": "../core-utils" }, + { "path": "../node-utils" }, + { "path": "../jiwer" }, + { "path": "../server-commands" } + ] +} diff --git a/packages/transcription/tsconfig.types.json b/packages/transcription/tsconfig.types.json new file mode 100644 index 000000000..9edb53ece --- /dev/null +++ b/packages/transcription/tsconfig.types.json @@ -0,0 +1,10 @@ +{ + "extends": "./tsconfig.json", + "compilerOptions": { + "outDir": "../types-generator/dist/peertube-transcription", + "tsBuildInfoFile": "../types-generator/dist/peertube-transcription/.tsbuildinfo", + "stripInternal": true, + "removeComments": false, + "emitDeclarationOnly": true + } +} diff --git a/scripts/ci.sh b/scripts/ci.sh index 3d29b7ae9..8d483b42c 100755 --- a/scripts/ci.sh +++ b/scripts/ci.sh @@ -146,4 +146,13 @@ elif [ "$1" = "lint" ]; then npm run swagger-cli -- validate support/doc/api/openapi.yaml ( cd client && npm run lint ) +elif [ "$1" = "transcription" ]; then + npm run preinstall --workspace=@peertube/peertube-transcription --workspace=@peertube/peertube-jiwer + npm run build:server + npm run build:tests + + transcriptionFiles=$(findTestFiles ./packages/tests/dist/transcription) + jiwerFiles=$(findTestFiles ./packages/tests/dist/jiwer) + + MOCHA_PARALLEL=true runJSTest "$1" $((3*$speedFactor)) $transcriptionFiles $jiwerFiles fi diff --git a/server/tsconfig.json b/server/tsconfig.json index 21442d082..ed0bfca48 100644 --- a/server/tsconfig.json +++ b/server/tsconfig.json @@ -14,6 +14,7 @@ { "path": "../packages/ffmpeg" }, { "path": "../packages/models" }, { "path": "../packages/node-utils" }, + { "path": "../packages/transcription" }, { "path": "../packages/typescript-utils" } ], "include": [ diff --git a/tsconfig.eslint.json b/tsconfig.eslint.json index c2e868173..61542e14e 100644 --- a/tsconfig.eslint.json +++ b/tsconfig.eslint.json @@ -24,9 +24,11 @@ { "path": "./apps/peertube-cli" }, { "path": "./packages/core-utils" }, { "path": "./packages/ffmpeg" }, + { "path": "./packages/jiwer" }, { "path": "./packages/models" }, { "path": "./packages/node-utils" }, { "path": "./packages/server-commands" }, + { "path": "./packages/transcription" }, { "path": "./packages/typescript-utils" } ] }