diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 32211545c..04bff26aa 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -39,7 +39,7 @@ jobs: strategy: fail-fast: false matrix: - test_suite: [ types-package, client, api-1, api-2, api-3, api-4, api-5, cli-plugin, lint, external-plugins ] + test_suite: [ types-package, client, api-1, api-2, api-3, api-4, api-5, transcription, cli-plugin, lint, external-plugins ] env: PGUSER: peertube diff --git a/.gitignore b/.gitignore index 55707fb80..6865442eb 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ node_modules *npm-debug.log yarn-error.log +*-ci.log .yarn # Testing diff --git a/apps/peertube-runner/tsconfig.json b/apps/peertube-runner/tsconfig.json index 03660b0eb..776e1ab5c 100644 --- a/apps/peertube-runner/tsconfig.json +++ b/apps/peertube-runner/tsconfig.json @@ -11,6 +11,7 @@ { "path": "../../packages/ffmpeg" }, { "path": "../../packages/models" }, { "path": "../../packages/node-utils" }, - { "path": "../../packages/server-commands" } + { "path": "../../packages/server-commands" }, + { "path": "../../packages/transcription" }, ] } diff --git a/package.json b/package.json index cbabdfc35..e04151785 100644 --- a/package.json +++ b/package.json @@ -25,6 +25,7 @@ ], "scripts": { "benchmark-server": "tsx --conditions=peertube:tsx ./scripts/benchmark.ts", + "benchmark-transcription": "tsx --conditions=peertube:tsx --tsconfig ./packages/transcription/tsconfig.json ./packages/transcription/src/benchmark.ts", "build:client": "bash ./scripts/build/client.sh", "build:embed": "bash ./scripts/build/embed.sh", "build:peertube-cli": "bash ./scripts/build/peertube-cli.sh", diff --git a/packages/core-utils/src/common/date.ts b/packages/core-utils/src/common/date.ts index bddd5da23..ffb959868 100644 --- a/packages/core-utils/src/common/date.ts +++ b/packages/core-utils/src/common/date.ts @@ -125,7 +125,7 @@ function secondsToTime (options: { else if (minutes >= 1) time += formatNumber(minutes) + minuteSymbol else if (format === 'full') time += '00' + minuteSymbol - seconds %= 60 + seconds = Math.round(seconds) % 60 if (seconds >= 1 && seconds < 10 && format === 'full') time += '0' + seconds + secondsSymbol else if (seconds >= 1) time += formatNumber(seconds) + secondsSymbol else if (format === 'full') time += '00' @@ -133,6 +133,14 @@ function secondsToTime (options: { return time } +function millisecondsToTime (options: { + seconds: number + format: 'short' | 'full' | 'locale-string' // default 'short' + symbol?: string +} | number) { + return secondsToTime(typeof options === 'number' ? options / 1000 : { ...options, seconds: options.seconds / 1000 }) +} + // --------------------------------------------------------------------------- export { @@ -143,7 +151,8 @@ export { isLastMonth, isLastWeek, timeToInt, - secondsToTime + secondsToTime, + millisecondsToTime } // --------------------------------------------------------------------------- diff --git a/packages/jiwer/README.md b/packages/jiwer/README.md new file mode 100644 index 000000000..663c399bf --- /dev/null +++ b/packages/jiwer/README.md @@ -0,0 +1,37 @@ +JiWER +===== +__JiWER__ CLI NodeJs wrapper. 
+
+> *JiWER is a python tool for computing the word-error-rate of ASR systems.*
+> https://jitsi.github.io/jiwer/cli/
+
+__JiWER__ serves as a reference implementation to calculate error rates between two text files:
+- WER (Word Error Rate)
+- CER (Character Error Rate)
+
+Build
+-----
+
+```sh
+npm run build
+```
+
+Usage
+-----
+```typescript
+import { JiwerClI } from '@peertube/peertube-jiwer'
+
+const jiwerCLI = new JiwerClI('./reference.txt', './hypothesis.txt')
+
+// WER as a percentage, ex: 0.03 -> 3%
+console.log(await jiwerCLI.wer())
+
+// CER as a percentage, ex: 0.01 -> 1%
+console.log(await jiwerCLI.cer())
+
+// Detailed comparison report
+console.log(await jiwerCLI.alignment())
+```
+
+Resources
+---------
+- https://jitsi.github.io/jiwer/
+- https://github.com/rapidfuzz/RapidFuzz
diff --git a/packages/jiwer/package.json b/packages/jiwer/package.json
new file mode 100644
index 000000000..b01476956
--- /dev/null
+++ b/packages/jiwer/package.json
@@ -0,0 +1,20 @@
+{
+  "name": "@peertube/peertube-jiwer",
+  "private": true,
+  "version": "0.0.0",
+  "main": "dist/index.js",
+  "files": [ "dist" ],
+  "exports": {
+    "types": "./dist/index.d.ts",
+    "peertube:tsx": "./src/index.ts",
+    "default": "./dist/index.js"
+  },
+  "type": "module",
+  "devDependencies": {},
+  "scripts": {
+    "preinstall": "pip install -r requirements.txt",
+    "build": "tsc",
+    "watch": "tsc -w"
+  },
+  "dependencies": {}
+}
diff --git a/packages/jiwer/requirements.txt b/packages/jiwer/requirements.txt
new file mode 100644
index 000000000..53d9fb34b
--- /dev/null
+++ b/packages/jiwer/requirements.txt
@@ -0,0 +1 @@
+jiwer==3.0.4
diff --git a/packages/jiwer/src/index.ts b/packages/jiwer/src/index.ts
new file mode 100644
index 000000000..586fd0843
--- /dev/null
+++ b/packages/jiwer/src/index.ts
@@ -0,0 +1 @@
+export * from './jiwer-cli.js'
diff --git a/packages/jiwer/src/jiwer-cli.ts b/packages/jiwer/src/jiwer-cli.ts
new file mode 100644
index 000000000..8ce87f2b1
--- /dev/null
+++ b/packages/jiwer/src/jiwer-cli.ts
@@ -0,0 +1,69 @@
+import { $ } from 'execa'
+
+export class JiwerClI {
+  referenceFilePath: string
+  hypothesisFilePath: string
+
+  constructor (referenceFilePath: string, hypothesisFilePath: string) {
+    this.referenceFilePath = referenceFilePath
+    this.hypothesisFilePath = hypothesisFilePath
+  }
+
+  /**
+   * @param referenceFilePath Path to new-line delimited text file of reference sentences.
+   * @param hypothesisFilePath Path to new-line delimited text file of hypothesis sentences.
+   * @param args Additional jiwer CLI flags, appended verbatim (e.g. '--cer', '-g')
+   */
+  static buildArgs (referenceFilePath: string, hypothesisFilePath: string, ...args: (string | false)[]) {
+    return [
+      '--reference',
+      referenceFilePath,
+      '--hypothesis',
+      hypothesisFilePath,
+      // Drop falsy flags such as the `global && '-g'` pattern used below: execa rejects non-string values
+      ...args.filter((arg): arg is string => arg !== false)
+    ]
+  }
+
+  buildArgs (...args: (string | false)[]) {
+    return JiwerClI.buildArgs(this.referenceFilePath, this.hypothesisFilePath, ...args)
+  }
+
+  /**
+   * WER: Word Error Rate as a percentage, ex: 0.03 -> 3%
+   */
+  static async wer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
+    const { stdout: wer } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, global && '-g')}`
+
+    return Number(wer)
+  }
+
+  async wer (global = true) {
+    return await JiwerClI.wer(this.referenceFilePath, this.hypothesisFilePath, global)
+  }
+
+  /**
+   * CER: Character Error Rate as a percentage, ex: 0.01 -> 1%
+   */
+  static async cer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
+    const { stdout: cer } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--cer', global && '-g')}`
+
+    return Number(cer)
+  }
+
+  async cer (global = true) {
+    return await JiwerClI.cer(this.referenceFilePath, this.hypothesisFilePath, global)
+  }
+
+  /**
+   * Print alignment of each sentence.
+   */
+  static async alignment (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<string> {
+    const { stdout: alignment } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--align', global && '-g')}`
+
+    return alignment
+  }
+
+  async alignment (global = true) {
+    return await JiwerClI.alignment(this.referenceFilePath, this.hypothesisFilePath, global)
+  }
+}
diff --git a/packages/jiwer/tsconfig.json b/packages/jiwer/tsconfig.json
new file mode 100644
index 000000000..58fa2330b
--- /dev/null
+++ b/packages/jiwer/tsconfig.json
@@ -0,0 +1,8 @@
+{
+  "extends": "../../tsconfig.base.json",
+  "compilerOptions": {
+    "outDir": "./dist",
+    "rootDir": "src",
+    "tsBuildInfoFile": "./dist/.tsbuildinfo"
+  }
+}
diff --git a/packages/node-utils/src/uuid.ts b/packages/node-utils/src/uuid.ts
index f158ec487..68110eb0e 100644
--- a/packages/node-utils/src/uuid.ts
+++ b/packages/node-utils/src/uuid.ts
@@ -1,4 +1,4 @@
-import short from 'short-uuid'
+import short, { SUUID } from 'short-uuid'
 
 const translator = short()
 
@@ -6,6 +6,10 @@ function buildUUID () {
   return short.uuid()
 }
 
+function buildSUUID (): SUUID {
+  return short.generate()
+}
+
 function uuidToShort (uuid: string) {
   if (!uuid) return uuid
 
@@ -26,7 +30,10 @@ function isShortUUID (value: string) {
 
 export {
   buildUUID,
+  buildSUUID,
   uuidToShort,
   shortToUUID,
   isShortUUID
 }
+
+export type { SUUID }
diff --git a/packages/server-commands/src/requests/requests.ts b/packages/server-commands/src/requests/requests.ts
index 9e77ff282..49b7f9ce6 100644
--- a/packages/server-commands/src/requests/requests.ts
+++ b/packages/server-commands/src/requests/requests.ts
@@ -59,6 +59,15 @@ export function makeRawRequest (options: {
   return makeGetRequest(reqOptions)
 }
 
+export const makeFileRequest = (url: string) => {
+  return makeRawRequest({
+    url,
+    responseType: 'arraybuffer',
+    redirects: 1,
+    expectedStatus: HttpStatusCode.OK_200
+  })
+}
+
 export function makeGetRequest (options: CommonRequestParams & {
   query?: any
   rawQuery?: string
diff --git a/packages/tests/fixtures/transcription/hello_world.zip b/packages/tests/fixtures/transcription/hello_world.zip
new file mode 100644
index 000000000..c99f72a38
Binary files /dev/null and b/packages/tests/fixtures/transcription/hello_world.zip differ
diff --git a/packages/tests/fixtures/transcription/videos/README.md b/packages/tests/fixtures/transcription/videos/README.md new file mode 100644 index 000000000..b892a7553 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/README.md @@ -0,0 +1,16 @@ +đŸ‡«đŸ‡· DRANE Occitanie - Communiquer lors d'une classe transplantĂ©e +[./communiquer-lors-dune-classe-transplantee.mp4](communiquer-lors-dune-classe-transplantee.mp4) +> https://podeduc.apps.education.fr/numerique-educatif/video/21893-communiquer-lors-dune-classe-transplantee/ +> +> CC BY-NC-SA 4.0 Deed +> Attribution-NonCommercial-ShareAlike 4.0 International + +đŸ‡«đŸ‡· [Accompagner la victime d'une dĂ©rive sectaire ou d'une emprise mentale](https://www.fun-mooc.fr/fr/cours/accompagner-la-victime-de-derive-sectaire/) +> Centre Contre les Manipulations Mentales (CCMM) +> [CC BY-NC-ND 4.0 Deed](https://creativecommons.org/licenses/by-nc-nd/4.0/) +> Attribution-NonCommercial-NoDerivs 4.0 International + +đŸ‡ș🇾 [The Last Man On Earth (1964)](https://archive.org/details/TheLastManOnEarthHD) +> PDM 1.0 Deed +> Public Domain Mark 1.0 Universal +> https://creativecommons.org/publicdomain/mark/1.0/ diff --git a/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.mp4 b/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.mp4 new file mode 100644 index 000000000..4ef5b6ee1 Binary files /dev/null and b/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.mp4 differ diff --git a/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.txt b/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.txt new file mode 100644 index 000000000..1c700efa1 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.txt @@ -0,0 +1,10 @@ +Communiquer lors d'une classe transplantĂ©e. Utiliser les photos prises lors de cette classe pour raconter quotidiennement le sĂ©jour vĂ©cu. +C'est le scĂ©nario pĂ©dagogique prĂ©sentĂ© par Monsieur Navoli, professeur en cycle 3 sur une Ă©cole Ă©lĂ©mentaire de Montpellier. +La premiĂšre application utilisĂ©e sera la mĂ©diathĂšque. L'enseignant va alors transfĂ©rer les diffĂ©rentes photos rĂ©alisĂ©es lors de la classe transplantĂ©e. +Dans un dossier spĂ©cifique pour que les Ă©lĂšves puissent le retrouver plus facilement. Il tĂ©lĂ©verse donc ses photos dans le dossier, dans l'ENT, dans la mĂ©diathĂšque de la classe. +Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser. +Les Ă©lĂšves par la suite utiliseront le blog, Ă  partir de leurs notes, il pourront, seul ou Ă  2 par poste rĂ©diger un article dans leur ENT. +Ils illustreront ces articles Ă  l'aide des photos et documents numĂ©riques mis en accĂšs libre dans l'ENT. +Pour ce faire, il pourront utiliser l'Ă©diteur avancĂ© qui les renverra directement dans la mĂ©diathĂšque de la classe, oĂč ils pourront retrouver le dossier crĂ©Ă© par leur enseignant. +Une fois leur article terminĂ©, les Ă©lĂšves soumettront celui-ci au professeur qui pourra soit l'annoter pour correction ou le publier. +Ensuite, il pourront lire et commenter ceux de leurs camarades, ou rĂ©pondre aux commentaires de la veille. 
diff --git a/packages/tests/fixtures/transcription/videos/derive_sectaire.mp4 b/packages/tests/fixtures/transcription/videos/derive_sectaire.mp4 new file mode 100644 index 000000000..f6969a62b Binary files /dev/null and b/packages/tests/fixtures/transcription/videos/derive_sectaire.mp4 differ diff --git a/packages/tests/fixtures/transcription/videos/derive_sectaire.srt b/packages/tests/fixtures/transcription/videos/derive_sectaire.srt new file mode 100644 index 000000000..d7d14d2e4 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/derive_sectaire.srt @@ -0,0 +1,165 @@ +ï»ż +1 +00:00:03,640 --> 00:00:05,640 +-Bonjour et bienvenue sur FUN MOOC. + +2 +00:00:05,960 --> 00:00:09,000 +Notre MOOC "Comment parler +Ă  une victime d'emprise mentale + +3 +00:00:09,320 --> 00:00:10,400 +ou de dĂ©rive sectaire" + +4 +00:00:10,720 --> 00:00:13,840 +s'adresse Ă  tout professionnel +du domaine de la santĂ©, + +5 +00:00:14,160 --> 00:00:15,920 +de l'associatif, du juridique, + +6 +00:00:16,240 --> 00:00:18,800 +qui pourra ĂȘtre en contact +avec une victime de telles dĂ©rives. + +7 +00:00:21,720 --> 00:00:23,840 +Il sera composĂ© de 14 leçons vidĂ©o + +8 +00:00:24,160 --> 00:00:26,040 +d'une dizaine de minutes + +9 +00:00:26,360 --> 00:00:28,600 +divisĂ©es en quatre blocs. + +10 +00:00:31,800 --> 00:00:34,960 +Le premier bloc vous informera +de ce que sont exactement + +11 +00:00:35,280 --> 00:00:37,720 +l'emprise mentale +et une dĂ©rive sectaire. + +12 +00:00:38,040 --> 00:00:42,440 +-Ça consiste toujours +en une forme de manipulation + +13 +00:00:43,520 --> 00:00:47,320 +qui conduit Ă  une dĂ©pendance, +Ă  une sorte de cercle vicieux, + +14 +00:00:47,640 --> 00:00:51,200 +oĂč les personnes ne parviennent pas +Ă  se dĂ©sengager d'un processus + +15 +00:00:51,520 --> 00:00:54,120 +qui les conduit +soit Ă  donner de l'argent, + +16 +00:00:54,440 --> 00:00:56,160 +Ă  se livrer Ă  des actes + +17 +00:00:56,480 --> 00:00:58,480 +qu'en rĂ©alitĂ© +ils n'auraient pas acceptĂ©s, + +18 +00:00:58,800 --> 00:01:02,160 +ou, tout simplement, Ă  accepter +de participer Ă  une organisation + +19 +00:01:02,480 --> 00:01:03,760 +dont ils ne partagent pas + +20 +00:01:04,080 --> 00:01:06,040 +toutes les mĂ©thodes +ou tous les points de vue. + +21 +00:01:06,360 --> 00:01:10,080 +-Le deuxiĂšme bloc vous informera +des bonnes techniques d'Ă©coute + +22 +00:01:10,400 --> 00:01:12,680 +d'une personne +ayant vĂ©cu de tels traumatismes. + +23 +00:01:13,000 --> 00:01:14,760 +-C'est un sujet actuel + +24 +00:01:15,080 --> 00:01:17,320 +parce que ce phĂ©nomĂšne +est en croissance. + +25 +00:01:17,640 --> 00:01:20,000 +Il y a une augmentation trĂšs importante, +un doublement, + +26 +00:01:20,320 --> 00:01:21,400 +en l'espace de quelques annĂ©es, + +27 +00:01:21,720 --> 00:01:22,960 +en moins de 10 ans. + +28 +00:01:27,200 --> 00:01:31,000 +-Le bloc 3, lui, +sera conçu par nos juristes + +29 +00:01:31,320 --> 00:01:34,080 +pour vous indiquer +quelles sont les grandes infractions + +30 +00:01:34,400 --> 00:01:36,960 +en lien avec l'emprise mentale, + +31 +00:01:37,280 --> 00:01:39,120 +et surtout, pouvoir faire +une analyse perspicace + +32 +00:01:39,440 --> 00:01:41,640 +d'une situation individuelle. + +33 +00:01:43,760 --> 00:01:46,960 +Enfin, le bloc 4 vous assistera + +34 +00:01:47,280 --> 00:01:50,320 +pour savoir comment aiguiller +une victime + +35 +00:01:50,640 --> 00:01:52,400 +vers les bons professionnels. + +36 +00:01:53,160 --> 00:01:54,040 +Bonne formation. 
+ diff --git a/packages/tests/fixtures/transcription/videos/derive_sectaire.txt b/packages/tests/fixtures/transcription/videos/derive_sectaire.txt new file mode 100644 index 000000000..4f85cd324 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/derive_sectaire.txt @@ -0,0 +1,11 @@ +ï»ż-Bonjour et bienvenue sur FUN MOOC. +Notre MOOC "Comment parler Ă  une victime d'emprise mentale ou de dĂ©rive sectaire" s'adresse Ă  tout professionnel du domaine de la santĂ©, de l'associatif, du juridique, qui pourra ĂȘtre en contact avec une victime de telles dĂ©rives. +Il sera composĂ© de 14 leçons vidĂ©o d'une dizaine de minutes divisĂ©es en quatre blocs. +Le premier bloc vous informera de ce que sont exactement l'emprise mentale et une dĂ©rive sectaire. +-Ça consiste toujours en une forme de manipulation qui conduit Ă  une dĂ©pendance, Ă  une sorte de cercle vicieux, oĂč les personnes ne parviennent pas Ă  se dĂ©sengager d'un processus qui les conduit soit Ă  donner de l'argent, Ă  se livrer Ă  des actes qu'en rĂ©alitĂ© ils n'auraient pas acceptĂ©s, ou, tout simplement, Ă  accepter de participer Ă  une organisation dont ils ne partagent pas toutes les mĂ©thodes ou tous les points de vue. +-Le deuxiĂšme bloc vous informera des bonnes techniques d'Ă©coute d'une personne ayant vĂ©cu de tels traumatismes. +-C'est un sujet actuel parce que ce phĂ©nomĂšne est en croissance. +Il y a une augmentation trĂšs importante, un doublement, en l'espace de quelques annĂ©es, en moins de 10 ans. +-Le bloc 3, lui, sera conçu par nos juristes pour vous indiquer quelles sont les grandes infractions en lien avec l'emprise mentale, et surtout, pouvoir faire une analyse perspicace d'une situation individuelle. +Enfin, le bloc 4 vous assistera pour savoir comment aiguiller une victime vers les bons professionnels. +Bonne formation. \ No newline at end of file diff --git a/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.mp4 b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.mp4 new file mode 100644 index 000000000..45ef4325e Binary files /dev/null and b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.mp4 differ diff --git a/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.srt b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.srt new file mode 100644 index 000000000..d1ceebe10 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.srt @@ -0,0 +1,17 @@ +1 +00:00:00,000 --> 00:00:01,940 +December, 1965. + +2 +00:00:03,460 --> 00:00:06,660 +Is that all it has been since +I inherited the world? + +3 +00:00:07,020 --> 00:00:08,900 +Only three years. + +4 +00:00:09,940 --> 00:00:11,760 +Seems like a hundred million. + diff --git a/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.txt b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.txt new file mode 100644 index 000000000..2a8ed1a11 --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.txt @@ -0,0 +1,5 @@ +December, 1965. +Is that all it has been since +I inherited the world? +Only three years. +It seems like a hundred million. 
diff --git a/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.vtt b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.vtt new file mode 100644 index 000000000..62888b29c --- /dev/null +++ b/packages/tests/fixtures/transcription/videos/the_last_man_on_earth.vtt @@ -0,0 +1,14 @@ +WEBVTT + +00:00.000 --> 00:01.940 +December, 1965. + +00:03.460 --> 00:06.660 +Is that all it has been since I inherited the world? + +00:07.020 --> 00:08.900 +Only three years. + +00:09.940 --> 00:11.760 +Seems like a hundred million. + diff --git a/packages/tests/src/core-utils/date.ts b/packages/tests/src/core-utils/date.ts new file mode 100644 index 000000000..90600187e --- /dev/null +++ b/packages/tests/src/core-utils/date.ts @@ -0,0 +1,29 @@ +import { millisecondsToTime, secondsToTime } from '@peertube/peertube-core-utils' +import { expect } from 'chai' + +describe('Seconds to time', function () { + it('Outputs a human readable time', function () { + expect(secondsToTime(61.1335)).to.equals('1m1s') + }) + + it('Rounds the number of seconds to the nearest integer', function () { + expect(secondsToTime(61.4)).to.equals('1m1s') + expect(secondsToTime(61.6)).to.equals('1m2s') + expect(secondsToTime(61.51)).to.equals('1m2s') + }) +}) + +describe('Milliseconds to time', function () { + it('Outputs a human readable time', function () { + expect(millisecondsToTime(60_000)).to.equals('1m') + }) + + it('Rounds the number of seconds to the nearest integer', function () { + expect(millisecondsToTime(60_100)).to.equals('1m') + expect(millisecondsToTime(60_501)).to.equals('1m1s') + }) + + it('Time inferior to 500ms appears as empty string', function () { + expect(millisecondsToTime(499)).to.equals('') + }) +}) diff --git a/packages/tests/src/jiwer/jiwer-cli.spec.ts b/packages/tests/src/jiwer/jiwer-cli.spec.ts new file mode 100644 index 000000000..ccc2ab26b --- /dev/null +++ b/packages/tests/src/jiwer/jiwer-cli.spec.ts @@ -0,0 +1,48 @@ +/* eslint-disable max-len */ +import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils' +import { join } from 'path' +import { mkdir, rm, writeFile } from 'node:fs/promises' +import { expect } from 'chai' +import { JiwerClI } from '@peertube/peertube-jiwer' + +describe('Jiwer CLI', function () { + const transcriptDirectory = buildAbsoluteFixturePath('transcription/transcript-evaluator') + const referenceTranscriptFilePath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.txt') + const hypothesis = join(transcriptDirectory, 'openai.txt') + const jiwerCLI = new JiwerClI(referenceTranscriptFilePath, hypothesis) + + before(async function () { + await mkdir(transcriptDirectory, { recursive: true }) + await writeFile(join(transcriptDirectory, 'openai.txt'), `Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le sĂ©jour vĂ©cu. +C'est le scĂ©nario P-Dagujic prĂ©sentĂ© par monsieur Navoli, professeur ainsi que le 3 sur une Ă©cole alimentaire de Montpellier. +La premiĂšre application a utilisĂ© ce ralame dĂ©atec. L'enseignant va alors transfĂ©rer les diffĂ©rentes photos rĂ©alisĂ©s lors de la classe transplante. +Dans un dossier, spĂ©cifique pour que les Ă©lĂšves puissent le retrouver plus facilement. Il tĂ©lĂ©verse donc ses photos dans le dossier, dans le ventĂ©, dans la mĂ©diatĂšque de la classe. +Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser. 
+Les Ă©lĂšves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rĂ©digeant un article d'un reintĂ©. +Ils illustront ses articles Ă  l'aide des photos de que mon numĂ©rique mise Ă  n'accĂ©lier dans le ventĂ©. +Pour se faire, il pourront utiliser les diteurs avancĂ©s qui les renvĂšrent directement dans la mĂ©diatĂšque de la classe oĂč il pourront retrouver le dossier crĂ©Ă© par leurs enseignants. +Une fois leur article terminĂ©e, les Ă©lĂšves soumĂ©tront se lui-ci au professeur qui pourra soit la notĂ© pour correction ou le public. +Ensuite, il pourront lire et commenter ce de leurs camarades ou rĂ©pondre aux commentaires de la veille. +`) + }) + + it(`returns coherent wer`, async function () { + const wer = await jiwerCLI.wer() + expect(wer).to.be.below(30 / 100) + expect(wer).to.be.greaterThan(0 / 100) + }) + + it(`returns coherent cer`, async function () { + const cer = await jiwerCLI.cer() + expect(cer).to.be.below(10 / 100) + expect(cer).to.be.greaterThan(9 / 100) + }) + + it(`print alignment`, async function () { + console.log(await jiwerCLI.alignment()) + }) + + after(async function () { + await rm(transcriptDirectory, { recursive: true, force: true }) + }) +}) diff --git a/packages/tests/src/shared/fixture-urls.ts b/packages/tests/src/shared/fixture-urls.ts index 56638e13b..b4c659e76 100644 --- a/packages/tests/src/shared/fixture-urls.ts +++ b/packages/tests/src/shared/fixture-urls.ts @@ -29,5 +29,7 @@ export const FIXTURE_URLS = { chatersVideo: 'https://download.cpy.re/peertube/video_chapters.mp4', - file4K: 'https://download.cpy.re/peertube/4k_file.txt' + file4K: 'https://download.cpy.re/peertube/4k_file.txt', + + transcriptionModels: 'https://download.cpy.re/peertube/transcription-models.zip' } diff --git a/packages/tests/src/transcription/levenshtein-distance.spec.ts b/packages/tests/src/transcription/levenshtein-distance.spec.ts new file mode 100644 index 000000000..d4c502a02 --- /dev/null +++ b/packages/tests/src/transcription/levenshtein-distance.spec.ts @@ -0,0 +1,18 @@ +import { expect } from 'chai' +import { levenshteinDistance } from '@peertube/peertube-transcription' + +describe('Levenshtein distance', function () { + it(`equals 1 when there is only one character difference`, function () { + expect(levenshteinDistance('abcd', 'abce')).equals(1) + }) + + it(`may calculate a distance on a txt subtitle content `, function () { + expect(levenshteinDistance(`December, 1965. +Is that all it has been since +I inherited the world? +Only three years. +Seems like a hundred million. + +`, 'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.')).equals(13) + }) +}) diff --git a/packages/tests/src/transcription/subtitle.spec.ts b/packages/tests/src/transcription/subtitle.spec.ts new file mode 100644 index 000000000..4a0ee4d95 --- /dev/null +++ b/packages/tests/src/transcription/subtitle.spec.ts @@ -0,0 +1,33 @@ +import { srtToTxt } from '@peertube/peertube-transcription' +import { expect } from 'chai' + +describe('srt to txt', function () { + it(`Transforms the content of a srt subtitle to a pure text version`, function () { + const txt = srtToTxt(`1 +00:00:00,000 --> 00:00:01,940 +December, 1965. + +2 +00:00:03,460 --> 00:00:06,660 +Is that all it has been since +I inherited the world? + +3 +00:00:07,020 --> 00:00:08,900 +Only three years. + +4 +00:00:09,940 --> 00:00:11,760 +Seems like a hundred million. + +`) + + expect(txt).equals(`December, 1965. 
+Is that all it has been since +I inherited the world? +Only three years. +Seems like a hundred million. + +`) + }) +}) diff --git a/packages/tests/src/transcription/transcriber-factory.spec.ts b/packages/tests/src/transcription/transcriber-factory.spec.ts new file mode 100644 index 000000000..cd9d9f29f --- /dev/null +++ b/packages/tests/src/transcription/transcriber-factory.spec.ts @@ -0,0 +1,17 @@ +import { transcriberFactory } from '@peertube/peertube-transcription' + +describe('Transcriber factory', function () { + const transcribers = [ + 'openai-whisper', + 'whisper-ctranslate2', + 'whisper-timestamped' + ] + + describe('Should be able to create a transcriber for each available transcription engine', function () { + transcribers.forEach(function (transcriberName) { + it(`Should be able to create a(n) ${transcriberName} transcriber`, function () { + transcriberFactory.createFromEngineName(transcriberName) + }) + }) + }) +}) diff --git a/packages/tests/src/transcription/transcript/transcript-file-evaluator.spec.ts b/packages/tests/src/transcription/transcript/transcript-file-evaluator.spec.ts new file mode 100644 index 000000000..93df638db --- /dev/null +++ b/packages/tests/src/transcription/transcript/transcript-file-evaluator.spec.ts @@ -0,0 +1,67 @@ +/* eslint-disable @typescript-eslint/no-unused-expressions, no-new, max-len */ +import { TranscriptFile, TranscriptFileEvaluator } from '@peertube/peertube-transcription' +import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils' +import { join } from 'node:path' +import { mkdir, rm } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { expect } from 'chai' + +describe('Transcript File Evaluator', function () { + const transcriptDirectory = join(tmpdir(), 'peertube-transcription', 'transcript-file-evaluator') + const referenceTranscriptFilePath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.txt') + + before(async function () { + await mkdir(transcriptDirectory, { recursive: true }) + }) + + it(`may not compare files in another format than txt`, async function () { + const vttReference = await TranscriptFile.write({ + path: join(transcriptDirectory, 'reference.vtt'), + format: 'vtt', + content: '' + }) + const vttHypothesis = await TranscriptFile.write({ + path: join(transcriptDirectory, 'hypothesis.vtt'), + format: 'vtt', + content: '' + }) + expect(() => new TranscriptFileEvaluator(vttReference, vttHypothesis)).to.throw('Can only evaluate txt transcript file') + }) + + it(`evaluation must return coherent wer & cer`, async function () { + const reference = new TranscriptFile({ + path: referenceTranscriptFilePath, + language: 'fr', + format: 'txt' + }) + const hypothesis = await TranscriptFile.write({ + path: join(transcriptDirectory, 'openai.txt'), + content: `Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le sĂ©jour vĂ©cu. +C'est le scĂ©nario P-Dagujic prĂ©sentĂ© par monsieur Navoli, professeur ainsi que le 3 sur une Ă©cole alimentaire de Montpellier. +La premiĂšre application a utilisĂ© ce ralame dĂ©atec. L'enseignant va alors transfĂ©rer les diffĂ©rentes photos rĂ©alisĂ©s lors de la classe transplante. +Dans un dossier, spĂ©cifique pour que les Ă©lĂšves puissent le retrouver plus facilement. Il tĂ©lĂ©verse donc ses photos dans le dossier, dans le ventĂ©, dans la mĂ©diatĂšque de la classe. 
+Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
+Les Ă©lĂšves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rĂ©digeant un article d'un reintĂ©.
+Ils illustront ses articles Ă  l'aide des photos de que mon numĂ©rique mise Ă  n'accĂ©lier dans le ventĂ©.
+Pour se faire, il pourront utiliser les diteurs avancĂ©s qui les renvĂšrent directement dans la mĂ©diatĂšque de la classe oĂč il pourront retrouver le dossier crĂ©Ă© par leurs enseignants.
+Une fois leur article terminĂ©e, les Ă©lĂšves soumĂ©tront se lui-ci au professeur qui pourra soit la notĂ© pour correction ou le public.
+Ensuite, il pourront lire et commenter ce de leurs camarades ou rĂ©pondre aux commentaires de la veille.
+`,
+      format: 'txt',
+      language: 'fr'
+    })
+    const evaluator = new TranscriptFileEvaluator(reference, hypothesis)
+    const wer = await evaluator.wer()
+    expect(wer).to.be.greaterThan(0 / 100)
+    expect(wer).to.be.below(30 / 100)
+
+    const cer = await evaluator.cer()
+    expect(cer).to.be.greaterThan(9 / 100)
+    expect(cer).to.be.below(10 / 100)
+    console.log(await evaluator.alignment())
+  })
+
+  after(async function () {
+    await rm(transcriptDirectory, { recursive: true, force: true })
+  })
+})
diff --git a/packages/tests/src/transcription/transcript/transcript-file.spec.ts b/packages/tests/src/transcription/transcript/transcript-file.spec.ts
new file mode 100644
index 000000000..112c246b3
--- /dev/null
+++ b/packages/tests/src/transcription/transcript/transcript-file.spec.ts
@@ -0,0 +1,44 @@
+/* eslint-disable @typescript-eslint/no-unused-expressions */
+import { expect } from 'chai'
+import { join } from 'node:path'
+import { mkdir, rm } from 'node:fs/promises'
+import { TranscriptFile } from '@peertube/peertube-transcription'
+import { tmpdir } from 'node:os'
+import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
+
+describe('Transcript File', function () {
+  const transcriptFileDirectory = join(tmpdir(), 'peertube-transcription', 'transcript-file')
+  before(async function () {
+    await mkdir(transcriptFileDirectory, { recursive: true })
+  })
+
+  it(`may create a new transcript file from scratch`, async function () {
+    const transcript1 = await TranscriptFile.write({
+      path: join(transcriptFileDirectory, 'test1.txt'),
+      content: 'test2',
+      format: 'txt'
+    })
+    const transcript2 = await TranscriptFile.write({
+      path: join(transcriptFileDirectory, 'test2.txt'),
+      content: 'test2',
+      format: 'txt'
+    })
+
+    expect(await transcript1.equals(transcript2)).to.be.true
+  })
+
+  it(`may create a txt transcript file object from a transcript without providing the format explicitly`, function () {
+    TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.srt'), 'en')
+    TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.txt'), 'en')
+  })
+
+  it(`fails when loading a file which is obviously not a transcript`, function () {
+    expect(() => TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4'), 'en'))
+      .to.throw(`Couldn't guess transcript format from extension "mp4". Valid formats are: txt, vtt, srt.`)
+  })
+
+  after(async function () {
+    await rm(transcriptFileDirectory, { recursive: true, force: true })
+  })
+})
diff --git a/packages/tests/src/transcription/transcription-run.spec.ts b/packages/tests/src/transcription/transcription-run.spec.ts
new file mode 100644
index 000000000..d877fbded
--- /dev/null
+++ b/packages/tests/src/transcription/transcription-run.spec.ts
@@ -0,0 +1 @@
+describe('Transcription run', function () {})
diff --git a/packages/tests/src/transcription/utils.spec.ts b/packages/tests/src/transcription/utils.spec.ts
new file mode 100644
index 000000000..487b9aeda
--- /dev/null
+++ b/packages/tests/src/transcription/utils.spec.ts
@@ -0,0 +1,44 @@
+import { cp, lstat, mkdir, rm } from 'node:fs/promises'
+import { join } from 'node:path'
+import { tmpdir } from 'node:os'
+import { expect } from 'chai'
+import { downloadFile, unzip } from '@peertube/peertube-transcription'
+import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
+
+describe('downloadFile', function () {
+  const testDirectory = join(tmpdir(), 'peertube-transcription', 'utils')
+  before(async function () {
+    await mkdir(testDirectory, { recursive: true })
+  })
+
+  it(`Downloads a file and writes it to disk`, async function () {
+    const filePath = await downloadFile('https://download.cpy.re/peertube/4k_file.txt', testDirectory)
+
+    expect(await lstat(filePath).then(stats => stats.isFile())).equals(true)
+  })
+
+  after(async function () {
+    await rm(testDirectory, { recursive: true, force: true })
+  })
+})
+
+describe('unzip', function () {
+  const zipFixtureFileName = 'hello_world.zip'
+  const zipFixtureFilePath = buildAbsoluteFixturePath(`transcription/${zipFixtureFileName}`)
+  const testDirectory = join(tmpdir(), 'peertube-transcription', 'utils')
+  before(async function () {
+    await mkdir(testDirectory, { recursive: true })
+  })
+
+  it(`Extracts a zip archive to a directory`, async function () {
+    const zipFilePath = join(testDirectory, zipFixtureFileName)
+    await cp(zipFixtureFilePath, zipFilePath)
+    const unzippedDirectory = await unzip(zipFilePath)
+
+    expect(await lstat(unzippedDirectory).then(stats => stats.isDirectory())).equals(true)
+  })
+
+  after(async function () {
+    await rm(testDirectory, { recursive: true, force: true })
+  })
+})
diff --git a/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts b/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts
new file mode 100644
index 000000000..d3f3f9ca7
--- /dev/null
+++ b/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts
@@ -0,0 +1,125 @@
+/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
+import { expect, config } from 'chai'
+import { createLogger } from 'winston'
+import { join } from 'node:path'
+import { mkdir, rm } from 'node:fs/promises'
+import { tmpdir } from 'node:os'
+import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
+import {
+  downloadFile,
+  levenshteinDistance,
+  OpenaiTranscriber,
+  TranscriptFile,
+  TranscriptFileEvaluator,
+  TranscriptionModel,
+  unzip,
+  WhisperBuiltinModel
+} from '@peertube/peertube-transcription'
+import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js'
+
+config.truncateThreshold = 0
+
+describe('Open AI Whisper transcriber', function () {
+  const tmpDirectory = join(tmpdir(), 'peertube-transcription')
+  const transcriptDirectory = join(tmpDirectory, 'transcriber', 'openai')
+  const modelsDirectory = join(tmpDirectory, 'models')
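+  // assumption: the transcription-models.zip fixture downloaded in before() unzips into this directory, providing the tiny.pt used below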
+ const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4') + const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4') + const referenceTranscriptFile = new TranscriptFile({ + path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'), + language: 'fr', + format: 'txt' + }) + const transcriber = new OpenaiTranscriber( + { + name: 'openai-whisper', + requirements: [], + type: 'binary', + binary: 'whisper', + supportedModelFormats: [ 'PyTorch' ], + languageDetection: true + }, + createLogger(), + transcriptDirectory + ) + + before(async function () { + this.timeout(1 * 1000 * 60) + await mkdir(transcriptDirectory, { recursive: true }) + await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory)) + }) + + it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () { + this.timeout(3 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' }) + + expect(transcript.format).to.equals('vtt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in the `srt` format', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' }) + + expect(transcript.format).to.equals('srt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in the `txt` format', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' }) + + expect(transcript.format).to.equals('txt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + expect(levenshteinDistance( + (await transcript.read()).toString(), + 'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.' 
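+      // levenshteinDistance counts single-character edits, so below(3) tolerates at most two characters of drift from this sentence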
+ )).to.be.below(3) + }) + + it('May transcribe a media file using a local PyTorch model', async function () { + this.timeout(2 * 1000 * 60) + await transcriber.transcribe({ + mediaFilePath: shortVideoPath, + model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')), + language: 'en' + }) + }) + + it('May transcribe a media file in french', async function () { + this.timeout(3 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt' }) + + expect(transcript.format).to.equals('txt') + expect(transcript.language).to.equals('fr') + expect(await transcript.read()).not.to.be.empty + }) + + it('Guesses the video language if not provided', async function () { + this.timeout(3 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath }) + + expect(transcript.language).to.equals('fr') + }) + + it('May transcribe a media file in french with small model', async function () { + this.timeout(6 * 1000 * 60) + const transcript = await transcriber.transcribe({ + mediaFilePath: frVideoPath, + language: 'fr', + format: 'txt', + model: new WhisperBuiltinModel('small') + }) + + expect(transcript.language).to.equals('fr') + + const transcriptFileEvaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcript) + const cer = await transcriptFileEvaluator.cer() + expect(cer).to.be.below(6 / 100) + }) + + after(async function () { + await rm(transcriptDirectory, { recursive: true, force: true }) + }) +}) diff --git a/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts b/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts new file mode 100644 index 000000000..bf4adee9e --- /dev/null +++ b/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts @@ -0,0 +1,133 @@ +/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */ +import { expect, config } from 'chai' +import { createLogger } from 'winston' +import { join } from 'node:path' +import { mkdir, rm } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils' +import { + OpenaiTranscriber, + WhisperTimestampedTranscriber, + TranscriptFileEvaluator, + TranscriptionModel, + WhisperTranscribeArgs, + levenshteinDistance, downloadFile, unzip +} from '@peertube/peertube-transcription' +import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js' + +config.truncateThreshold = 0 + +describe('Linto timestamped Whisper transcriber', function () { + const tmpDirectory = join(tmpdir(), 'peertube-transcription') + const transcriptDirectory = join(tmpDirectory, 'transcriber', 'timestamped') + const modelsDirectory = join(tmpDirectory, 'models') + const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4') + const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4') + const transcriber = new WhisperTimestampedTranscriber( + { + name: 'whisper-timestamped', + requirements: [], + type: 'binary', + binary: 'whisper_timestamped', + supportedModelFormats: [ 'PyTorch' ], + languageDetection: true + }, + createLogger(), + transcriptDirectory + ) + + before(async function () { + this.timeout(1 * 1000 * 60) + await mkdir(transcriptDirectory, { recursive: true }) + await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory)) + }) + + it('Should transcribe a media file and provide a valid path to a 
transcript file in `vtt` format by default', async function () { + this.timeout(1 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' }) + + expect(transcript.format).to.equals('vtt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in the `srt` format with a ms precision', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' }) + + expect(transcript.format).to.equals('srt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in `txt` format', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' }) + + expect(transcript.format).to.equals('txt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + expect(levenshteinDistance( + (await transcript.read()).toString(), + 'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.' + )).to.be.below(10) + }) + + it('May transcribe a media file using a local PyTorch model file', async function () { + this.timeout(2 * 1000 * 60) + await transcriber.transcribe({ + mediaFilePath: shortVideoPath, + model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')), + language: 'en' + }) + }) + + it('May transcribe a media file in french', async function () { + this.timeout(2 * 1000 * 60) + const transcript = await transcriber.transcribe({ + mediaFilePath: frVideoPath, + language: 'fr', + format: 'txt' + }) + + expect(transcript.format).to.equals('txt') + expect(transcript.language).to.equals('fr') + expect(await transcript.read()).not.to.be.empty + }) + + it('Guesses the video language if not provided', async function () { + this.timeout(2 * 1000 * 60) + const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath }) + expect(transcript.language).to.equals('fr') + }) + + it('Should produce a text transcript similar to openai-whisper implementation', async function () { + this.timeout(11 * 1000 * 60) + const transcribeArgs: WhisperTranscribeArgs = { + mediaFilePath: frVideoPath, + model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')), + language: 'fr', + format: 'txt' + } + const transcript = await transcriber.transcribe(transcribeArgs) + + const openaiTranscriber = new OpenaiTranscriber( + { + name: 'openai-whisper', + requirements: [], + type: 'binary', + binary: 'whisper', + supportedModelFormats: [ 'PyTorch' ] + }, + createLogger(), + join(transcriptDirectory, 'openai-whisper') + ) + const openaiTranscript = await openaiTranscriber.transcribe(transcribeArgs) + + const transcriptFileEvaluator = new TranscriptFileEvaluator(openaiTranscript, transcript) + expect(await transcriptFileEvaluator.wer()).to.be.below(25 / 100) + expect(await transcriptFileEvaluator.cer()).to.be.below(15 / 100) + }) + + after(async function () { + await rm(transcriptDirectory, { recursive: true, force: true }) + }) +}) diff --git a/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts b/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts new file mode 100644 index 000000000..3a9984e87 --- /dev/null +++ b/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts @@ 
-0,0 +1,137 @@ +/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */ +import { expect, config } from 'chai' +import { createLogger } from 'winston' +import { join } from 'node:path' +import { mkdir, rm } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils' +import { + Ctranslate2Transcriber, downloadFile, + levenshteinDistance, + OpenaiTranscriber, + TranscriptFile, + TranscriptFileEvaluator, + TranscriptionModel, unzip, + WhisperTranscribeArgs +} from '@peertube/peertube-transcription' +import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js' + +config.truncateThreshold = 0 + +describe('Whisper CTranslate2 transcriber', function () { + const tmpDirectory = join(tmpdir(), 'peertube-transcription') + const transcriptDirectory = join(tmpDirectory, 'transcriber', 'ctranslate2') + const modelsDirectory = join(tmpDirectory, 'models') + const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4') + const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4') + const transcriber = new Ctranslate2Transcriber( + { + name: 'anyNameShouldBeFineReally', + requirements: [], + type: 'binary', + binary: 'whisper-ctranslate2', + supportedModelFormats: [], + languageDetection: true + }, + createLogger(), + transcriptDirectory + ) + + before(async function () { + this.timeout(1 * 1000 * 60) + await mkdir(transcriptDirectory, { recursive: true }) + await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory)) + }) + + it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' }) + + expect(transcript.format).to.equals('vtt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in the `srt` format', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' }) + + expect(transcript.format).to.equals('srt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + }) + + it('May produce a transcript file in the `txt` format', async function () { + const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' }) + expect(await transcript.equals(new TranscriptFile({ + path: join(transcriptDirectory, 'the_last_man_on_earth.txt'), + format: 'txt', + language: 'en' + }))).to.be.true + + expect(transcript.format).to.equals('txt') + expect(transcript.language).to.equals('en') + expect(await transcript.read()).not.to.be.empty + expect(levenshteinDistance( + (await transcript.read()).toString(), + 'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.' 
+    )).to.be.below(5)
+  })
+
+  it('May transcribe a media file using a local CTranslate2 model', async function () {
+    this.timeout(2 * 1000 * 60)
+    const transcript = await transcriber.transcribe({
+      mediaFilePath: shortVideoPath,
+      model: await TranscriptionModel.fromPath(join(modelsDirectory, 'faster-whisper-tiny')),
+      language: 'en',
+      format: 'txt'
+    })
+
+    expect(transcript.format).to.equals('txt')
+    expect(transcript.language).to.equals('en')
+    expect(await transcript.read()).not.to.be.empty
+  })
+
+  it('May transcribe a media file in french', async function () {
+    this.timeout(5 * 1000 * 60)
+    const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt' })
+
+    expect(transcript.format).to.equals('txt')
+    expect(transcript.language).to.equals('fr')
+    expect(await transcript.read()).not.to.be.empty
+  })
+
+  it('Guesses the video language if not provided', async function () {
+    this.timeout(2 * 1000 * 60)
+    const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath })
+    expect(transcript.language).to.equals('fr')
+  })
+
+  it('Should produce a text transcript similar to openai-whisper implementation', async function () {
+    this.timeout(10 * 1000 * 60)
+    const transcribeArgs: WhisperTranscribeArgs = {
+      mediaFilePath: frVideoPath,
+      language: 'fr',
+      format: 'txt'
+    }
+    const transcript = await transcriber.transcribe(transcribeArgs)
+    const openaiTranscriber = new OpenaiTranscriber(
+      {
+        name: 'openai-whisper',
+        requirements: [],
+        type: 'binary',
+        binary: 'whisper',
+        supportedModelFormats: [ 'PyTorch' ]
+      },
+      createLogger(),
+      join(transcriptDirectory, 'openai-whisper')
+    )
+    const openaiTranscript = await openaiTranscriber.transcribe(transcribeArgs)
+
+    const transcriptFileEvaluator = new TranscriptFileEvaluator(openaiTranscript, transcript)
+    expect(await transcriptFileEvaluator.wer()).to.be.below(20 / 100)
+    expect(await transcriptFileEvaluator.cer()).to.be.below(10 / 100)
+  })
+
+  after(async function () {
+    await rm(transcriptDirectory, { recursive: true, force: true })
+  })
+})
diff --git a/packages/tests/tsconfig.json b/packages/tests/tsconfig.json
index fc3490da4..5b8e2a6c4 100644
--- a/packages/tests/tsconfig.json
+++ b/packages/tests/tsconfig.json
@@ -6,16 +6,20 @@
     "tsBuildInfoFile": "./dist/.tsbuildinfo",
     "paths": {
       "@tests/*": [ "./src/*" ],
-      "@server/*": [ "../../server/core/*" ]
+      "@server/*": [ "../../server/core/*" ],
+      "@peertube/peertube-transcription": [ "../transcription" ],
+      "@peertube/peertube-jiwer": [ "../jiwer" ]
     }
   },
   "references": [
     { "path": "../core-utils" },
     { "path": "../ffmpeg" },
+    { "path": "../jiwer" },
     { "path": "../models" },
     { "path": "../node-utils" },
     { "path": "../typescript-utils" },
     { "path": "../server-commands" },
+    { "path": "../transcription" },
     { "path": "../../server/tsconfig.lib.json" }
   ],
   "include": [
diff --git a/packages/transcription/README.md b/packages/transcription/README.md
new file mode 100644
index 000000000..b16ff75d4
--- /dev/null
+++ b/packages/transcription/README.md
@@ -0,0 +1,99 @@
+# Transcription
+
+Video **transcription** consists of transcribing the audio content of a video to text.
+> This process might be called __Automatic Speech Recognition__ or __Speech to Text__ in a more general context.
+
+This package provides a common API to multiple transcription backends, currently:
+- `openai-whisper` CLI
+- `faster-whisper` (*via* the `whisper-ctranslate2` CLI)
+- `whisper-timestamped`
+
+> Potential candidates could be: whisper-cpp, vosk, ...
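+
+As a sketch of what this common API means in practice, the same `transcribe()` call works for
+every registered engine (the media file path below is illustrative; the full API is described
+in the Usage section):
+
+```typescript
+import { transcriberFactory } from '@peertube/peertube-transcription'
+
+for (const engineName of [ 'openai-whisper', 'whisper-ctranslate2', 'whisper-timestamped' ]) {
+  // Every engine yields a transcriber exposing the same transcribe() method
+  const transcriber = transcriberFactory.createFromEngineName(engineName)
+
+  const transcriptFile = await transcriber.transcribe({
+    mediaFilePath: './myVideo.mp4',
+    language: 'en',
+    format: 'vtt'
+  })
+
+  console.log(`${engineName} wrote ${transcriptFile.path}`)
+}
+```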
+## Requirements
+- Python
+- PIP
+
+And at least one of the following transcription backends:
+- Python:
+  - `openai-whisper`
+  - `whisper-ctranslate2>=0.4.3`
+  - `whisper-timestamped>=1.15.4`
+
+And to run the transcript evaluation tests:
+- Python:
+  - `jiwer>=3.0.4`
+
+## Usage
+
+Create a transcriber manually:
+```typescript
+import { OpenaiTranscriber } from '@peertube/peertube-transcription'
+
+(async () => {
+  // create a transcriber powered by the OpenAI Whisper CLI
+  const transcriber = new OpenaiTranscriber({
+    name: 'openai-whisper',
+    binary: 'whisper',
+    languageDetection: true
+  });
+
+  const transcriptFile = await transcriber.transcribe({
+    mediaFilePath: './myVideo.mp4',
+    format: 'txt'
+  });
+
+  console.log(transcriptFile.path);
+  console.log(await transcriptFile.read());
+})();
+```
+
+Using a local model file:
+
+```typescript
+import { TranscriptionModel } from '@peertube/peertube-transcription'
+
+const transcriptFile = await transcriber.transcribe({
+  mediaFilePath: './myVideo.mp4',
+  model: await TranscriptionModel.fromPath('./models/large.pt'),
+  format: 'txt'
+});
+```
+
+You may use the builtin factory if you're happy with the default configuration:
+```typescript
+import { transcriberFactory } from '@peertube/peertube-transcription'
+
+transcriberFactory.createFromEngineName('openai-whisper')
+```
+> For further usage, see [../tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts](../tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts)
+
+## Benchmark
+
+A benchmark of the available __transcribers__ may be run with:
+```sh
+npm run benchmark
+```
+```
+┌────────────────────────┬───────────────────────┬───────────────────────┬──────────┬────────┬───────────────────────┐
+│ (index)                │ WER                   │ CER                   │ duration │ model  │ engine                │
+├────────────────────────┌───────────────────────┌───────────────────────┌──────────┌────────┌────────────────────────
+│ 5yZGBYqojXe7nuhq1TuHvz │ '28.39506172839506%'  │ '9.62457337883959%'   │ '41s'    │ 'tiny' │ 'openai-whisper'      │
+│ x6qREJ2AkTU4e5YmvfivQN │ '29.75206611570248%'  │ '10.46195652173913%'  │ '15s'    │ 'tiny' │ 'whisper-ctranslate2' │
+│ qbt6BekKMVzxq4KCSLCzt3 │ '31.020408163265305%' │ '10.784982935153584%' │ '20s'    │ 'tiny' │ 'whisper-timestamped' │
+└────────────────────────┮───────────────────────┮───────────────────────┮──────────┮────────┮───────────────────────┘
+```
+
+The benchmark may be run with multiple builtin model sizes:
+```sh
+MODELS=tiny,small,large npm run benchmark
+```
+
+## Lexicon
+- ONNX: Open Neural Network eXchange. A specification; the ONNX Runtime runs these models.
+- GPTs: Generative Pre-Trained Transformers +- LLM: Large Language Models +- NLP: Natural Language Processing +- MLP: Multilayer Perceptron +- ASR: Automatic Speech Recognition +- WER: Word Error Rate +- CER: Character Error Rate diff --git a/packages/transcription/package.json b/packages/transcription/package.json new file mode 100644 index 000000000..9138de4dc --- /dev/null +++ b/packages/transcription/package.json @@ -0,0 +1,21 @@ +{ + "name": "@peertube/peertube-transcription", + "private": true, + "version": "0.0.0", + "main": "dist/index.js", + "files": [ "dist" ], + "exports": { + "types": "./dist/index.d.ts", + "peertube:tsx": "./src/index.ts", + "default": "./dist/index.js" + }, + "type": "module", + "devDependencies": {}, + "scripts": { + "preinstall": "pip install -r requirements.txt", + "build": "tsc", + "watch": "tsc -w", + "benchmark": "tsx --conditions=peertube:tsx --tsconfig ./tsconfig.json ./src/benchmark.ts" + }, + "dependencies": {} +} diff --git a/packages/transcription/requirements.txt b/packages/transcription/requirements.txt new file mode 100644 index 000000000..bc457025c --- /dev/null +++ b/packages/transcription/requirements.txt @@ -0,0 +1,3 @@ +openai-whisper==20231117 +whisper-ctranslate2==0.4.4 +whisper-timestamped==1.15.4 diff --git a/packages/transcription/src/abstract-transcriber.ts b/packages/transcription/src/abstract-transcriber.ts new file mode 100644 index 000000000..b9a0f66e5 --- /dev/null +++ b/packages/transcription/src/abstract-transcriber.ts @@ -0,0 +1,69 @@ +import { createLogger, Logger } from 'winston' +import { join } from 'node:path' +import { PerformanceObserver } from 'node:perf_hooks' +import { buildSUUID, SUUID, root } from '@peertube/peertube-node-utils' +import { TranscriptionEngine } from './transcription-engine.js' +import { TranscriptionModel } from './transcription-model.js' +import { TranscriptionRun } from './transcription-run.js' +import { TranscriptFile, TranscriptFormat } from './transcript/index.js' + +export interface TranscribeArgs { + mediaFilePath: string + model: TranscriptionModel + language?: string + format?: TranscriptFormat + runId?: SUUID +} + +export abstract class AbstractTranscriber { + public static DEFAULT_TRANSCRIPT_DIRECTORY = join(root(), 'dist', 'transcripts') + + engine: TranscriptionEngine + logger: Logger + transcriptDirectory: string + performanceObserver?: PerformanceObserver + run?: TranscriptionRun + + constructor ( + engine: TranscriptionEngine, + logger: Logger = createLogger(), + transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY, + performanceObserver?: PerformanceObserver + ) { + this.engine = engine + this.logger = logger + this.transcriptDirectory = transcriptDirectory + this.performanceObserver = performanceObserver + } + + createRun (uuid: SUUID = buildSUUID()) { + this.run = new TranscriptionRun(this.logger, uuid) + } + + startRun () { + this.run.start() + } + + stopRun () { + this.run.stop() + delete this.run + } + + assertLanguageDetectionAvailable (language?: string) { + if (!this.engine.languageDetection && !language) { + throw new Error(`Language detection isn't available in ${this.engine.name}. 
A language must be provided explicitly.`)
+    }
+  }
+
+  supports (model: TranscriptionModel) {
+    return model.format === 'PyTorch'
+  }
+
+  abstract transcribe ({
+    mediaFilePath,
+    model,
+    language,
+    format = 'vtt',
+    runId = buildSUUID()
+  }: TranscribeArgs): Promise<TranscriptFile>
+}
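For readers skimming the diff: a concrete backend only has to implement `transcribe()` and can lean on the run helpers above. A minimal sketch, assuming an in-package file next to `abstract-transcriber.ts`; the `NoopTranscriber` name and its engine are hypothetical and not part of this PR:

```typescript
import { join } from 'node:path'
import { AbstractTranscriber, TranscribeArgs } from './abstract-transcriber.js'
import { TranscriptFile } from './transcript/index.js'

// Hypothetical transcriber used only to illustrate the contract:
// bracket the actual work with createRun()/startRun()/stopRun() so a
// PerformanceObserver (as in benchmark.ts below) can measure it.
class NoopTranscriber extends AbstractTranscriber {
  async transcribe ({ mediaFilePath, language, format = 'txt', runId }: TranscribeArgs): Promise<TranscriptFile> {
    this.assertLanguageDetectionAvailable(language)

    this.createRun(runId)
    this.startRun()
    // ... invoke this.engine.binary against mediaFilePath here ...
    this.stopRun()

    // TranscriptFile's constructor stat()s the path, so the file must exist by now
    return new TranscriptFile({
      path: join(this.transcriptDirectory, `out.${format}`),
      language: language ?? 'en',
      format
    })
  }
}
```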
diff --git a/packages/transcription/src/benchmark.ts b/packages/transcription/src/benchmark.ts
new file mode 100644
index 000000000..4cac16449
--- /dev/null
+++ b/packages/transcription/src/benchmark.ts
@@ -0,0 +1,139 @@
+import { createLogger, transports, format } from 'winston'
+import { join } from 'node:path'
+import { performance, PerformanceObserver } from 'node:perf_hooks'
+import { tmpdir } from 'node:os'
+import { rm, mkdir } from 'node:fs/promises'
+import { buildAbsoluteFixturePath, buildSUUID, SUUID } from '@peertube/peertube-node-utils'
+import {
+  transcriberFactory,
+  TranscriptFile,
+  TranscriptFileEvaluator,
+  TranscriptionEngine,
+  TranscriptionModel
+} from '@peertube/peertube-transcription'
+import { millisecondsToTime } from '@peertube/peertube-core-utils'
+
+interface BenchmarkResult {
+  uuid: SUUID
+  WER?: number
+  CER?: number
+  duration?: number
+  engine?: TranscriptionEngine
+  model?: string
+}
+
+type Benchmark = Record<string, BenchmarkResult>
+
+const benchmarkReducer = (benchmark: Benchmark = {}, benchmarkResult: BenchmarkResult) => ({
+  ...benchmark,
+  [benchmarkResult.uuid]: {
+    ...benchmark[benchmarkResult.uuid],
+    ...benchmarkResult
+  }
+})
+
+const groupBenchmarkResultsByModel = (benchmarkResults: Record<string, BenchmarkResult>) => (benchmarksGroupedByModel, uuid) => ({
+  ...benchmarksGroupedByModel,
+  [benchmarkResults[uuid].model]: {
+    ...benchmarksGroupedByModel[benchmarkResults[uuid].model],
+    [uuid]: formatBenchmarkResult(benchmarkResults[uuid])
+  }
+})
+
+interface FormattedBenchmarkResult {
+  WER?: string
+  CER?: string
+  duration?: string
+  model?: string
+  engine?: string
+}
+
+const formatBenchmarkResult = ({ WER, CER, duration, engine, model }: Partial<BenchmarkResult>): FormattedBenchmarkResult => ({
+  WER: WER ? `${WER * 100}%` : undefined,
+  CER: CER ? `${CER * 100}%` : undefined,
+  duration: duration ? millisecondsToTime(duration) : undefined,
+  model,
+  engine: engine.name
+})
+
+void (async () => {
+  const logger = createLogger()
+  logger.add(new transports.Console({ format: format.printf(log => log.message) }))
+  const transcribers = [
+    'openai-whisper',
+    'whisper-ctranslate2',
+    'whisper-timestamped'
+  ]
+  const models = process.env.MODELS
+    ? process.env.MODELS.trim().split(',').map(modelName => modelName.trim()).filter(modelName => modelName)
+    : [ 'tiny' ]
+
+  const transcriptDirectory = join(tmpdir(), 'peertube-transcription', 'benchmark')
+  const mediaFilePath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
+  const referenceTranscriptFile = new TranscriptFile({
+    path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'),
+    language: 'fr',
+    format: 'txt'
+  })
+
+  let benchmarkResults: Record<string, BenchmarkResult> = {}
+
+  // before
+  await mkdir(transcriptDirectory, { recursive: true })
+  const performanceObserver = new PerformanceObserver((items) => {
+    items
+      .getEntries()
+      .forEach((entry) => {
+        benchmarkResults = benchmarkReducer(benchmarkResults, {
+          uuid: entry.name as SUUID,
+          duration: entry.duration
+        })
+      })
+  })
+  performanceObserver.observe({ type: 'measure' })
+
+  // benchmark
+  logger.info(`Running transcribers benchmark with the following models: ${models.join(', ')}`)
+  for (const transcriberName of transcribers) {
+    logger.info(`Create "${transcriberName}" transcriber for the benchmark...`)
+
+    const transcriber = transcriberFactory.createFromEngineName(
+      transcriberName,
+      createLogger(),
+      transcriptDirectory
+    )
+
+    for (const modelName of models) {
+      logger.info(`Run benchmark with "${modelName}" model:`)
+      const model = new TranscriptionModel(modelName)
+      const uuid = buildSUUID()
+      const transcriptFile = await transcriber.transcribe({
+        mediaFilePath,
+        model,
+        language: 'fr',
+        format: 'txt',
+        runId: uuid
+      })
+      const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile)
+      await new Promise(resolve => setTimeout(resolve, 1))
+
+      benchmarkResults = benchmarkReducer(benchmarkResults, {
+        uuid,
+        engine: transcriber.engine,
+        WER: await evaluator.wer(),
+        CER: await evaluator.cer(),
+        model: model.name
+      })
+    }
+  }
+
+  // display
+  const benchmarkResultsGroupedByModel = Object
+    .keys(benchmarkResults)
+    .reduce(groupBenchmarkResultsByModel(benchmarkResults), {})
+  Object.values(benchmarkResultsGroupedByModel).forEach(benchmark => console.table(benchmark))
+
+  // after
+  await rm(transcriptDirectory, { recursive: true, force: true })
+  performance.clearMarks()
+})()
diff --git a/packages/transcription/src/index.ts b/packages/transcription/src/index.ts
new file mode 100644
index 000000000..44eaa0799
--- /dev/null
+++ b/packages/transcription/src/index.ts
@@ -0,0 +1,13 @@
+import { TranscriberFactory } from './transcriber-factory.js'
+import { engines } from './whisper/index.js'
+
+export * from './transcript/index.js'
+export * from './levenshtein.js'
+export * from './subtitle.js'
+export * from './transcription-engine.js'
+export * from './transcription-model.js'
+export * from './transcription-run.js'
+export * from './utils.js'
+export * from './whisper/index.js'
+
+export const transcriberFactory = new TranscriberFactory(engines)
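The entry point above wires the whisper engine definitions into a ready-made factory. A quick sketch of what consumers get, based only on the exports shown here and on `transcriber-factory.ts` below:

```typescript
import { transcriberFactory, engines } from '@peertube/peertube-transcription'

// The engine catalogue is plain data; the factory only implements some of them
console.log(engines.map(({ name }) => name))
// [ 'whisper-cpp', 'openai-whisper', 'whisper-ctranslate2', 'whisper-timestamped' ]

// whisper-cpp is declared but has no transcriber yet, so the factory
// would throw 'Unimplemented engine' for it
const transcriber = transcriberFactory.createFromEngineName('whisper-ctranslate2')
```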
diff --git a/packages/transcription/src/levenshtein.ts b/packages/transcription/src/levenshtein.ts
new file mode 100644
index 000000000..364cae061
--- /dev/null
+++ b/packages/transcription/src/levenshtein.ts
@@ -0,0 +1,101 @@
+function min (d0: number, d1: number, d2: number, bx: number, ay: number) {
+  return d0 < d1 || d2 < d1
+    ? d0 > d2
+      ? d2 + 1
+      : d0 + 1
+    : bx === ay
+      ? d1
+      : d1 + 1
+}
+
+/**
+ * @see https://github.com/gustf/js-levenshtein
+ */
+export function levenshteinDistance (a: string, b: string): number {
+  if (a === b) {
+    return 0
+  }
+
+  if (a.length > b.length) {
+    const tmp = a
+    a = b
+    b = tmp
+  }
+
+  let la = a.length
+  let lb = b.length
+
+  while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) {
+    la--
+    lb--
+  }
+
+  let offset = 0
+
+  while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) {
+    offset++
+  }
+
+  la -= offset
+  lb -= offset
+
+  if (la === 0 || lb < 3) {
+    return lb
+  }
+
+  let x = 0
+  let y: number
+  let d0: number
+  let d1: number
+  let d2: number
+  let d3: number
+  let dd: number
+  let dy: number
+  let ay: number
+  let bx0: number
+  let bx1: number
+  let bx2: number
+  let bx3: number
+
+  const vector: number[] = []
+
+  for (y = 0; y < la; y++) {
+    vector.push(y + 1)
+    vector.push(a.charCodeAt(offset + y))
+  }
+
+  const len = vector.length - 1
+
+  for (; x < lb - 3;) {
+    bx0 = b.charCodeAt(offset + (d0 = x))
+    bx1 = b.charCodeAt(offset + (d1 = x + 1))
+    bx2 = b.charCodeAt(offset + (d2 = x + 2))
+    bx3 = b.charCodeAt(offset + (d3 = x + 3))
+    dd = (x += 4)
+    for (y = 0; y < len; y += 2) {
+      dy = vector[y]
+      ay = vector[y + 1]
+      d0 = min(dy, d0, d1, bx0, ay)
+      d1 = min(d0, d1, d2, bx1, ay)
+      d2 = min(d1, d2, d3, bx2, ay)
+      dd = min(d2, d3, dd, bx3, ay)
+      vector[y] = dd
+      d3 = d2
+      d2 = d1
+      d1 = d0
+      d0 = dy
+    }
+  }
+
+  for (; x < lb;) {
+    bx0 = b.charCodeAt(offset + (d0 = x))
+    dd = ++x
+    for (y = 0; y < len; y += 2) {
+      dy = vector[y]
+      vector[y] = dd = min(dy, d0, dd, bx0, vector[y + 1])
+      d0 = dy
+    }
+  }
+
+  return dd
+}
diff --git a/packages/transcription/src/subtitle.ts b/packages/transcription/src/subtitle.ts
new file mode 100644
index 000000000..94b080ec3
--- /dev/null
+++ b/packages/transcription/src/subtitle.ts
@@ -0,0 +1 @@
+export const srtToTxt = (srtContent: string) => srtContent.replace(/^\n*\d+\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/gm, '')
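Both helpers above are pure functions, so their behaviour is easy to pin down with a tiny example (values worked out by hand):

```typescript
import { levenshteinDistance, srtToTxt } from '@peertube/peertube-transcription'

// 'kitten' -> 'sitting' takes 3 single-character edits
console.log(levenshteinDistance('kitten', 'sitting')) // 3

// srtToTxt strips SRT cue numbers and timestamps, keeping only the text
const srt = '1\n00:00:00,000 --> 00:00:02,000\nHello world\n'
console.log(srtToTxt(srt)) // 'Hello world\n'
```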
diff --git a/packages/transcription/src/transcriber-factory.ts b/packages/transcription/src/transcriber-factory.ts
new file mode 100644
index 000000000..cb22d617c
--- /dev/null
+++ b/packages/transcription/src/transcriber-factory.ts
@@ -0,0 +1,49 @@
+import { Logger, createLogger } from 'winston'
+import { TranscriptionEngine } from './transcription-engine.js'
+import {
+  Ctranslate2Transcriber,
+  OpenaiTranscriber, WhisperTimestampedTranscriber
+} from './whisper/index.js'
+import { AbstractTranscriber } from './abstract-transcriber.js'
+
+export class TranscriberFactory {
+  engines: TranscriptionEngine[]
+
+  constructor (engines: TranscriptionEngine[]) {
+    this.engines = engines
+  }
+
+  createFromEngineName (
+    engineName: string,
+    logger: Logger = createLogger(),
+    transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY
+  ) {
+    const engine = this.getEngineByName(engineName)
+
+    const transcriberArgs: ConstructorParameters<typeof AbstractTranscriber> = [
+      engine,
+      logger,
+      transcriptDirectory
+    ]
+
+    switch (engineName) {
+      case 'openai-whisper':
+        return new OpenaiTranscriber(...transcriberArgs)
+      case 'whisper-ctranslate2':
+        return new Ctranslate2Transcriber(...transcriberArgs)
+      case 'whisper-timestamped':
+        return new WhisperTimestampedTranscriber(...transcriberArgs)
+      default:
+        throw new Error(`Unimplemented engine ${engineName}`)
+    }
+  }
+
+  getEngineByName (engineName: string) {
+    const engine = this.engines.find(({ name }) => name === engineName)
+    if (!engine) {
+      throw new Error(`Unknown engine ${engineName}`)
+    }
+
+    return engine
+  }
+}
diff --git a/packages/transcription/src/transcript/index.ts b/packages/transcription/src/transcript/index.ts
new file mode 100644
index 000000000..bd76d1a86
--- /dev/null
+++ b/packages/transcription/src/transcript/index.ts
@@ -0,0 +1,3 @@
+export * from './transcript-file.js'
+export * from './transcript-file-evaluator.js'
+export * from './transcript-file-interface.js'
diff --git a/packages/transcription/src/transcript/transcript-file-evaluator-interface.ts b/packages/transcription/src/transcript/transcript-file-evaluator-interface.ts
new file mode 100644
index 000000000..cf51b6551
--- /dev/null
+++ b/packages/transcription/src/transcript/transcript-file-evaluator-interface.ts
@@ -0,0 +1,12 @@
+export interface TranscriptFileEvaluation {
+  wer: number
+  cer: number
+  alignment: string
+}
+
+export interface TranscriptFileEvaluatorInterface {
+  wer(): Promise<number>
+  cer(): Promise<number>
+  alignment(): Promise<string>
+  evaluate(): Promise<TranscriptFileEvaluation>
+}
diff --git a/packages/transcription/src/transcript/transcript-file-evaluator.ts b/packages/transcription/src/transcript/transcript-file-evaluator.ts
new file mode 100644
index 000000000..7ae411ee4
--- /dev/null
+++ b/packages/transcription/src/transcript/transcript-file-evaluator.ts
@@ -0,0 +1,46 @@
+import assert from 'node:assert'
+import { JiwerClI } from '@peertube/peertube-jiwer'
+import { TranscriptFileEvaluatorInterface } from './transcript-file-evaluator-interface.js'
+import { TranscriptFileInterface } from './index.js'
+
+export class TranscriptFileEvaluator implements TranscriptFileEvaluatorInterface {
+  referenceTranscriptFile: TranscriptFileInterface
+  hypothesisTranscriptFile: TranscriptFileInterface
+  jiwerCLI: JiwerClI
+
+  constructor (referenceTranscriptFile: TranscriptFileInterface, hypothesisTranscriptFile: TranscriptFileInterface) {
+    assert(referenceTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')
+    assert(hypothesisTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')
+
+    this.referenceTranscriptFile = referenceTranscriptFile
+    this.hypothesisTranscriptFile = hypothesisTranscriptFile
+
+    this.jiwerCLI = new JiwerClI(this.referenceTranscriptFile.path, this.hypothesisTranscriptFile.path)
+  }
+
+  /**
+   * WER: Word Error Rate
+   */
+  wer () {
+    return this.jiwerCLI.wer()
+  }
+
+  /**
+   * CER: Character Error Rate
+   */
+  cer () {
+    return this.jiwerCLI.cer()
+  }
+
+  alignment () {
+    return this.jiwerCLI.alignment()
+  }
+
+  async evaluate () {
+    return {
+      wer: await this.wer(),
+      cer: await this.cer(),
+      alignment: await this.alignment()
+    }
+  }
+}
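A minimal sketch of evaluating one transcript against a reference, assuming `./reference.txt` and `./hypothesis.txt` exist; since JiWER is shelled out to underneath, the Python requirements must be installed:

```typescript
import { TranscriptFile, TranscriptFileEvaluator } from '@peertube/peertube-transcription'

// Both sides must be plain text: the evaluator constructor asserts format === 'txt'
const reference = TranscriptFile.fromPath('./reference.txt', 'fr')
const hypothesis = TranscriptFile.fromPath('./hypothesis.txt', 'fr')

const { wer, cer, alignment } = await new TranscriptFileEvaluator(reference, hypothesis).evaluate()

console.log(`WER: ${wer * 100}%, CER: ${cer * 100}%`)
console.log(alignment)
```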
diff --git a/packages/transcription/src/transcript/transcript-file-interface.ts b/packages/transcription/src/transcript/transcript-file-interface.ts
new file mode 100644
index 000000000..d30b6913f
--- /dev/null
+++ b/packages/transcription/src/transcript/transcript-file-interface.ts
@@ -0,0 +1,3 @@
+export type TranscriptFormat = 'txt' | 'vtt' | 'srt' | 'json'
+
+export type TranscriptFileInterface = { path: string, language?: string, format: TranscriptFormat }
diff --git a/packages/transcription/src/transcript/transcript-file.ts b/packages/transcription/src/transcript/transcript-file.ts
new file mode 100644
index 000000000..36af9fa8f
--- /dev/null
+++ b/packages/transcription/src/transcript/transcript-file.ts
@@ -0,0 +1,88 @@
+import { statSync } from 'node:fs'
+import { readFile, writeFile } from 'node:fs/promises'
+import { extname } from 'node:path'
+import assert from 'node:assert'
+import { TranscriptFileInterface, TranscriptFormat } from './transcript-file-interface.js'
+import { TranscriptFileEvaluator } from './transcript-file-evaluator.js'
+import { srtToTxt } from '../subtitle.js'
+import { levenshteinDistance } from '../levenshtein.js'
+
+export class TranscriptFile implements TranscriptFileInterface {
+  path: string
+  language: string
+  format: TranscriptFormat = 'vtt'
+
+  constructor ({ path, language, format = 'vtt' }: { path: string, language: string, format?: TranscriptFormat }) {
+    statSync(path)
+
+    this.path = path
+    this.language = language
+    this.format = format
+  }
+
+  /**
+   * Asynchronously reads the entire contents of a transcript file.
+   * @see https://nodejs.org/docs/latest-v18.x/api/fs.html#filehandlereadfileoptions for options
+   */
+  async read (options: Parameters<typeof readFile>[1] = 'utf8') {
+    return await readFile(this.path, options)
+  }
+
+  static fromPath (path: string, language = 'en') {
+    const format = extname(path).substring(1)
+
+    const guessableFormats = [ 'txt', 'vtt', 'srt' ]
+    assert(
+      guessableFormats.includes(format),
+      `Couldn't guess transcript format from extension "${format}". Valid formats are: ${guessableFormats.join(', ')}.`)
+
+    return new TranscriptFile({ path, language, format: format as TranscriptFormat })
+  }
+
+  /**
+   * Write a transcript file to disk.
+   */
+  static async write ({
+    path,
+    content,
+    language = 'en',
+    format = 'vtt'
+  }: { path: string, content: string, language?: string, format?: TranscriptFormat }): Promise<TranscriptFile> {
+    await writeFile(path, content)
+
+    return new TranscriptFile({ path, language, format })
+  }
+
+  async equals (transcript: TranscriptFile, caseSensitive: boolean = true) {
+    if (this.language !== transcript.language) {
+      return false
+    }
+
+    const content = await this.read()
+    const transcriptContent = await transcript.read()
+
+    if (!caseSensitive) {
+      return String(content).toLowerCase() === String(transcriptContent).toLowerCase()
+    }
+
+    return content === transcriptContent
+  }
+
+  cer (transcript: TranscriptFile) {
+    return (new TranscriptFileEvaluator(this, transcript)).cer()
+  }
+
+  async evaluate (transcript: TranscriptFile) {
+    const evaluator = new TranscriptFileEvaluator(this, transcript)
+
+    return evaluator.evaluate()
+  }
+
+  async readAsTxt () {
+    return srtToTxt(String(await this.read()))
+  }
+
+  async distance (transcript: TranscriptFile) {
+    return levenshteinDistance(await this.readAsTxt(), await transcript.readAsTxt())
+  }
+}
diff --git a/packages/transcription/src/transcription-engine.ts b/packages/transcription/src/transcription-engine.ts
new file mode 100644
index 000000000..3174e3980
--- /dev/null
+++ b/packages/transcription/src/transcription-engine.ts
@@ -0,0 +1,23 @@
+import { ModelFormat } from './transcription-model.js'
+
+/**
+ * The engine, or framework.
+ */
+export class TranscriptionEngine {
+  name: string
+  description?: string
+  language?: string
+  requirements: string[]
+  type: 'binary' | 'bindings' | 'ws'
+  binary: string
+  license?: string
+  forgeURL?: string
+  supportedModelFormats: ModelFormat[]
+  languageDetection?: true
+  // There could be a default model.
+ // There could be a list of default models + + constructor (parameters: TranscriptionEngine) { + Object.assign(this, parameters) + } +} diff --git a/packages/transcription/src/transcription-model.ts b/packages/transcription/src/transcription-model.ts new file mode 100644 index 000000000..01f3bdd4a --- /dev/null +++ b/packages/transcription/src/transcription-model.ts @@ -0,0 +1,34 @@ +import assert from 'node:assert' +import { stat } from 'node:fs/promises' +import { parse } from 'node:path' + +export type ModelFormat = 'PyTorch' | 'GGML' | 'ONNX' | 'CTranslate2' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark + +export class TranscriptionModel { + name: string + format?: ModelFormat + path?: string + + // # - hparams + // # - Number of dimensions (int) + // # - Name length (int) + // # - Dimensions (int[n_dims]) + // # - Name (char[name_length]) + // # - Data (float[n_dims]) + + // # - mel filters + // # - tokenizer vocab + // # - model variables + + constructor (name: string, path?: string, format?: ModelFormat) { + this.name = name + this.path = path + this.format = format + } + + static async fromPath (path: string) { + assert(await stat(path), `${path} doesn't exist.`) + + return new TranscriptionModel(parse(path).name, path) + } +} diff --git a/packages/transcription/src/transcription-run.ts b/packages/transcription/src/transcription-run.ts new file mode 100644 index 000000000..608afbb6a --- /dev/null +++ b/packages/transcription/src/transcription-run.ts @@ -0,0 +1,41 @@ +import { buildSUUID, SUUID } from '@peertube/peertube-node-utils' +import { createLogger, Logger } from 'winston' + +export class TranscriptionRun { + uuid: SUUID + logger: Logger + + constructor (logger = createLogger(), uuid: SUUID = buildSUUID()) { + this.uuid = uuid + this.logger = logger + } + + get runId () { + return this.uuid + } + + start () { + performance.mark(this.getStartPerformanceMarkName()) + } + + stop () { + try { + performance.mark(this.getEndPerformanceMarkName()) + performance.measure( + this.runId, + this.getStartPerformanceMarkName(), + this.getEndPerformanceMarkName() + ) + } catch (e) { + this.logger.log({ level: 'error', message: e }) + } + } + + getStartPerformanceMarkName () { + return `${this.runId}-started` + } + + getEndPerformanceMarkName () { + return `${this.runId}-ended` + } +} diff --git a/packages/transcription/src/utils.ts b/packages/transcription/src/utils.ts new file mode 100644 index 000000000..735f11f13 --- /dev/null +++ b/packages/transcription/src/utils.ts @@ -0,0 +1,32 @@ +import { join, parse } from 'node:path' +import { createWriteStream } from 'node:fs' +import { lstat, unlink } from 'node:fs/promises' +import assert from 'node:assert' +import { $ } from 'execa' +import { makeFileRequest } from '@peertube/peertube-server-commands' + +export const downloadFile = async (url: string, targetDirectory: string) => { + const { base } = parse(url) + const filePath = join(targetDirectory, base) + + const fileStream = createWriteStream(filePath) + const stream = makeFileRequest(url).pipe(fileStream) + + return await new Promise((resolve: (filePath: string) => void, reject) => { + stream.on('finish', () => resolve(filePath)) + stream.on('error', async e => { + fileStream.close() + await unlink(filePath) + reject(e.message) + }) + }) +} + +export const unzip = async (zipFilePath: string) => { + assert(await lstat(zipFilePath).then(stats => stats.isFile()), `${zipFilePath} isn't a file.`) + const { dir, name } = parse(zipFilePath) + + await $`unzip -o ${zipFilePath} 
-d ${dir}`
+
+  return join(dir, name)
+}
diff --git a/packages/transcription/src/whisper/README.md b/packages/transcription/src/whisper/README.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/packages/transcription/src/whisper/engines.ts b/packages/transcription/src/whisper/engines.ts
new file mode 100644
index 000000000..711b098b7
--- /dev/null
+++ b/packages/transcription/src/whisper/engines.ts
@@ -0,0 +1,51 @@
+import { TranscriptionEngine } from '../transcription-engine.js'
+
+export const engines: TranscriptionEngine[] = [
+  {
+    name: 'whisper-cpp',
+    description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
+    type: 'binary',
+    binary: 'main',
+    language: 'cpp',
+    requirements: [],
+    forgeURL: 'https://github.com/ggerganov/whisper.cpp',
+    license: 'MIT',
+    supportedModelFormats: [ 'ONNX' ]
+  },
+  {
+    name: 'openai-whisper',
+    description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
+    requirements: [ 'python', 'pyTorch', 'ffmpeg' ],
+    language: 'python',
+    type: 'binary',
+    binary: 'whisper',
+    forgeURL: 'https://github.com/openai/whisper',
+    license: 'MIT',
+    supportedModelFormats: [ 'PyTorch' ],
+    languageDetection: true
+  },
+  {
+    name: 'whisper-ctranslate2',
+    description: '',
+    requirements: [ 'python' ],
+    language: 'python',
+    type: 'binary',
+    binary: 'whisper-ctranslate2',
+    forgeURL: 'https://github.com/openai/whisper',
+    license: 'MIT',
+    supportedModelFormats: [ 'CTranslate2' ],
+    languageDetection: true
+  },
+  {
+    name: 'whisper-timestamped',
+    description: '',
+    requirements: [ 'python' ],
+    language: 'python',
+    type: 'binary',
+    binary: 'whisper_timestamped',
+    forgeURL: 'https://github.com/openai/whisper',
+    license: 'MIT',
+    supportedModelFormats: [ 'CTranslate2' ],
+    languageDetection: true
+  }
+]
diff --git a/packages/transcription/src/whisper/index.ts b/packages/transcription/src/whisper/index.ts
new file mode 100644
index 000000000..ee9cae725
--- /dev/null
+++ b/packages/transcription/src/whisper/index.ts
@@ -0,0 +1,3 @@
+export * from './transcriber/index.js'
+export * from './engines.js'
+export * from './whisper-builtin-model.js'
diff --git a/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts
new file mode 100644
index 000000000..01b9739a3
--- /dev/null
+++ b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts
@@ -0,0 +1,49 @@
+import { $ } from 'execa'
+import { buildSUUID } from '@peertube/peertube-node-utils'
+import { lstat } from 'node:fs/promises'
+import { OpenaiTranscriber, WhisperTranscribeArgs } from './openai-transcriber.js'
+import { TranscriptFile } from '../../transcript/index.js'
+import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
+import assert from 'node:assert'
+
+export class Ctranslate2Transcriber extends OpenaiTranscriber {
+  async transcribe ({
+    mediaFilePath,
+    model = new WhisperBuiltinModel('tiny'),
+    language,
+    format = 'vtt',
+    runId = buildSUUID()
+  }: WhisperTranscribeArgs): Promise<TranscriptFile> {
+    this.assertLanguageDetectionAvailable(language)
+
+    const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })
+
+    if (model.path) {
+      assert(await lstat(model.path).then(stats => stats.isDirectory()), 'Model path must be a path to a directory.')
+    }
+
+    const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ]
+    const languageArgs = language ? [ '--language', language ] : []
+
+    this.createRun(runId)
+    this.startRun()
+    await $$`${this.engine.binary} ${[
+      mediaFilePath,
+      ...modelArgs,
+      '--word_timestamps',
+      'True',
+      '--output_format',
+      'all',
+      '--output_dir',
+      this.transcriptDirectory,
+      ...languageArgs
+    ]}`
+    this.stopRun()
+
+    return new TranscriptFile({
+      language: language || await this.getDetectedLanguage(mediaFilePath),
+      path: this.getTranscriptFilePath(mediaFilePath, format),
+      format
+    })
+  }
+}
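Worth noting in the diff above: when a model path is given, whisper-ctranslate2 expects a CTranslate2 model directory rather than a PyTorch `.pt` file, hence the `isDirectory()` assertion and the `--model_directory` flag. A usage sketch; the `./models/whisper-tiny-ct2` path is hypothetical, e.g. a directory produced by a converter such as `ct2-transformers-converter`:

```typescript
import { transcriberFactory, TranscriptionModel } from '@peertube/peertube-transcription'

const transcriber = transcriberFactory.createFromEngineName('whisper-ctranslate2')

// Hypothetical directory holding a converted CTranslate2 model
const model = await TranscriptionModel.fromPath('./models/whisper-tiny-ct2')

const transcriptFile = await transcriber.transcribe({
  mediaFilePath: './myVideo.mp4',
  model,
  language: 'fr',
  format: 'vtt'
})
```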
diff --git a/packages/transcription/src/whisper/transcriber/index.ts b/packages/transcription/src/whisper/transcriber/index.ts
new file mode 100644
index 000000000..950c39b07
--- /dev/null
+++ b/packages/transcription/src/whisper/transcriber/index.ts
@@ -0,0 +1,3 @@
+export * from './ctranslate2-transcriber.js'
+export * from './openai-transcriber.js'
+export * from './timestamped-transcriber.js'
diff --git a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts
new file mode 100644
index 000000000..5d9a7ce85
--- /dev/null
+++ b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts
@@ -0,0 +1,62 @@
+import { join } from 'path'
+import { $ } from 'execa'
+import { buildSUUID } from '@peertube/peertube-node-utils'
+import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
+import { AbstractTranscriber, TranscribeArgs } from '../../abstract-transcriber.js'
+import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
+import { TranscriptionModel } from '../../transcription-model.js'
+import { readFile } from 'node:fs/promises'
+import { parse } from 'node:path'
+
+export type WhisperTranscribeArgs = Omit<TranscribeArgs, 'model'> & { model?: TranscriptionModel }
+
+export class OpenaiTranscriber extends AbstractTranscriber {
+  async transcribe ({
+    mediaFilePath,
+    model = new WhisperBuiltinModel('tiny'),
+    language,
+    format = 'vtt',
+    runId = buildSUUID()
+  }: WhisperTranscribeArgs): Promise<TranscriptFile> {
+    this.assertLanguageDetectionAvailable(language)
+
+    const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })
+    const languageArgs = language ? [ '--language', language ] : []
+
+    this.createRun(runId)
+    this.startRun()
+    await $$`${this.engine.binary} ${[
+      mediaFilePath,
+      '--word_timestamps',
+      'True',
+      '--model',
+      model?.path || model.name,
+      '--output_format',
+      'all',
+      '--output_dir',
+      this.transcriptDirectory,
+      ...languageArgs
+    ]}`
+    this.stopRun()
+
+    return new TranscriptFile({
+      language: language || await this.getDetectedLanguage(mediaFilePath),
+      path: this.getTranscriptFilePath(mediaFilePath, format),
+      format
+    })
+  }
+
+  async getDetectedLanguage (mediaFilePath: string) {
+    const { language } = await this.readJsonTranscriptFile(mediaFilePath)
+
+    return language
+  }
+
+  async readJsonTranscriptFile (mediaFilePath: string) {
+    return JSON.parse(await readFile(this.getTranscriptFilePath(mediaFilePath, 'json'), 'utf8'))
+  }
+
+  getTranscriptFilePath (mediaFilePath: string, format: TranscriptFormat) {
+    return join(this.transcriptDirectory, `${parse(mediaFilePath).name}.${format}`)
+  }
+}
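Because the CLI is invoked with `--output_format all`, every format (including `json`) lands in `transcriptDirectory`, and `getDetectedLanguage()` simply reads the detected language back from that JSON sibling. A sketched usage, reusing the `transcriber` from the README example above:

```typescript
// Omitting `language` is allowed here since the engine declares languageDetection: true
const transcriptFile = await transcriber.transcribe({
  mediaFilePath: './myVideo.mp4',
  format: 'vtt'
})

// Filled from the language recorded in myVideo.json next to the other outputs
console.log(transcriptFile.language)
```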
diff --git a/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts
new file mode 100644
index 000000000..fcdd33eb1
--- /dev/null
+++ b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts
@@ -0,0 +1,55 @@
+import { $ } from 'execa'
+import { buildSUUID } from '@peertube/peertube-node-utils'
+import assert from 'node:assert'
+import { join, parse } from 'node:path'
+import { existsSync } from 'node:fs'
+import { rename } from 'node:fs/promises'
+import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
+import { OpenaiTranscriber, WhisperTranscribeArgs } from './openai-transcriber.js'
+import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
+
+export class WhisperTimestampedTranscriber extends OpenaiTranscriber {
+  async transcribe ({
+    mediaFilePath,
+    model = new WhisperBuiltinModel('tiny'),
+    language,
+    format = 'vtt',
+    runId = buildSUUID()
+  }: WhisperTranscribeArgs): Promise<TranscriptFile> {
+    this.assertLanguageDetectionAvailable(language)
+
+    const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })
+    const languageArgs = language ? [ '--language', language ] : []
+
+    this.createRun(runId)
+    this.startRun()
+    await $$`${this.engine.binary} ${[
+      mediaFilePath,
+      '--model',
+      model?.path || model.name,
+      '--output_format',
+      'all',
+      '--output_dir',
+      this.transcriptDirectory,
+      ...languageArgs
+    ]}`
+    this.stopRun()
+
+    const internalTranscriptPath = this.getTranscriptFilePath(mediaFilePath, format, false)
+    const transcriptPath = join(this.transcriptDirectory, `${parse(mediaFilePath).name}.${format}`)
+    // Whisper timestamped outputs files keeping the media file extension by default, e.g. video.mp4.vtt
+    // @see https://github.com/linto-ai/whisper-timestamped/issues/189
+    assert(existsSync(internalTranscriptPath), `${internalTranscriptPath} file doesn't exist.`)
+    await rename(internalTranscriptPath, transcriptPath)
+
+    return new TranscriptFile({
+      language: language || await this.getDetectedLanguage(mediaFilePath),
+      path: transcriptPath,
+      format
+    })
+  }
+
+  getTranscriptFilePath (mediaFilePath: string, format: TranscriptFormat, words = true) {
+    return join(this.transcriptDirectory, `${parse(mediaFilePath).base}${words ? 
'.words' : ''}.${format}`) + } +} diff --git a/packages/transcription/src/whisper/whisper-builtin-model.ts b/packages/transcription/src/whisper/whisper-builtin-model.ts new file mode 100644 index 000000000..32981ad20 --- /dev/null +++ b/packages/transcription/src/whisper/whisper-builtin-model.ts @@ -0,0 +1,11 @@ +import { TranscriptionModel } from '../transcription-model.js' + +export type WhisperBuiltinModelName = 'tiny' | 'base' | 'small' | 'medium' | 'large' | 'large-v2' | 'large-v3' + +export class WhisperBuiltinModel extends TranscriptionModel { + + // eslint-disable-next-line @typescript-eslint/no-useless-constructor + constructor (name: WhisperBuiltinModelName) { + super(name) + } +} diff --git a/packages/transcription/tsconfig.json b/packages/transcription/tsconfig.json new file mode 100644 index 000000000..94971d65a --- /dev/null +++ b/packages/transcription/tsconfig.json @@ -0,0 +1,15 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "outDir": "./dist", + "rootDir": "src", + "tsBuildInfoFile": "./dist/.tsbuildinfo" + }, + "references": [ + { "path": "../models" }, + { "path": "../core-utils" }, + { "path": "../node-utils" }, + { "path": "../jiwer" }, + { "path": "../server-commands" } + ] +} diff --git a/packages/transcription/tsconfig.types.json b/packages/transcription/tsconfig.types.json new file mode 100644 index 000000000..9edb53ece --- /dev/null +++ b/packages/transcription/tsconfig.types.json @@ -0,0 +1,10 @@ +{ + "extends": "./tsconfig.json", + "compilerOptions": { + "outDir": "../types-generator/dist/peertube-transcription", + "tsBuildInfoFile": "../types-generator/dist/peertube-transcription/.tsbuildinfo", + "stripInternal": true, + "removeComments": false, + "emitDeclarationOnly": true + } +} diff --git a/scripts/ci.sh b/scripts/ci.sh index 3d29b7ae9..8d483b42c 100755 --- a/scripts/ci.sh +++ b/scripts/ci.sh @@ -146,4 +146,13 @@ elif [ "$1" = "lint" ]; then npm run swagger-cli -- validate support/doc/api/openapi.yaml ( cd client && npm run lint ) +elif [ "$1" = "transcription" ]; then + npm run preinstall --workspace=@peertube/peertube-transcription --workspace=@peertube/peertube-jiwer + npm run build:server + npm run build:tests + + transcriptionFiles=$(findTestFiles ./packages/tests/dist/transcription) + jiwerFiles=$(findTestFiles ./packages/tests/dist/jiwer) + + MOCHA_PARALLEL=true runJSTest "$1" $((3*$speedFactor)) $transcriptionFiles $jiwerFiles fi diff --git a/server/tsconfig.json b/server/tsconfig.json index 21442d082..ed0bfca48 100644 --- a/server/tsconfig.json +++ b/server/tsconfig.json @@ -14,6 +14,7 @@ { "path": "../packages/ffmpeg" }, { "path": "../packages/models" }, { "path": "../packages/node-utils" }, + { "path": "../packages/transcription" }, { "path": "../packages/typescript-utils" } ], "include": [ diff --git a/tsconfig.eslint.json b/tsconfig.eslint.json index c2e868173..61542e14e 100644 --- a/tsconfig.eslint.json +++ b/tsconfig.eslint.json @@ -24,9 +24,11 @@ { "path": "./apps/peertube-cli" }, { "path": "./packages/core-utils" }, { "path": "./packages/ffmpeg" }, + { "path": "./packages/jiwer" }, { "path": "./packages/models" }, { "path": "./packages/node-utils" }, { "path": "./packages/server-commands" }, + { "path": "./packages/transcription" }, { "path": "./packages/typescript-utils" } ] }