PeerTube/packages/transcription-devtools/src/levenshtein.ts

/**
 * Picks the value of the next edit-distance cell from its three neighbours.
 *
 * - If `d0` or `d2` is strictly smaller than `d1`, the result is the smaller of
 *   `d0` and `d2` plus one (`d0` is used when they are equal).
 * - Otherwise the result is `d1` unchanged when the two character codes match,
 *   and `d1 + 1` when they differ.
 *
 * @param d0 first neighbouring distance
 * @param d1 neighbouring distance that is reused as-is when the character codes match
 * @param d2 third neighbouring distance
 * @param bx character code taken from one string
 * @param ay character code taken from the other string
 * @returns the distance for the new cell
 */
function min (d0: number, d1: number, d2: number, bx: number, ay: number) {
  const neighbourIsCheaper = d0 < d1 || d2 < d1

  if (neighbourIsCheaper) {
    const cheapest = d0 > d2 ? d2 : d0

    return cheapest + 1
  }

  if (bx === ay) {
    return d1
  }

  return d1 + 1
}

/**
 * Computes the Levenshtein (edit) distance between two strings.
 *
 * The arguments are interchangeable: the shorter string is always moved into `a`
 * before any work is done. Characters are compared with `charCodeAt`, i.e. one
 * UTF-16 code unit at a time.
 *
 * Outline:
 *  1. return 0 for identical strings,
 *  2. ignore the suffix and the prefix shared by both strings,
 *  3. answer directly when what is left is trivial,
 *  4. otherwise run the distance computation over the remaining middle parts,
 *     consuming `b` four characters per pass and then one character per pass
 *     for whatever is left.
 *
 * @param a first string to compare
 * @param b second string to compare
 * @returns the edit distance between `a` and `b` (0 when they are equal)
 * @see https://github.com/gustf/js-levenshtein
 */
export function levenshteinDistance (a: string, b: string): number {
  if (a === b) {
    return 0
  }

  // Make sure `a` is the shorter (or equally long) string
  if (a.length > b.length) {
    const tmp = a
    a = b
    b = tmp
  }

  let la = a.length
  let lb = b.length

  // Drop the common suffix by shrinking both lengths
  while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) {
    la--
    lb--
  }

  let offset = 0

  // Skip the common prefix; `offset` is where the differing middle part starts in both strings
  while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) {
    offset++
  }

  // From here on `la` / `lb` are the lengths of the middle parts only
  la -= offset
  lb -= offset

  // Nothing left of `a`: the rest of `b` has to be inserted.
  // Fewer than 3 characters left of `b`: the first and last remaining characters of both
  // strings are known to differ (the loops above stopped on them), so each remaining
  // character of `b` costs one edit.
  if (la === 0 || lb < 3) {
    return lb
  }

  // x: number of characters of the middle part of `b` consumed so far
  let x = 0
  // y: index into `vector`
  let y: number
  // d0..d3 and dd: rolling distance values for the columns being processed
  let d0: number
  let d1: number
  let d2: number
  let d3: number
  let dd: number
  // dy: value read from `vector` before it gets overwritten
  let dy: number
  // ay: current character code of `a`
  let ay: number
  // bx0..bx3: character codes of the characters of `b` being processed
  let bx0: number
  let bx1: number
  let bx2: number
  let bx3: number

  // Interleaved storage: even indexes hold a distance value, odd indexes hold the
  // character code of `a` belonging to that entry
  const vector: number[] = []

  for (y = 0; y < la; y++) {
    vector.push(y + 1)
    vector.push(a.charCodeAt(offset + y))
  }

  // Upper bound for the `y += 2` loops below, which visit every even index of `vector`
  const len = vector.length - 1

  // Main loop: handle four characters of `b` per pass over `vector`
  for (; x < lb - 3;) {
    // Each dN is seeded with its column index as a side effect of the argument expression
    bx0 = b.charCodeAt(offset + (d0 = x))
    bx1 = b.charCodeAt(offset + (d1 = x + 1))
    bx2 = b.charCodeAt(offset + (d2 = x + 2))
    bx3 = b.charCodeAt(offset + (d3 = x + 3))
    dd = (x += 4)
    for (y = 0; y < len; y += 2) {
      dy = vector[y]
      ay = vector[y + 1]
      // Each call uses the result of the previous one, so the order of these four lines matters
      d0 = min(dy, d0, d1, bx0, ay)
      d1 = min(d0, d1, d2, bx1, ay)
      d2 = min(d1, d2, d3, bx2, ay)
      dd = min(d2, d3, dd, bx3, ay)
      // Only the value of the last of the four columns is stored back
      vector[y] = dd
      // Shift the rolling values for the next character of `a`
      d3 = d2
      d2 = d1
      d1 = d0
      d0 = dy
    }
  }

  // Remainder loop: handle the characters of `b` left over by the main loop, one per pass
  for (; x < lb;) {
    bx0 = b.charCodeAt(offset + (d0 = x))
    dd = ++x
    for (y = 0; y < len; y += 2) {
      dy = vector[y]
      vector[y] = dd = min(dy, d0, dd, bx0, vector[y + 1])
      d0 = dy
    }
  }

  // `dd` is always assigned here: the early return above guarantees `lb >= 3` and `la >= 1`,
  // so at least one of the two loops ran its inner loop
  return dd
}
feat(transcription): groundwork chore: fiddling around some more chore: add ctranslate2 and timestamped chore: add performance markers chore: refactor test chore: change worflow name chore: ensure Python3 chore(duration): convert to chai/mocha syntahx chore(transcription): add individual tests for others transcribers chore(transcription): implement formats test of all implementations Also compare result of other implementation to the reference implementation chore(transcription): add more test case with other language and models size and local model chore(test): wip ctranslate 2 adapat chore(transcription): wip transcript file and benchmark chore(test): clean a bit chore(test): clean a bit chore(test): refacto timestamed spec chore(test): update workflow chore(test): fix glob expansion with sh chore(test): extract some hw info chore(test): fix async tests chore(benchmark): add model info feat(transcription): allow use of a local mode in timestamped-whisper feat(transcription): extract run and profiling info in own value object feat(transcription): extract run concept in own class an run more bench chore(transcription): somplify run object only a uuid is now needed and add more benchmark scenario docs(transcription): creates own package readme docs(transcription): add local model usage docs(transcription): update README fix(transcription): use fr video for better comparison chore(transcription): make openai comparison passed docs(timestamped): clea chore(transcription): change transcribers transcribe method signature Introduce whisper builtin model. fix(transcription): activate language detection Forbid transcript creation without a language. Add `languageDetection` flag to an engine and some assertions. 
Fix an issue in `whisper-ctranslate2` : https://github.com/Softcatala/whisper-ctranslate2/pull/93 chore(transcription): use PeerTube time helpers instead of custom ones Update existing time function to output an integer number of seconds and add a ms human-readable time formatter with hints of tests. chore(transcription): use PeerTube UUID helpers chore(transcription): enable CER evaluation Thanks to this recent fix in Jiwer <3 https://github.com/jitsi/jiwer/issues/873 chore(jiwer): creates JiWer package I'm not very happy with the TranscriptFileEvaluator constructor... suggestions ? chore(JiWer): add usage in README docs(jiwer): update JiWer readme chore(transcription): use FunMOOC video in fixtures chore(transcription): add proper english video fixture chore(transcription): use os tmp directory where relevant chore(transcription): fix jiwer cli test reference.txt chore(transcription): move benchmark out of tests chore(transcription): remove transcription workflow docs(transcription): add benchmark info fix(transcription): use ms precision in other transcribers chore(transcription): simplify most of the tests chore(transcription): remove slashes when building path with join chore(transcription): make fromPath method async chore(transcription): assert path to model is a directory for CTranslate2 transcriber chore(transcription): ctranslate2 assertion chore(transcription): ctranslate2 assertion chore(transcription): add preinstall script for Python dependencies chore(transcription): add download and unzip utils functions chore(transcription): add download and unzip utils functions chore(transcription): download & unzip models fixtures chore(transcription): zip chore(transcription): raise download file test timeout chore(transcription): simplify download file test chore(transcription): add transcriptions test to CI chore(transcription): raise test preconditions timeout chore(transcription): run preinstall scripts before running ci chore(transcription): create 
dedicated tmp folder for transcriber tests chore(transcription): raise timeout some more chore(transcription): raise timeout some more chore(transcription): raise timeout some more chore(transcription): raise timeout some more chore(transcription): raise timeout some more chore(transcription): raise timeout some more chore(transcription): raise timeout some more chore(transcription): raise timeout some more chore(transcription): use short video for local model test chore(transcription): raise timeout some more chore(transcription): raise timeout some more chore(transcription): raise timeout some more chore(transcription): setup verbosity based on NODE_ENV value 2024-03-29 04:34:45 -05:00			`function min (d0: number, d1: number, d2: number, bx: number, ay: number) {`
			`return d0 < d1 \|\| d2 < d1`
			`? d0 > d2`
			`? d2 + 1`
			`: d0 + 1`
			`: bx === ay`
			`? d1`
			`: d1 + 1`
			`}`

			`/**`
			`* @see https://github.com/gustf/js-levenshtein`
			`*/`
			`export function levenshteinDistance (a: string, b: string): number {`
			`if (a === b) {`
			`return 0`
			`}`

			`if (a.length > b.length) {`
			`const tmp = a`
			`a = b`
			`b = tmp`
			`}`

			`let la = a.length`
			`let lb = b.length`

			`while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) {`
			`la--`
			`lb--`
			`}`

			`let offset = 0`

			`while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) {`
			`offset++`
			`}`

			`la -= offset`
			`lb -= offset`

			`if (la === 0 \|\| lb < 3) {`
			`return lb`
			`}`

			`let x = 0`
			`let y: number`
			`let d0: number`
			`let d1: number`
			`let d2: number`
			`let d3: number`
			`let dd: number`
			`let dy: number`
			`let ay: number`
			`let bx0: number`
			`let bx1: number`
			`let bx2: number`
			`let bx3: number`

			`const vector: number[] = []`

			`for (y = 0; y < la; y++) {`
			`vector.push(y + 1)`
			`vector.push(a.charCodeAt(offset + y))`
			`}`

			`const len = vector.length - 1`

			`for (; x < lb - 3;) {`
			`bx0 = b.charCodeAt(offset + (d0 = x))`
			`bx1 = b.charCodeAt(offset + (d1 = x + 1))`
			`bx2 = b.charCodeAt(offset + (d2 = x + 2))`
			`bx3 = b.charCodeAt(offset + (d3 = x + 3))`
			`dd = (x += 4)`
			`for (y = 0; y < len; y += 2) {`
			`dy = vector[y]`
			`ay = vector[y + 1]`
			`d0 = min(dy, d0, d1, bx0, ay)`
			`d1 = min(d0, d1, d2, bx1, ay)`
			`d2 = min(d1, d2, d3, bx2, ay)`
			`dd = min(d2, d3, dd, bx3, ay)`
			`vector[y] = dd`
			`d3 = d2`
			`d2 = d1`
			`d1 = d0`
			`d0 = dy`
			`}`
			`}`

			`for (; x < lb;) {`
			`bx0 = b.charCodeAt(offset + (d0 = x))`
			`dd = ++x`
			`for (y = 0; y < len; y += 2) {`
			`dy = vector[y]`
			`vector[y] = dd = min(dy, d0, dd, bx0, vector[y + 1])`
			`d0 = dy`
			`}`
			`}`

			`return dd`
			`}`