/* Port from https://github.com/josephrocca/clip-bpe-js/blob/main/mod.js */

import htmlEntities from './html5-entities'
import bpeVocabData from './bpe_simple_vocab_16e6'
// import ftfy from "https://deno.land/x/ftfy_pyodide@v0.1.1/mod.js";

/**
 * Returns the UTF-16 code unit of the first character of `c`
 * (`NaN` when `c` is empty), mirroring how Python's `ord` is used in this file.
 */
const ord = (c: string): number => c.charCodeAt(0)

/**
 * Builds an array of numbers in the manner of Python's `range`.
 *
 * - `range(n)` yields `0, 1, ..., n - 1`.
 * - `range(start, stop)` yields `start, start + 1, ...` up to but excluding `stop`.
 * - `range(start, stop, step)` advances by `step`, which may be negative.
 *
 * @param start First value, or the exclusive upper bound when `stop` is omitted.
 * @param stop Exclusive end of the sequence.
 * @param step Increment between consecutive values; defaults to 1.
 * @returns The generated values; empty when the bounds are already crossed.
 * @throws RangeError when `step` is 0, since such a sequence could never advance.
 */
const range = (start: number, stop?: number, step = 1): number[] => {
  if (step === 0) {
    // Without this guard a zero step with start > stop would loop forever.
    throw new RangeError('range() step must not be zero')
  }

  // Single-argument form: the only argument is the upper bound and counting starts at 0.
  const from = stop === undefined ? 0 : start
  const to = stop === undefined ? start : stop
  const result: number[] = []

  if (step > 0) {
    for (let i = from; i < to; i += step) {
      result.push(i)
    }
  } else {
    for (let i = from; i > to; i += step) {
      result.push(i)
    }
  }

  return result
}

/**
 * Builds the byte -> character table used to represent arbitrary bytes as text.
 *
 * Bytes in the ranges '!'..'~', '¡'..'¬' and '®'..'ÿ' map to the character with
 * the same code. Every other byte in 0..255 is assigned, in increasing byte
 * order, the next unused code starting at 256.
 *
 * @returns An object keyed by byte value (numbers become string keys) whose
 *   values are single-character strings.
 */
const bytesToUnicode = (): Record<string, string> => {
  const byteValues = [
    ...range(ord('!'), ord('~') + 1),
    ...range(ord('¡'), ord('¬') + 1),
    ...range(ord('®'), ord('ÿ') + 1),
  ]
  const charCodes = [...byteValues]
  let shifted = 0

  range(256).forEach((byte) => {
    if (byteValues.includes(byte)) {
      return
    }

    // This byte has no stand-in of its own, so it borrows a code above 255.
    byteValues.push(byte)
    charCodes.push(256 + shifted)
    shifted += 1
  })

  const table: Record<string, string> = {}

  byteValues.forEach((byte, index) => {
    table[byte] = String.fromCharCode(charCodes[index] as number)
  })

  return table
}

/**
 * Lists every pair of neighbouring symbols in `word`, in order.
 *
 * @param word Sequence of symbols.
 * @returns `[word[i - 1], word[i]]` for each `i >= 1`; empty when `word` has
 *   fewer than two symbols.
 */
const getPairs = (word: string[]): string[][] => {
  const adjacent: string[][] = []

  for (let i = 1; i < word.length; i += 1) {
    adjacent.push([word[i - 1] as string, word[i] as string])
  }

  return adjacent
}

/**
 * Runs the HTML entity decoder over `text` twice and trims surrounding
 * whitespace from the result.
 *
 * The ftfy text-fixing step is not applied here (its import is commented out
 * at the top of this file).
 */
const basicClean = (text: string): string => {
  const decodedOnce = htmlEntities.decode(text)
  const decodedTwice = htmlEntities.decode(decodedOnce)

  return decodedTwice.trim()
}

/**
 * Collapses every run of whitespace in `text` to a single space and removes
 * leading and trailing whitespace.
 */
const whitespaceClean = (text: string): string => text.split(/\s+/).join(' ').trim()

/**
 * Byte-level BPE tokenizer.
 *
 * Text is cleaned, lower-cased, split with `pat`, converted to UTF-8 bytes,
 * each byte is swapped for a printable stand-in character, and the resulting
 * symbol string is merged pair by pair according to `bpeRanks` before being
 * looked up in `encoder`.
 */
export default class {
  /** Byte value (used as an object key) -> printable stand-in character. */
  byteEncoder: Record<string, string>

  /** Inverse of `byteEncoder`: stand-in character -> byte value as a decimal string. */
  byteDecoder: Record<string, string>

  /** Vocabulary entry -> token id. */
  encoder: Record<string, number>

  /** Token id -> vocabulary entry. */
  decoder: Record<number, string>

  /** Merge priority per symbol pair, keyed by both symbols joined with `·😎·`; lower ranks merge first. */
  bpeRanks: Record<string, number>

  /** Memoised `bpe` results keyed by the byte-encoded token. */
  cache: Record<string, string>

  /** Splits text into special tokens, contractions, letter runs, single number characters and other symbol runs. */
  pat: RegExp

  /** Builds the byte tables, the vocabulary and the merge ranks from the bundled merges data. */
  constructor() {
    this.byteEncoder = bytesToUnicode()
    this.byteDecoder = Object.fromEntries(Object.entries(this.byteEncoder).map(([byte, symbol]) => [symbol, byte]))

    // The first line of the merges text is skipped; the next 49152 - 256 - 2 lines are used as merge rules.
    const mergeCount = 49152 - 256 - 2
    const merges = bpeVocabData.text
      .split('\n')
      .slice(1, mergeCount + 1)
      .map((line) => line.split(' '))

    // The vocabulary must start with the byte symbols in the order Python's dict
    // `.values()` yields them. A JS object enumerates integer-like keys in ascending
    // key order instead of insertion order, so `Object.values(byteEncoder)` alone
    // gives the wrong order. The required order is exactly ascending code point:
    // '!'..'~', '¡'..'¬', '®'..'ÿ', then U+0100..U+0143.
    const byteSymbols = Object.values(this.byteEncoder).sort((a, b) => a.charCodeAt(0) - b.charCodeAt(0))
    const vocab = [...byteSymbols, ...byteSymbols.map((symbol) => `${symbol}</w>`)]

    for (const merge of merges) {
      vocab.push(merge.join(''))
    }
    vocab.push('<|startoftext|>', '<|endoftext|>')

    this.encoder = Object.fromEntries(vocab.map((entry, id) => [entry, id]))
    this.decoder = Object.fromEntries(Object.entries(this.encoder).map(([entry, id]) => [id, entry]))
    this.bpeRanks = Object.fromEntries(merges.map((pair, rank) => [pair.join('·😎·'), rank])) // ·😎· because js doesn't yet have tuples
    this.cache = { '<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>' }
    this.pat = /<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|\p{L}+|\p{N}|[^\s\p{L}\p{N}]+/giu
  }

  /**
   * Applies the BPE merges to one byte-encoded token.
   *
   * @param token Token whose characters are byte stand-ins from `byteEncoder`.
   * @returns The merged symbols joined by single spaces; the last symbol ends with `</w>`.
   */
  bpe(token: string): string {
    // Own-property check: a plain lookup would also find inherited members such as
    // `constructor`, returning a function instead of a cached string.
    if (Object.prototype.hasOwnProperty.call(this.cache, token)) {
      return this.cache[token] as string
    }

    let word = [...token.slice(0, -1), `${token.slice(-1)}</w>`]

    if (getPairs(word).length === 0) {
      return `${token}</w>`
    }

    while (word.length > 1) {
      // Pick the adjacent pair with the lowest merge rank, if any pair is mergeable.
      let bigram: string[] | null = null
      let minRank = Infinity

      for (const pair of getPairs(word)) {
        const rank = this.bpeRanks[pair.join('·😎·')]

        if (rank !== undefined && rank < minRank) {
          minRank = rank
          bigram = pair
        }
      }

      if (bigram === null) {
        break
      }

      const first = bigram[0] as string
      const second = bigram[1] as string
      const newWord: string[] = []
      let i = 0

      // Rebuild the word, fusing every non-overlapping occurrence of (first, second).
      while (i < word.length) {
        const j = word.indexOf(first, i)

        if (j === -1) {
          newWord.push(...word.slice(i))
          break
        }

        newWord.push(...word.slice(i, j))
        i = j

        if (i < word.length - 1 && word[i + 1] === second) {
          newWord.push(first + second)
          i += 2
        } else {
          newWord.push(word[i] as string)
          i += 1
        }
      }

      word = newWord
    }

    const finalWord = word.join(' ')

    this.cache[token] = finalWord

    return finalWord
  }

  /**
   * Converts text to token ids (no start/end tokens, no padding).
   *
   * @param text Raw input text.
   * @returns Token ids in reading order.
   */
  encode(text: string): number[] {
    const utf8 = new TextEncoder()
    const bpeTokens: number[] = []
    const cleaned = whitespaceClean(basicClean(text)).toLowerCase()

    for (const match of cleaned.matchAll(this.pat)) {
      // The byte table is defined over UTF-8 bytes, not UTF-16 code units, so the
      // token is encoded to bytes first; otherwise non-Latin-1 characters are lost.
      const token = Array.from(utf8.encode(match[0]), (byte) => this.byteEncoder[byte]).join('')

      for (const piece of this.bpe(token).split(' ')) {
        bpeTokens.push(this.encoder[piece] as number)
      }
    }

    return bpeTokens
  }

  /**
   * Encodes text into a fixed-length sequence of 77 ids: start token 49406,
   * the text tokens (truncated so that start + text is at most 76 ids),
   * end token 49407, then zero padding.
   *
   * @param text Raw input text.
   * @returns Exactly 77 token ids.
   */
  encodeForCLIP(text: string): number[] {
    const tokens = [49406, ...this.encode(text)].slice(0, 76) // start token first, then truncate

    tokens.push(49407) // end token
    while (tokens.length < 77) tokens.push(0)

    return tokens
  }

  /**
   * Converts token ids back to text.
   *
   * Ids without a vocabulary entry contribute nothing. Each `</w>` marker
   * becomes a space, so the result may end with a trailing space.
   *
   * @param tokens Token ids.
   * @returns The decoded text; invalid UTF-8 sequences become U+FFFD.
   */
  decode(tokens: number[]): string {
    const symbols = tokens.map((token) => this.decoder[token]).join('')
    // Map each stand-in character back to its byte, then decode the bytes as UTF-8.
    const bytes = Uint8Array.from([...symbols], (symbol) => Number(this.byteDecoder[symbol]))

    return new TextDecoder('utf-8', { ignoreBOM: true }).decode(bytes).replace(/<\/w>/g, ' ')
  }
}
