// technopark-scraper/node_modules/@streamparser/json/dist/deno/tokenizer.ts

import { charset, escapedSequences } from './utils/utf-8.ts';
import {
  StringBuilder,
  NonBufferedString,
  BufferedString,
} from './utils/bufferedString.ts';
import { TokenType } from './utils/constants.ts';
import { JsonPrimitive } from './utils/types.ts';

// Local aliases for the token kinds emitted through `onToken`, so the state
// machine below can refer to them without the `TokenType.` prefix.
const LEFT_BRACE = TokenType.LEFT_BRACE;
const RIGHT_BRACE = TokenType.RIGHT_BRACE;
const LEFT_BRACKET = TokenType.LEFT_BRACKET;
const RIGHT_BRACKET = TokenType.RIGHT_BRACKET;
const COLON = TokenType.COLON;
const COMMA = TokenType.COMMA;
const TRUE = TokenType.TRUE;
const FALSE = TokenType.FALSE;
const NULL = TokenType.NULL;
const STRING = TokenType.STRING;
const NUMBER = TokenType.NUMBER;

/**
 * States of the tokenizer's byte-level state machine.
 *
 * The numeric values are spelled out because the tokenizer relies on them:
 * the `STRING_UNICODE_DIGIT_*` members must stay consecutive (the state is
 * advanced with `+= 1`), and the reverse mapping is used to print the state
 * name in error messages.
 */
enum TokenizerStates {
  // Lifecycle
  START = 0,
  ENDED = 1,
  ERROR = 2,

  // Literal `true` (after "t", "tr", "tru")
  TRUE1 = 3,
  TRUE2 = 4,
  TRUE3 = 5,

  // Literal `false` (after "f", "fa", "fal", "fals")
  FALSE1 = 6,
  FALSE2 = 7,
  FALSE3 = 8,
  FALSE4 = 9,

  // Literal `null` (after "n", "nu", "nul")
  NULL1 = 10,
  NULL2 = 11,
  NULL3 = 12,

  // Strings
  STRING_DEFAULT = 13,
  STRING_AFTER_BACKSLASH = 14,
  STRING_UNICODE_DIGIT_1 = 15,
  STRING_UNICODE_DIGIT_2 = 16,
  STRING_UNICODE_DIGIT_3 = 17,
  STRING_UNICODE_DIGIT_4 = 18,
  STRING_INCOMPLETE_CHAR = 19,

  // Numbers
  NUMBER_AFTER_INITIAL_MINUS = 20,
  NUMBER_AFTER_INITIAL_ZERO = 21,
  NUMBER_AFTER_INITIAL_NON_ZERO = 22,
  NUMBER_AFTER_FULL_STOP = 23,
  NUMBER_AFTER_DECIMAL = 24,
  NUMBER_AFTER_E = 25,
  NUMBER_AFTER_E_AND_SIGN = 26,
  NUMBER_AFTER_E_AND_DIGIT = 27,

  // Multi-byte separator in progress
  SEPARATOR = 28,
}

/** Options accepted by the `Tokenizer` constructor. All fields are optional. */
export interface TokenizerOptions {
  /**
   * Size passed to `BufferedString` for accumulating string tokens.
   * Only honoured when greater than 4; otherwise a `NonBufferedString` is used.
   */
  stringBufferSize?: number;
  /**
   * Size passed to `BufferedString` for accumulating number tokens.
   * Only honoured when greater than 0; otherwise a `NonBufferedString` is used.
   */
  numberBufferSize?: number;
  /**
   * Separator string. When set (and non-empty), its UTF-8 bytes are matched
   * wherever a new token may start and reported as a `SEPARATOR` token.
   */
  separator?: string;
}

// Defaults merged under the caller's options in the Tokenizer constructor:
// a size of 0 selects the non-buffered builders, and no separator is matched.
const defaultOpts: TokenizerOptions = {
  stringBufferSize: 0,
  numberBufferSize: 0,
  separator: undefined,
};

/**
 * Error type used by the tokenizer for malformed or truncated input.
 */
export class TokenizerError extends Error {
  /**
   * @param message Human-readable description of the problem.
   */
  constructor(message: string) {
    super(message);
    // When compiled to ES5, `extends Error` yields instances whose prototype
    // is `Error.prototype`, so the chain has to be restored by hand. Using
    // `new.target.prototype` (rather than `TokenizerError.prototype`) keeps
    // the chain intact for subclasses of TokenizerError as well.
    Object.setPrototypeOf(this, new.target.prototype);
  }
}

/**
 * Incremental, byte-oriented JSON tokenizer.
 *
 * Input is fed in chunks through `write`; each recognised token is reported
 * through `onToken`, problems through `onError`, and completion (after `end`)
 * through `onEnd`. Those three callbacks are meant to be overridden: the
 * default `onToken` throws, the default `onError` rethrows the error and the
 * default `onEnd` does nothing.
 *
 * State is kept between `write` calls, so a token (including a multi-byte
 * UTF-8 character inside a string) may be split across chunks.
 */
export default class Tokenizer {
  /** Current state of the state machine driven by `write`. */
  private state = TokenizerStates.START;

  /** Separator string from the options; used as the value of SEPARATOR tokens. */
  private separator?: string;
  /** UTF-8 bytes of `separator`, matched byte by byte against the input. */
  private separatorBytes?: Uint8Array;
  /** Index of the last separator byte matched while in the SEPARATOR state. */
  private separatorIndex = 0;
  /** Accumulates the content of the string token currently being read. */
  private bufferedString: StringBuilder;
  /** Accumulates the characters of the number token currently being read. */
  private bufferedNumber: StringBuilder;

  private unicode: string | undefined = undefined; // hex digits collected so far for a \uXXXX escape
  private highSurrogate: number | undefined = undefined; // high surrogate from a \uXXXX escape, waiting for the next \uXXXX escape
  private bytes_remaining = 0; // number of bytes remaining in multi byte utf8 char to read after split boundary
  private bytes_in_sequence = 0; // bytes in multi byte utf8 char to read
  private char_split_buffer = new Uint8Array(4); // for rebuilding chars split before boundary is reached
  private encoder = new TextEncoder();
  /**
   * Position counter handed to `onToken`. Starts at -1, is incremented for
   * every byte consumed in the START state and is bumped after multi-byte
   * tokens (strings, numbers, literals) have been emitted.
   */
  private offset = -1;

  /**
   * @param opts Optional settings; missing fields fall back to `defaultOpts`.
   */
  constructor(opts?: TokenizerOptions) {
    opts = { ...defaultOpts, ...opts };

    // Buffered builders are only used when a usable size was requested.
    this.bufferedString =
      opts.stringBufferSize && opts.stringBufferSize > 4
        ? new BufferedString(opts.stringBufferSize)
        : new NonBufferedString();
    this.bufferedNumber =
      opts.numberBufferSize && opts.numberBufferSize > 0
        ? new BufferedString(opts.numberBufferSize)
        : new NonBufferedString();

    // An empty or missing separator disables separator matching altogether.
    this.separator = opts.separator;
    this.separatorBytes = opts.separator
      ? this.encoder.encode(opts.separator)
      : undefined;
  }

  /** `true` once `end` has successfully moved the tokenizer to the ENDED state. */
  public get isEnded(): boolean {
    return this.state === TokenizerStates.ENDED;
  }

  /**
   * Feeds a chunk of input to the tokenizer and emits every token that is
   * completed by it.
   *
   * @param input A `Uint8Array` (used as is), a string (encoded as UTF-8), or
   *   another typed array / plain array of byte values (copied into a
   *   `Uint8Array`). Any other value is reported through `error` as a
   *   `TypeError`.
   *
   * Unexpected bytes are reported through `error` as a `TokenizerError` and
   * processing of the chunk stops.
   */
  public write(input: Iterable<number> | string): void {
    let buffer: Uint8Array;
    if (input instanceof Uint8Array) {
      buffer = input;
    } else if (typeof input === "string") {
      buffer = this.encoder.encode(input);
    } else if (
      (typeof input === "object" && "buffer" in input) ||
      Array.isArray(input)
    ) {
      buffer = Uint8Array.from(input);
    } else {
      this.error(
        new TypeError(
          "Unexpected type. The `write` function only accepts Arrays, TypedArrays and Strings."
        )
      );
      return;
    }

    // Every branch that accepts the current byte ends with `continue`.
    // Leaving the switch (via `break` or by matching no case) means the byte
    // was not acceptable in the current state and reaches the error below.
    for (let i = 0; i < buffer.length; i += 1) {
      const n = buffer[i]; // get current byte from buffer
      switch (this.state) {
        case TokenizerStates.START:
          this.offset += 1;

          // The separator is checked first, so it takes precedence over
          // whitespace and over any token starting with the same byte.
          if (this.separatorBytes && n === this.separatorBytes[0]) {
            if (this.separatorBytes.length === 1) {
              this.state = TokenizerStates.START;
              this.onToken(
                TokenType.SEPARATOR,
                this.separator as string,
                this.offset + this.separatorBytes.length - 1
              );
              continue;
            }
            this.state = TokenizerStates.SEPARATOR;
            continue;
          }

          if (
            n === charset.SPACE ||
            n === charset.NEWLINE ||
            n === charset.CARRIAGE_RETURN ||
            n === charset.TAB
          ) {
            // whitespace
            continue;
          }

          if (n === charset.LEFT_CURLY_BRACKET) {
            this.onToken(LEFT_BRACE, "{", this.offset);
            continue;
          }
          if (n === charset.RIGHT_CURLY_BRACKET) {
            this.onToken(RIGHT_BRACE, "}", this.offset);
            continue;
          }
          if (n === charset.LEFT_SQUARE_BRACKET) {
            this.onToken(LEFT_BRACKET, "[", this.offset);
            continue;
          }
          if (n === charset.RIGHT_SQUARE_BRACKET) {
            this.onToken(RIGHT_BRACKET, "]", this.offset);
            continue;
          }
          if (n === charset.COLON) {
            this.onToken(COLON, ":", this.offset);
            continue;
          }
          if (n === charset.COMMA) {
            this.onToken(COMMA, ",", this.offset);
            continue;
          }

          if (n === charset.LATIN_SMALL_LETTER_T) {
            this.state = TokenizerStates.TRUE1;
            continue;
          }

          if (n === charset.LATIN_SMALL_LETTER_F) {
            this.state = TokenizerStates.FALSE1;
            continue;
          }

          if (n === charset.LATIN_SMALL_LETTER_N) {
            this.state = TokenizerStates.NULL1;
            continue;
          }

          if (n === charset.QUOTATION_MARK) {
            this.bufferedString.reset();
            this.state = TokenizerStates.STRING_DEFAULT;
            continue;
          }

          if (n >= charset.DIGIT_ONE && n <= charset.DIGIT_NINE) {
            this.bufferedNumber.reset();
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_INITIAL_NON_ZERO;
            continue;
          }

          if (n === charset.DIGIT_ZERO) {
            this.bufferedNumber.reset();
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_INITIAL_ZERO;
            continue;
          }

          if (n === charset.HYPHEN_MINUS) {
            this.bufferedNumber.reset();
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_INITIAL_MINUS;
            continue;
          }

          break;
        // STRING
        case TokenizerStates.STRING_DEFAULT:
          if (n === charset.QUOTATION_MARK) {
            // Closing quote: emit the string, then move the offset past it.
            const string = this.bufferedString.toString();
            this.state = TokenizerStates.START;
            this.onToken(STRING, string, this.offset);
            this.offset += this.bufferedString.byteLength + 1;
            continue;
          }

          if (n === charset.REVERSE_SOLIDUS) {
            this.state = TokenizerStates.STRING_AFTER_BACKSLASH;
            continue;
          }

          if (n >= 128) {
            // Parse multi byte (>=128) chars one at a time.
            // The lead byte determines how many bytes the character occupies.
            if (n >= 194 && n <= 223) {
              this.bytes_in_sequence = 2;
            } else if (n <= 239) {
              this.bytes_in_sequence = 3;
            } else {
              this.bytes_in_sequence = 4;
            }

            if (this.bytes_in_sequence <= buffer.length - i) {
              // The whole character is inside this chunk: copy it in one go.
              this.bufferedString.appendBuf(
                buffer,
                i,
                i + this.bytes_in_sequence
              );
              i += this.bytes_in_sequence - 1;
              continue;
            }

            // The bytes needed to complete the char fall outside this chunk:
            // we have a boundary split. Stash what we have and wait for more.
            this.bytes_remaining = i + this.bytes_in_sequence - buffer.length;
            this.char_split_buffer.set(buffer.subarray(i));
            i = buffer.length - 1;
            this.state = TokenizerStates.STRING_INCOMPLETE_CHAR;
            continue;
          }

          if (n >= charset.SPACE) {
            this.bufferedString.appendChar(n);
            continue;
          }

          // Bytes below SPACE (control characters) are rejected.
          break;
        case TokenizerStates.STRING_INCOMPLETE_CHAR: {
          // Carry over of a multi byte char split between data chunks: copy
          // as many of the missing bytes as this chunk provides. The chunk may
          // still be too short to complete the char, in which case we stay in
          // this state and continue with the next chunk.
          const available = Math.min(this.bytes_remaining, buffer.length - i);
          this.char_split_buffer.set(
            buffer.subarray(i, i + available),
            this.bytes_in_sequence - this.bytes_remaining
          );
          this.bytes_remaining -= available;
          i += available - 1;

          if (this.bytes_remaining === 0) {
            this.bufferedString.appendBuf(
              this.char_split_buffer,
              0,
              this.bytes_in_sequence
            );
            this.state = TokenizerStates.STRING_DEFAULT;
          }
          continue;
        }
        case TokenizerStates.STRING_AFTER_BACKSLASH: {
          // Single-character escapes are looked up in `escapedSequences`.
          const controlChar = escapedSequences[n];
          if (controlChar) {
            this.bufferedString.appendChar(controlChar);
            this.state = TokenizerStates.STRING_DEFAULT;
            continue;
          }

          if (n === charset.LATIN_SMALL_LETTER_U) {
            this.unicode = "";
            this.state = TokenizerStates.STRING_UNICODE_DIGIT_1;
            continue;
          }

          break;
        }
        case TokenizerStates.STRING_UNICODE_DIGIT_1:
        case TokenizerStates.STRING_UNICODE_DIGIT_2:
        case TokenizerStates.STRING_UNICODE_DIGIT_3:
          if (
            (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) ||
            (n >= charset.LATIN_CAPITAL_LETTER_A &&
              n <= charset.LATIN_CAPITAL_LETTER_F) ||
            (n >= charset.LATIN_SMALL_LETTER_A &&
              n <= charset.LATIN_SMALL_LETTER_F)
          ) {
            this.unicode += String.fromCharCode(n);
            // Relies on the STRING_UNICODE_DIGIT_* states being consecutive.
            this.state += 1;
            continue;
          }
          break;
        case TokenizerStates.STRING_UNICODE_DIGIT_4: {
          if (
            (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) ||
            (n >= charset.LATIN_CAPITAL_LETTER_A &&
              n <= charset.LATIN_CAPITAL_LETTER_F) ||
            (n >= charset.LATIN_SMALL_LETTER_A &&
              n <= charset.LATIN_SMALL_LETTER_F)
          ) {
            const intVal = parseInt(this.unicode + String.fromCharCode(n), 16);

            if (this.highSurrogate !== undefined) {
              if (intVal >= 0xdc00 && intVal <= 0xdfff) {
                //<56320,57343> - lowSurrogate completing the pending pair
                this.bufferedString.appendBuf(
                  this.encoder.encode(
                    String.fromCharCode(this.highSurrogate, intVal)
                  )
                );
                this.highSurrogate = undefined;
                this.state = TokenizerStates.STRING_DEFAULT;
                continue;
              }

              // The pending high surrogate is not followed by a low one:
              // emit it on its own, then handle the current code unit below
              // instead of discarding it.
              this.bufferedString.appendBuf(
                this.encoder.encode(String.fromCharCode(this.highSurrogate))
              );
              this.highSurrogate = undefined;
            }

            if (intVal >= 0xd800 && intVal <= 0xdbff) {
              //<55296,56319> - highSurrogate, kept until the next \u escape.
              // NOTE(review): it is only resolved by a later \uXXXX escape;
              // nothing flushes it when other characters or the closing
              // quote follow.
              this.highSurrogate = intVal;
            } else {
              this.bufferedString.appendBuf(
                this.encoder.encode(String.fromCharCode(intVal))
              );
            }
            this.state = TokenizerStates.STRING_DEFAULT;
            continue;
          }
          // Not a hex digit: invalid escape.
          break;
        }
        // Number
        case TokenizerStates.NUMBER_AFTER_INITIAL_MINUS:
          if (n === charset.DIGIT_ZERO) {
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_INITIAL_ZERO;
            continue;
          }

          if (n >= charset.DIGIT_ONE && n <= charset.DIGIT_NINE) {
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_INITIAL_NON_ZERO;
            continue;
          }

          break;
        case TokenizerStates.NUMBER_AFTER_INITIAL_ZERO:
          if (n === charset.FULL_STOP) {
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_FULL_STOP;
            continue;
          }

          if (
            n === charset.LATIN_SMALL_LETTER_E ||
            n === charset.LATIN_CAPITAL_LETTER_E
          ) {
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_E;
            continue;
          }

          // The number is complete; step back so the current byte is
          // re-examined in the START state.
          i -= 1;
          this.state = TokenizerStates.START;
          this.emitNumber();
          continue;
        case TokenizerStates.NUMBER_AFTER_INITIAL_NON_ZERO:
          if (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) {
            this.bufferedNumber.appendChar(n);
            continue;
          }

          if (n === charset.FULL_STOP) {
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_FULL_STOP;
            continue;
          }

          if (
            n === charset.LATIN_SMALL_LETTER_E ||
            n === charset.LATIN_CAPITAL_LETTER_E
          ) {
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_E;
            continue;
          }

          // Number complete; re-examine the current byte in START.
          i -= 1;
          this.state = TokenizerStates.START;
          this.emitNumber();
          continue;
        case TokenizerStates.NUMBER_AFTER_FULL_STOP:
          // At least one digit is required after the decimal point.
          if (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) {
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_DECIMAL;
            continue;
          }

          break;
        case TokenizerStates.NUMBER_AFTER_DECIMAL:
          if (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) {
            this.bufferedNumber.appendChar(n);
            continue;
          }

          if (
            n === charset.LATIN_SMALL_LETTER_E ||
            n === charset.LATIN_CAPITAL_LETTER_E
          ) {
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_E;
            continue;
          }

          // Number complete; re-examine the current byte in START.
          i -= 1;
          this.state = TokenizerStates.START;
          this.emitNumber();
          continue;
        case TokenizerStates.NUMBER_AFTER_E:
          if (n === charset.PLUS_SIGN || n === charset.HYPHEN_MINUS) {
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_E_AND_SIGN;
            continue;
          }
        // Allow cascading: the exponent sign is optional, so a digit right
        // after "e"/"E" is handled by the next case.
        case TokenizerStates.NUMBER_AFTER_E_AND_SIGN:
          if (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) {
            this.bufferedNumber.appendChar(n);
            this.state = TokenizerStates.NUMBER_AFTER_E_AND_DIGIT;
            continue;
          }

          break;
        case TokenizerStates.NUMBER_AFTER_E_AND_DIGIT:
          if (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) {
            this.bufferedNumber.appendChar(n);
            continue;
          }

          // Number complete; re-examine the current byte in START.
          i -= 1;
          this.state = TokenizerStates.START;
          this.emitNumber();
          continue;
        // TRUE
        case TokenizerStates.TRUE1:
          if (n === charset.LATIN_SMALL_LETTER_R) {
            this.state = TokenizerStates.TRUE2;
            continue;
          }
          break;
        case TokenizerStates.TRUE2:
          if (n === charset.LATIN_SMALL_LETTER_U) {
            this.state = TokenizerStates.TRUE3;
            continue;
          }
          break;
        case TokenizerStates.TRUE3:
          if (n === charset.LATIN_SMALL_LETTER_E) {
            this.state = TokenizerStates.START;
            this.onToken(TRUE, true, this.offset);
            this.offset += 3;
            continue;
          }
          break;
        // FALSE
        case TokenizerStates.FALSE1:
          if (n === charset.LATIN_SMALL_LETTER_A) {
            this.state = TokenizerStates.FALSE2;
            continue;
          }
          break;
        case TokenizerStates.FALSE2:
          if (n === charset.LATIN_SMALL_LETTER_L) {
            this.state = TokenizerStates.FALSE3;
            continue;
          }
          break;
        case TokenizerStates.FALSE3:
          if (n === charset.LATIN_SMALL_LETTER_S) {
            this.state = TokenizerStates.FALSE4;
            continue;
          }
          break;
        case TokenizerStates.FALSE4:
          if (n === charset.LATIN_SMALL_LETTER_E) {
            this.state = TokenizerStates.START;
            this.onToken(FALSE, false, this.offset);
            this.offset += 4;
            continue;
          }
          break;
        // NULL
        case TokenizerStates.NULL1:
          if (n === charset.LATIN_SMALL_LETTER_U) {
            this.state = TokenizerStates.NULL2;
            continue;
          }
          break;
        case TokenizerStates.NULL2:
          if (n === charset.LATIN_SMALL_LETTER_L) {
            this.state = TokenizerStates.NULL3;
            continue;
          }
          break;
        case TokenizerStates.NULL3:
          if (n === charset.LATIN_SMALL_LETTER_L) {
            this.state = TokenizerStates.START;
            this.onToken(NULL, null, this.offset);
            this.offset += 3;
            continue;
          }
          break;
        case TokenizerStates.SEPARATOR:
          // Matching the second and later bytes of a multi-byte separator.
          this.separatorIndex += 1;
          if (
            !this.separatorBytes ||
            n !== this.separatorBytes[this.separatorIndex]
          ) {
            break;
          }
          if (this.separatorIndex === this.separatorBytes.length - 1) {
            this.state = TokenizerStates.START;
            this.onToken(
              TokenType.SEPARATOR,
              this.separator as string,
              this.offset + this.separatorIndex
            );
            this.separatorIndex = 0;
          }
          continue;
        case TokenizerStates.ENDED:
          // After `end`, only trailing whitespace is tolerated.
          if (
            n === charset.SPACE ||
            n === charset.NEWLINE ||
            n === charset.CARRIAGE_RETURN ||
            n === charset.TAB
          ) {
            // whitespace
            continue;
          }
      }

      this.error(
        new TokenizerError(
          `Unexpected "${String.fromCharCode(n)}" at position "${i}" in state ${
            TokenizerStates[this.state]
          }`
        )
      );
      return;
    }
  }

  /**
   * Emits the accumulated number as a NUMBER token (converted through
   * `parseNumber`) and then advances the offset by the number's length
   * minus one.
   */
  private emitNumber(): void {
    this.onToken(
      NUMBER,
      this.parseNumber(this.bufferedNumber.toString()),
      this.offset
    );
    this.offset += this.bufferedNumber.byteLength - 1;
  }

  /**
   * Converts the textual form of a number token to its value. Protected so
   * subclasses can substitute their own conversion.
   *
   * @param numberStr The number exactly as accumulated from the input.
   * @returns The result of `Number(numberStr)`.
   */
  protected parseNumber(numberStr: string): number {
    return Number(numberStr);
  }

  /**
   * Reports an error: moves the tokenizer to the ERROR state (unless it has
   * already ended) and forwards the error to `onError`.
   *
   * @param err The error to report.
   */
  public error(err: Error): void {
    if (this.state !== TokenizerStates.ENDED) {
      this.state = TokenizerStates.ERROR;
    }

    this.onError(err);
  }

  /**
   * Signals that no more input will be written.
   *
   * If a number is pending in a state where it is already complete, it is
   * emitted first. From the START, ERROR and SEPARATOR states the tokenizer
   * simply ends. In both cases the state becomes ENDED and `onEnd` is called.
   * Ending in any other state is reported through `error` as a
   * `TokenizerError`.
   */
  public end(): void {
    switch (this.state) {
      case TokenizerStates.NUMBER_AFTER_INITIAL_ZERO:
      case TokenizerStates.NUMBER_AFTER_INITIAL_NON_ZERO:
      case TokenizerStates.NUMBER_AFTER_DECIMAL:
      case TokenizerStates.NUMBER_AFTER_E_AND_DIGIT:
        // Numbers have no terminator, so end-of-input completes them.
        this.state = TokenizerStates.ENDED;
        this.emitNumber();
        this.onEnd();
        break;
      case TokenizerStates.START:
      case TokenizerStates.ERROR:
      case TokenizerStates.SEPARATOR:
        this.state = TokenizerStates.ENDED;
        this.onEnd();
        break;
      default:
        this.error(
          new TokenizerError(
            `Tokenizer ended in the middle of a token (state: ${
              TokenizerStates[this.state]
            }). Either not all the data was received or the data was invalid.`
          )
        );
    }
  }

  // The overloads tie each token type to the type of value it carries.
  public onToken(token: TokenType.LEFT_BRACE, value: "{", offset: number): void;
  public onToken(
    token: TokenType.RIGHT_BRACE,
    value: "}",
    offset: number
  ): void;
  public onToken(
    token: TokenType.LEFT_BRACKET,
    value: "[",
    offset: number
  ): void;
  public onToken(
    token: TokenType.RIGHT_BRACKET,
    value: "]",
    offset: number
  ): void;
  public onToken(token: TokenType.COLON, value: ":", offset: number): void;
  public onToken(token: TokenType.COMMA, value: ",", offset: number): void;
  public onToken(token: TokenType.TRUE, value: true, offset: number): void;
  public onToken(token: TokenType.FALSE, value: false, offset: number): void;
  public onToken(token: TokenType.NULL, value: null, offset: number): void;
  public onToken(token: TokenType.STRING, value: string, offset: number): void;
  public onToken(token: TokenType.NUMBER, value: number, offset: number): void;
  public onToken(
    token: TokenType.SEPARATOR,
    value: string,
    offset: number
  ): void;
  /**
   * Token callback, intended to be overridden. The default implementation
   * throws, so tokenizing without a replacement fails on the first token.
   *
   * @param token Kind of token that was recognised.
   * @param value Value carried by the token.
   * @param offset Value of the tokenizer's offset counter for this token.
   * @throws TokenizerError Always, unless overridden.
   */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  public onToken(token: TokenType, value: JsonPrimitive, offset: number): void {
    // Override me
    throw new TokenizerError(
      'Can\'t emit tokens before the "onToken" callback has been set up.'
    );
  }

  /**
   * Error callback, intended to be overridden. The default implementation
   * rethrows the error to the caller of `write` / `end` / `error`.
   *
   * @param err The error being reported.
   */
  public onError(err: Error): void {
    // Override me
    throw err;
  }

  /**
   * Completion callback, intended to be overridden. Called by `end` once the
   * tokenizer has moved to the ENDED state. Does nothing by default.
   */
  public onEnd(): void {
    // Override me
  }
}