technopark-scraper/node_modules/@streamparser/json/dist/deno/tokenizer.ts

679 lines
20 KiB
TypeScript

import { charset, escapedSequences } from './utils/utf-8.ts';
import {
StringBuilder,
NonBufferedString,
BufferedString,
} from './utils/bufferedString.ts';
import { TokenType } from './utils/constants.ts';
import { JsonPrimitive } from './utils/types.ts';
const {
LEFT_BRACE,
RIGHT_BRACE,
LEFT_BRACKET,
RIGHT_BRACKET,
COLON,
COMMA,
TRUE,
FALSE,
NULL,
STRING,
NUMBER,
} = TokenType;
// Tokenizer States
enum TokenizerStates {
START,
ENDED,
ERROR,
TRUE1,
TRUE2,
TRUE3,
FALSE1,
FALSE2,
FALSE3,
FALSE4,
NULL1,
NULL2,
NULL3,
STRING_DEFAULT,
STRING_AFTER_BACKSLASH,
STRING_UNICODE_DIGIT_1,
STRING_UNICODE_DIGIT_2,
STRING_UNICODE_DIGIT_3,
STRING_UNICODE_DIGIT_4,
STRING_INCOMPLETE_CHAR,
NUMBER_AFTER_INITIAL_MINUS,
NUMBER_AFTER_INITIAL_ZERO,
NUMBER_AFTER_INITIAL_NON_ZERO,
NUMBER_AFTER_FULL_STOP,
NUMBER_AFTER_DECIMAL,
NUMBER_AFTER_E,
NUMBER_AFTER_E_AND_SIGN,
NUMBER_AFTER_E_AND_DIGIT,
SEPARATOR,
}
export interface TokenizerOptions {
stringBufferSize?: number;
numberBufferSize?: number;
separator?: string;
}
const defaultOpts: TokenizerOptions = {
stringBufferSize: 0,
numberBufferSize: 0,
separator: undefined,
};
export class TokenizerError extends Error {
constructor(message: string) {
super(message);
// Typescript is broken. This is a workaround
Object.setPrototypeOf(this, TokenizerError.prototype);
}
}
export default class Tokenizer {
private state = TokenizerStates.START;
private separator?: string;
private separatorBytes?: Uint8Array;
private separatorIndex = 0;
private bufferedString: StringBuilder;
private bufferedNumber: StringBuilder;
private unicode: string | undefined = undefined; // unicode escapes
private highSurrogate: number | undefined = undefined;
private bytes_remaining = 0; // number of bytes remaining in multi byte utf8 char to read after split boundary
private bytes_in_sequence = 0; // bytes in multi byte utf8 char to read
private char_split_buffer = new Uint8Array(4); // for rebuilding chars split before boundary is reached
private encoder = new TextEncoder();
private offset = -1;
constructor(opts?: TokenizerOptions) {
opts = { ...defaultOpts, ...opts };
this.bufferedString =
opts.stringBufferSize && opts.stringBufferSize > 4
? new BufferedString(opts.stringBufferSize)
: new NonBufferedString();
this.bufferedNumber =
opts.numberBufferSize && opts.numberBufferSize > 0
? new BufferedString(opts.numberBufferSize)
: new NonBufferedString();
this.separator = opts.separator;
this.separatorBytes = opts.separator
? this.encoder.encode(opts.separator)
: undefined;
}
public get isEnded(): boolean {
return this.state === TokenizerStates.ENDED;
}
public write(input: Iterable<number> | string): void {
let buffer: Uint8Array;
if (input instanceof Uint8Array) {
buffer = input;
} else if (typeof input === "string") {
buffer = this.encoder.encode(input);
} else if (
(typeof input === "object" && "buffer" in input) ||
Array.isArray(input)
) {
buffer = Uint8Array.from(input);
} else {
this.error(
new TypeError(
"Unexpected type. The `write` function only accepts Arrays, TypedArrays and Strings."
)
);
return;
}
for (let i = 0; i < buffer.length; i += 1) {
const n = buffer[i]; // get current byte from buffer
switch (this.state) {
case TokenizerStates.START:
this.offset += 1;
if (this.separatorBytes && n === this.separatorBytes[0]) {
if (this.separatorBytes.length === 1) {
this.state = TokenizerStates.START;
this.onToken(
TokenType.SEPARATOR,
this.separator as string,
this.offset + this.separatorBytes.length - 1
);
continue;
}
this.state = TokenizerStates.SEPARATOR;
continue;
}
if (
n === charset.SPACE ||
n === charset.NEWLINE ||
n === charset.CARRIAGE_RETURN ||
n === charset.TAB
) {
// whitespace
continue;
}
if (n === charset.LEFT_CURLY_BRACKET) {
this.onToken(LEFT_BRACE, "{", this.offset);
continue;
}
if (n === charset.RIGHT_CURLY_BRACKET) {
this.onToken(RIGHT_BRACE, "}", this.offset);
continue;
}
if (n === charset.LEFT_SQUARE_BRACKET) {
this.onToken(LEFT_BRACKET, "[", this.offset);
continue;
}
if (n === charset.RIGHT_SQUARE_BRACKET) {
this.onToken(RIGHT_BRACKET, "]", this.offset);
continue;
}
if (n === charset.COLON) {
this.onToken(COLON, ":", this.offset);
continue;
}
if (n === charset.COMMA) {
this.onToken(COMMA, ",", this.offset);
continue;
}
if (n === charset.LATIN_SMALL_LETTER_T) {
this.state = TokenizerStates.TRUE1;
continue;
}
if (n === charset.LATIN_SMALL_LETTER_F) {
this.state = TokenizerStates.FALSE1;
continue;
}
if (n === charset.LATIN_SMALL_LETTER_N) {
this.state = TokenizerStates.NULL1;
continue;
}
if (n === charset.QUOTATION_MARK) {
this.bufferedString.reset();
this.state = TokenizerStates.STRING_DEFAULT;
continue;
}
if (n >= charset.DIGIT_ONE && n <= charset.DIGIT_NINE) {
this.bufferedNumber.reset();
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_INITIAL_NON_ZERO;
continue;
}
if (n === charset.DIGIT_ZERO) {
this.bufferedNumber.reset();
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_INITIAL_ZERO;
continue;
}
if (n === charset.HYPHEN_MINUS) {
this.bufferedNumber.reset();
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_INITIAL_MINUS;
continue;
}
break;
// STRING
case TokenizerStates.STRING_DEFAULT:
if (n === charset.QUOTATION_MARK) {
const string = this.bufferedString.toString();
this.state = TokenizerStates.START;
this.onToken(STRING, string, this.offset);
this.offset += this.bufferedString.byteLength + 1;
continue;
}
if (n === charset.REVERSE_SOLIDUS) {
this.state = TokenizerStates.STRING_AFTER_BACKSLASH;
continue;
}
if (n >= 128) {
// Parse multi byte (>=128) chars one at a time
if (n >= 194 && n <= 223) {
this.bytes_in_sequence = 2;
} else if (n <= 239) {
this.bytes_in_sequence = 3;
} else {
this.bytes_in_sequence = 4;
}
if (this.bytes_in_sequence <= buffer.length - i) {
// if bytes needed to complete char fall outside buffer length, we have a boundary split
this.bufferedString.appendBuf(
buffer,
i,
i + this.bytes_in_sequence
);
i += this.bytes_in_sequence - 1;
continue;
}
this.bytes_remaining = i + this.bytes_in_sequence - buffer.length;
this.char_split_buffer.set(buffer.subarray(i));
i = buffer.length - 1;
this.state = TokenizerStates.STRING_INCOMPLETE_CHAR;
continue;
}
if (n >= charset.SPACE) {
this.bufferedString.appendChar(n);
continue;
}
break;
case TokenizerStates.STRING_INCOMPLETE_CHAR:
// check for carry over of a multi byte char split between data chunks
// & fill temp buffer it with start of this data chunk up to the boundary limit set in the last iteration
this.char_split_buffer.set(
buffer.subarray(i, i + this.bytes_remaining),
this.bytes_in_sequence - this.bytes_remaining
);
this.bufferedString.appendBuf(
this.char_split_buffer,
0,
this.bytes_in_sequence
);
i = this.bytes_remaining - 1;
this.state = TokenizerStates.STRING_DEFAULT;
continue;
case TokenizerStates.STRING_AFTER_BACKSLASH:
const controlChar = escapedSequences[n];
if (controlChar) {
this.bufferedString.appendChar(controlChar);
this.state = TokenizerStates.STRING_DEFAULT;
continue;
}
if (n === charset.LATIN_SMALL_LETTER_U) {
this.unicode = "";
this.state = TokenizerStates.STRING_UNICODE_DIGIT_1;
continue;
}
break;
case TokenizerStates.STRING_UNICODE_DIGIT_1:
case TokenizerStates.STRING_UNICODE_DIGIT_2:
case TokenizerStates.STRING_UNICODE_DIGIT_3:
if (
(n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) ||
(n >= charset.LATIN_CAPITAL_LETTER_A &&
n <= charset.LATIN_CAPITAL_LETTER_F) ||
(n >= charset.LATIN_SMALL_LETTER_A &&
n <= charset.LATIN_SMALL_LETTER_F)
) {
this.unicode += String.fromCharCode(n);
this.state += 1;
continue;
}
break;
case TokenizerStates.STRING_UNICODE_DIGIT_4:
if (
(n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) ||
(n >= charset.LATIN_CAPITAL_LETTER_A &&
n <= charset.LATIN_CAPITAL_LETTER_F) ||
(n >= charset.LATIN_SMALL_LETTER_A &&
n <= charset.LATIN_SMALL_LETTER_F)
) {
const intVal = parseInt(this.unicode + String.fromCharCode(n), 16);
if (this.highSurrogate === undefined) {
if (intVal >= 0xd800 && intVal <= 0xdbff) {
//<55296,56319> - highSurrogate
this.highSurrogate = intVal;
} else {
this.bufferedString.appendBuf(
this.encoder.encode(String.fromCharCode(intVal))
);
}
} else {
if (intVal >= 0xdc00 && intVal <= 0xdfff) {
//<56320,57343> - lowSurrogate
this.bufferedString.appendBuf(
this.encoder.encode(
String.fromCharCode(this.highSurrogate, intVal)
)
);
} else {
this.bufferedString.appendBuf(
this.encoder.encode(String.fromCharCode(this.highSurrogate))
);
}
this.highSurrogate = undefined;
}
this.state = TokenizerStates.STRING_DEFAULT;
continue;
}
// Number
case TokenizerStates.NUMBER_AFTER_INITIAL_MINUS:
if (n === charset.DIGIT_ZERO) {
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_INITIAL_ZERO;
continue;
}
if (n >= charset.DIGIT_ONE && n <= charset.DIGIT_NINE) {
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_INITIAL_NON_ZERO;
continue;
}
break;
case TokenizerStates.NUMBER_AFTER_INITIAL_ZERO:
if (n === charset.FULL_STOP) {
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_FULL_STOP;
continue;
}
if (
n === charset.LATIN_SMALL_LETTER_E ||
n === charset.LATIN_CAPITAL_LETTER_E
) {
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_E;
continue;
}
i -= 1;
this.state = TokenizerStates.START;
this.emitNumber();
continue;
case TokenizerStates.NUMBER_AFTER_INITIAL_NON_ZERO:
if (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) {
this.bufferedNumber.appendChar(n);
continue;
}
if (n === charset.FULL_STOP) {
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_FULL_STOP;
continue;
}
if (
n === charset.LATIN_SMALL_LETTER_E ||
n === charset.LATIN_CAPITAL_LETTER_E
) {
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_E;
continue;
}
i -= 1;
this.state = TokenizerStates.START;
this.emitNumber();
continue;
case TokenizerStates.NUMBER_AFTER_FULL_STOP:
if (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) {
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_DECIMAL;
continue;
}
break;
case TokenizerStates.NUMBER_AFTER_DECIMAL:
if (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) {
this.bufferedNumber.appendChar(n);
continue;
}
if (
n === charset.LATIN_SMALL_LETTER_E ||
n === charset.LATIN_CAPITAL_LETTER_E
) {
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_E;
continue;
}
i -= 1;
this.state = TokenizerStates.START;
this.emitNumber();
continue;
case TokenizerStates.NUMBER_AFTER_E:
if (n === charset.PLUS_SIGN || n === charset.HYPHEN_MINUS) {
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_E_AND_SIGN;
continue;
}
// Allow cascading
case TokenizerStates.NUMBER_AFTER_E_AND_SIGN:
if (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) {
this.bufferedNumber.appendChar(n);
this.state = TokenizerStates.NUMBER_AFTER_E_AND_DIGIT;
continue;
}
break;
case TokenizerStates.NUMBER_AFTER_E_AND_DIGIT:
if (n >= charset.DIGIT_ZERO && n <= charset.DIGIT_NINE) {
this.bufferedNumber.appendChar(n);
continue;
}
i -= 1;
this.state = TokenizerStates.START;
this.emitNumber();
continue;
// TRUE
case TokenizerStates.TRUE1:
if (n === charset.LATIN_SMALL_LETTER_R) {
this.state = TokenizerStates.TRUE2;
continue;
}
break;
case TokenizerStates.TRUE2:
if (n === charset.LATIN_SMALL_LETTER_U) {
this.state = TokenizerStates.TRUE3;
continue;
}
break;
case TokenizerStates.TRUE3:
if (n === charset.LATIN_SMALL_LETTER_E) {
this.state = TokenizerStates.START;
this.onToken(TRUE, true, this.offset);
this.offset += 3;
continue;
}
break;
// FALSE
case TokenizerStates.FALSE1:
if (n === charset.LATIN_SMALL_LETTER_A) {
this.state = TokenizerStates.FALSE2;
continue;
}
break;
case TokenizerStates.FALSE2:
if (n === charset.LATIN_SMALL_LETTER_L) {
this.state = TokenizerStates.FALSE3;
continue;
}
break;
case TokenizerStates.FALSE3:
if (n === charset.LATIN_SMALL_LETTER_S) {
this.state = TokenizerStates.FALSE4;
continue;
}
break;
case TokenizerStates.FALSE4:
if (n === charset.LATIN_SMALL_LETTER_E) {
this.state = TokenizerStates.START;
this.onToken(FALSE, false, this.offset);
this.offset += 4;
continue;
}
break;
// NULL
case TokenizerStates.NULL1:
if (n === charset.LATIN_SMALL_LETTER_U) {
this.state = TokenizerStates.NULL2;
continue;
}
break;
case TokenizerStates.NULL2:
if (n === charset.LATIN_SMALL_LETTER_L) {
this.state = TokenizerStates.NULL3;
continue;
}
break;
case TokenizerStates.NULL3:
if (n === charset.LATIN_SMALL_LETTER_L) {
this.state = TokenizerStates.START;
this.onToken(NULL, null, this.offset);
this.offset += 3;
continue;
}
break;
case TokenizerStates.SEPARATOR:
this.separatorIndex += 1;
if (
!this.separatorBytes ||
n !== this.separatorBytes[this.separatorIndex]
) {
break;
}
if (this.separatorIndex === this.separatorBytes.length - 1) {
this.state = TokenizerStates.START;
this.onToken(
TokenType.SEPARATOR,
this.separator as string,
this.offset + this.separatorIndex
);
this.separatorIndex = 0;
}
continue;
case TokenizerStates.ENDED:
if (
n === charset.SPACE ||
n === charset.NEWLINE ||
n === charset.CARRIAGE_RETURN ||
n === charset.TAB
) {
// whitespace
continue;
}
}
this.error(
new TokenizerError(
`Unexpected "${String.fromCharCode(n)}" at position "${i}" in state ${
TokenizerStates[this.state]
}`
)
);
return;
}
}
private emitNumber(): void {
this.onToken(
NUMBER,
this.parseNumber(this.bufferedNumber.toString()),
this.offset
);
this.offset += this.bufferedNumber.byteLength - 1;
}
protected parseNumber(numberStr: string): number {
return Number(numberStr);
}
public error(err: Error): void {
if (this.state !== TokenizerStates.ENDED) {
this.state = TokenizerStates.ERROR;
}
this.onError(err);
}
public end(): void {
switch (this.state) {
case TokenizerStates.NUMBER_AFTER_INITIAL_ZERO:
case TokenizerStates.NUMBER_AFTER_INITIAL_NON_ZERO:
case TokenizerStates.NUMBER_AFTER_DECIMAL:
case TokenizerStates.NUMBER_AFTER_E_AND_DIGIT:
this.state = TokenizerStates.ENDED;
this.emitNumber();
this.onEnd();
break;
case TokenizerStates.START:
case TokenizerStates.ERROR:
case TokenizerStates.SEPARATOR:
this.state = TokenizerStates.ENDED;
this.onEnd();
break;
default:
this.error(
new TokenizerError(
`Tokenizer ended in the middle of a token (state: ${
TokenizerStates[this.state]
}). Either not all the data was received or the data was invalid.`
)
);
}
}
public onToken(token: TokenType.LEFT_BRACE, value: "{", offset: number): void;
public onToken(
token: TokenType.RIGHT_BRACE,
value: "}",
offset: number
): void;
public onToken(
token: TokenType.LEFT_BRACKET,
value: "[",
offset: number
): void;
public onToken(
token: TokenType.RIGHT_BRACKET,
value: "]",
offset: number
): void;
public onToken(token: TokenType.COLON, value: ":", offset: number): void;
public onToken(token: TokenType.COMMA, value: ",", offset: number): void;
public onToken(token: TokenType.TRUE, value: true, offset: number): void;
public onToken(token: TokenType.FALSE, value: false, offset: number): void;
public onToken(token: TokenType.NULL, value: null, offset: number): void;
public onToken(token: TokenType.STRING, value: string, offset: number): void;
public onToken(token: TokenType.NUMBER, value: number, offset: number): void;
public onToken(
token: TokenType.SEPARATOR,
value: string,
offset: number
): void;
// eslint-disable-next-line @typescript-eslint/no-unused-vars
public onToken(token: TokenType, value: JsonPrimitive, offset: number): void {
// Override me
throw new TokenizerError(
'Can\'t emit tokens before the "onToken" callback has been set up.'
);
}
public onError(err: Error): void {
// Override me
throw err;
}
public onEnd(): void {
// Override me
}
}