447 lines
13 KiB
JavaScript
447 lines
13 KiB
JavaScript
|
|
import { decode as _decode } from '../decode.js'
|
||
|
|
import { Token, Type } from '../token.js'
|
||
|
|
import { decodeCodePointsArray } from '../byte-utils.js'
|
||
|
|
import { decodeErrPrefix } from '../common.js'
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @typedef {import('../../interface').DecodeOptions} DecodeOptions
|
||
|
|
* @typedef {import('../../interface').DecodeTokenizer} DecodeTokenizer
|
||
|
|
*/
|
||
|
|
|
||
|
|
/**
 * Tokenizer that walks a byte array containing JSON text and hands back one
 * `Token` per call to `next()`. Containers are reported as an opening token
 * (`Type.map` / `Type.array`, both with a value of `Infinity`) followed later
 * by a `Type.break` token when the closing `}` / `]` is reached.
 *
 * Parsing context is tracked in `modeStack`; the mode on top of the stack
 * decides what `next()` expects to find at the current position.
 *
 * @implements {DecodeTokenizer}
 */
class Tokenizer {
  /**
   * @param {Uint8Array} data bytes to tokenize
   * @param {DecodeOptions} options only `allowBigInt` is read by this class
   */
  constructor (data, options = {}) {
    this.pos = 0
    this.data = data
    this.options = options
    /** @type {string[]} */
    this.modeStack = ['value']
    this.lastToken = ''
  }

  /**
   * Whether the read position has reached (or passed) the end of the input.
   *
   * @returns {boolean}
   */
  done () {
    return this.pos >= this.data.length
  }

  /**
   * The byte at the current read position (`undefined` once past the end).
   *
   * @returns {number}
   */
  ch () {
    return this.data[this.pos]
  }

  /**
   * The mode on top of the mode stack (`undefined` when the stack is empty).
   *
   * @returns {string}
   */
  currentMode () {
    return this.modeStack[this.modeStack.length - 1]
  }

  /**
   * Advance past any run of space, tab, carriage-return and line-feed bytes.
   */
  skipWhitespace () {
    let c = this.ch()
    // @ts-ignore
    while (c === 32 /* ' ' */ || c === 9 /* '\t' */ || c === 13 /* '\r' */ || c === 10 /* '\n' */) {
      c = this.data[++this.pos]
    }
  }

  /**
   * Consume an exact byte sequence from the current position.
   *
   * @param {number[]} str the byte values that must appear next
   * @throws {Error} when fewer than `str.length` bytes remain, or when a byte
   * differs from the expected one
   */
  expect (str) {
    if (this.data.length - this.pos < str.length) {
      throw new Error(`${decodeErrPrefix} unexpected end of input at position ${this.pos}`)
    }
    for (let i = 0; i < str.length; i++) {
      // the position is advanced as part of the comparison, so the position
      // reported on a mismatch is one past the offending byte
      if (this.data[this.pos++] !== str[i]) {
        throw new Error(`${decodeErrPrefix} unexpected token at position ${this.pos}, expected to find '${String.fromCharCode(...str)}'`)
      }
    }
  }

  /**
   * Parse a number starting at the current position.
   *
   * Anything containing a fraction or an exponent yields a `Type.float`
   * token. Other values yield `Type.uint` (>= 0) or `Type.negint` (< 0); when
   * `options.allowBigInt` is `true` and the value is not a safe integer the
   * token value is a `BigInt` built from the source text.
   *
   * @returns {Token}
   * @throws {Error} on a lone '-' or on a second '.' within the number
   */
  parseNumber () {
    const startPos = this.pos
    let negative = false
    let float = false

    // advance past a run of ASCII digits, '0' (48) to '9' (57)
    const swallowDigits = () => {
      while (!this.done()) {
        const c = this.ch()
        if (c < 48 || c > 57) {
          break
        }
        this.pos++
      }
    }

    // lead
    if (this.ch() === 45) { // '-'
      negative = true
      this.pos++
    }
    if (this.ch() === 48) { // '0'
      this.pos++
      const follow = this.ch()
      if (follow === 46) { // '.'
        this.pos++
        float = true
      } else if (follow !== 101 && follow !== 69) { // not '[eE]'
        // a bare zero; nothing else may follow a leading '0'
        return new Token(Type.uint, 0, this.pos - startPos)
      }
      // otherwise a zero with an exponent ("0e5"): fall through so the
      // exponent is consumed below rather than left behind in the input
    }
    swallowDigits()
    if (negative && this.pos === startPos + 1) {
      // a '-' with no digits after it
      throw new Error(`${decodeErrPrefix} unexpected token at position ${this.pos}`)
    }
    if (!this.done() && this.ch() === 46) { // '.'
      if (float) {
        throw new Error(`${decodeErrPrefix} unexpected token at position ${this.pos}`)
      }
      float = true
      this.pos++
      swallowDigits()
    }
    if (!this.done() && (this.ch() === 101 || this.ch() === 69)) { // '[eE]'
      float = true
      this.pos++
      if (!this.done() && (this.ch() === 43 || this.ch() === 45)) { // '+', '-'
        this.pos++
      }
      swallowDigits()
    }
    // @ts-ignore
    const numStr = String.fromCharCode.apply(null, this.data.subarray(startPos, this.pos))
    const num = parseFloat(numStr)
    if (float) {
      return new Token(Type.float, num, this.pos - startPos)
    }
    if (this.options.allowBigInt !== true || Number.isSafeInteger(num)) {
      return new Token(num >= 0 ? Type.uint : Type.negint, num, this.pos - startPos)
    }
    return new Token(num >= 0 ? Type.uint : Type.negint, BigInt(numStr), this.pos - startPos)
  }

  /**
   * Parse a double-quoted string starting at the current position, which must
   * be on the opening '"'.
   *
   * Strings made only of printable ASCII with no escapes are converted in one
   * step. Anything else is read unit by unit: backslash escapes (including
   * `\uXXXX`) are resolved and multi-byte UTF-8 sequences are converted to
   * UTF-16 code units, with U+FFFD substituted for malformed sequences.
   *
   * @returns {Token}
   * @throws {Error} on an unterminated string, an unknown escape, a malformed
   * or truncated `\u` escape, a truncated UTF-8 sequence, or a raw control
   * character (byte < 32)
   */
  parseString () {
    /* c8 ignore next 4 */
    if (this.ch() !== 34) { // '"'
      // this would be a programming error
      throw new Error(`${decodeErrPrefix} unexpected character at position ${this.pos}; this shouldn't happen`)
    }
    this.pos++

    // check for simple fast-path, all printable ascii, no escapes
    // >0x10000 elements may fail fn.apply() (http://stackoverflow.com/a/22747272/680742)
    for (let i = this.pos, l = 0; i < this.data.length && l < 0x10000; i++, l++) {
      const ch = this.data[i]
      if (ch === 92 || ch < 32 || ch >= 128) { // '\', ' ', control-chars or non-trivial
        break
      }
      if (ch === 34) { // '"'
        // @ts-ignore
        const str = String.fromCharCode.apply(null, this.data.subarray(this.pos, i))
        this.pos = i + 1
        return new Token(Type.string, str, l)
      }
    }

    const startPos = this.pos
    // UTF-16 code units collected for the decoded string
    const chars = []

    // read the four hex digits of a `\uXXXX` escape and return their value
    const readu4 = () => {
      if (this.pos + 4 >= this.data.length) {
        throw new Error(`${decodeErrPrefix} unexpected end of unicode escape sequence at position ${this.pos}`)
      }
      let u4 = 0
      for (let i = 0; i < 4; i++) {
        let ch = this.ch()
        if (ch >= 48 && ch <= 57) { // '0' && '9'
          ch -= 48
        } else if (ch >= 97 && ch <= 102) { // 'a' && 'f'
          ch = ch - 97 + 10
        } else if (ch >= 65 && ch <= 70) { // 'A' && 'F'
          ch = ch - 65 + 10
        } else {
          throw new Error(`${decodeErrPrefix} unexpected unicode escape character at position ${this.pos}`)
        }
        u4 = u4 * 16 + ch
        this.pos++
      }
      return u4
    }

    // decode one UTF-8 sequence at the current position, push the resulting
    // code unit(s) onto `chars` and advance past the bytes consumed
    // mostly taken from feross/buffer and adjusted to fit
    const readUtf8Char = () => {
      const firstByte = this.ch()
      let codePoint = null
      /* c8 ignore next 1 */
      let bytesPerSequence = (firstByte > 0xef) ? 4 : (firstByte > 0xdf) ? 3 : (firstByte > 0xbf) ? 2 : 1

      if (this.pos + bytesPerSequence > this.data.length) {
        throw new Error(`${decodeErrPrefix} unexpected unicode sequence at position ${this.pos}`)
      }

      let secondByte, thirdByte, fourthByte, tempCodePoint

      switch (bytesPerSequence) {
        /* c8 ignore next 6 */
        // this case is dealt with by the caller function
        case 1:
          if (firstByte < 0x80) {
            codePoint = firstByte
          }
          break
        case 2:
          secondByte = this.data[this.pos + 1]
          if ((secondByte & 0xc0) === 0x80) {
            tempCodePoint = (firstByte & 0x1f) << 0x6 | (secondByte & 0x3f)
            if (tempCodePoint > 0x7f) {
              codePoint = tempCodePoint
            }
          }
          break
        case 3:
          secondByte = this.data[this.pos + 1]
          thirdByte = this.data[this.pos + 2]
          if ((secondByte & 0xc0) === 0x80 && (thirdByte & 0xc0) === 0x80) {
            tempCodePoint = (firstByte & 0xf) << 0xc | (secondByte & 0x3f) << 0x6 | (thirdByte & 0x3f)
            /* c8 ignore next 3 */
            if (tempCodePoint > 0x7ff && (tempCodePoint < 0xd800 || tempCodePoint > 0xdfff)) {
              codePoint = tempCodePoint
            }
          }
          break
        case 4:
          secondByte = this.data[this.pos + 1]
          thirdByte = this.data[this.pos + 2]
          fourthByte = this.data[this.pos + 3]
          if ((secondByte & 0xc0) === 0x80 && (thirdByte & 0xc0) === 0x80 && (fourthByte & 0xc0) === 0x80) {
            tempCodePoint = (firstByte & 0xf) << 0x12 | (secondByte & 0x3f) << 0xc | (thirdByte & 0x3f) << 0x6 | (fourthByte & 0x3f)
            if (tempCodePoint > 0xffff && tempCodePoint < 0x110000) {
              codePoint = tempCodePoint
            }
          }
      }

      /* c8 ignore next 5 */
      if (codePoint === null) {
        // we did not generate a valid codePoint so insert a
        // replacement char (U+FFFD) and advance only 1 byte
        codePoint = 0xfffd
        bytesPerSequence = 1
      } else if (codePoint > 0xffff) {
        // encode to utf16 (surrogate pair dance)
        codePoint -= 0x10000
        chars.push(codePoint >>> 10 & 0x3ff | 0xd800)
        codePoint = 0xdc00 | codePoint & 0x3ff
      }

      chars.push(codePoint)
      this.pos += bytesPerSequence
    }

    // TODO: could take the approach of a quick first scan for special chars like encoding/json/decode.go#unquoteBytes
    // and converting all of the ascii chars from the base array in bulk
    while (!this.done()) {
      const ch = this.ch()
      let ch1
      switch (ch) {
        case 92: // '\'
          this.pos++
          if (this.done()) {
            throw new Error(`${decodeErrPrefix} unexpected string termination at position ${this.pos}`)
          }
          ch1 = this.ch()
          this.pos++
          switch (ch1) {
            case 34: // '"'
            case 39: // '\''
            case 92: // '\'
            case 47: // '/'
              chars.push(ch1)
              break
            case 98: // 'b'
              chars.push(8)
              break
            case 116: // 't'
              chars.push(9)
              break
            case 110: // 'n'
              chars.push(10)
              break
            case 102: // 'f'
              chars.push(12)
              break
            case 114: // 'r'
              chars.push(13)
              break
            case 117: // 'u'
              chars.push(readu4())
              break
            default:
              throw new Error(`${decodeErrPrefix} unexpected string escape character at position ${this.pos}`)
          }
          break
        case 34: // '"'
          this.pos++
          return new Token(Type.string, decodeCodePointsArray(chars), this.pos - startPos)
        default:
          if (ch < 32) { // ' '
            throw new Error(`${decodeErrPrefix} invalid control character at position ${this.pos}`)
          } else if (ch < 0x80) {
            chars.push(ch)
            this.pos++
          } else {
            readUtf8Char()
          }
      }
    }

    throw new Error(`${decodeErrPrefix} unexpected end of string at position ${this.pos}`)
  }

  /**
   * Parse whatever value starts at the current position, chosen by its first
   * byte. Opening a map or array pushes the matching `*-start` mode so that
   * later `next()` calls read its contents.
   *
   * @returns {Token}
   * @throws {Error} when the byte cannot start a value
   */
  parseValue () {
    switch (this.ch()) {
      case 123: // '{'
        this.modeStack.push('obj-start')
        this.pos++
        return new Token(Type.map, Infinity, 1)
      case 91: // '['
        this.modeStack.push('array-start')
        this.pos++
        return new Token(Type.array, Infinity, 1)
      case 34: { // '"'
        return this.parseString()
      }
      case 110: // 'n' / null
        this.expect([110, 117, 108, 108]) // 'null'
        return new Token(Type.null, null, 4)
      case 102: // 'f' / // false
        this.expect([102, 97, 108, 115, 101]) // 'false'
        return new Token(Type.false, false, 5)
      case 116: // 't' / // true
        this.expect([116, 114, 117, 101]) // 'true'
        return new Token(Type.true, true, 4)
      case 45: // '-'
      case 48: // '0'
      case 49: // '1'
      case 50: // '2'
      case 51: // '3'
      case 52: // '4'
      case 53: // '5'
      case 54: // '6'
      case 55: // '7'
      case 56: // '8'
      case 57: // '9'
        return this.parseNumber()
      default:
        throw new Error(`${decodeErrPrefix} unexpected character at position ${this.pos}`)
    }
  }

  /**
   * Produce the next token. Leading whitespace is skipped, then the mode on
   * top of the stack decides what is read: a value, an array element or
   * closing ']', an object key or closing '}', or the value for a key.
   *
   * @returns {Token}
   * @throws {Error} when the expected delimiter is missing or the mode is not
   * one this method handles
   */
  next () {
    this.skipWhitespace()
    switch (this.currentMode()) {
      case 'value':
        this.modeStack.pop()
        return this.parseValue()
      case 'array-value': {
        // after at least one element: expect ']' or ',' followed by a value
        this.modeStack.pop()
        if (this.ch() === 93) { // ']'
          this.pos++
          this.skipWhitespace()
          return new Token(Type.break, undefined, 1)
        }
        if (this.ch() !== 44) { // ','
          throw new Error(`${decodeErrPrefix} unexpected character at position ${this.pos}, was expecting array delimiter but found '${String.fromCharCode(this.ch())}'`)
        }
        this.pos++
        this.modeStack.push('array-value')
        this.skipWhitespace()
        return this.parseValue()
      }
      case 'array-start': {
        // directly after '[': expect ']' or the first value
        this.modeStack.pop()
        if (this.ch() === 93) { // ']'
          this.pos++
          this.skipWhitespace()
          return new Token(Type.break, undefined, 1)
        }
        this.modeStack.push('array-value')
        this.skipWhitespace()
        return this.parseValue()
      }
      // @ts-ignore
      case 'obj-key':
        // after a key/value pair: expect '}' or ',' followed by another key
        if (this.ch() === 125) { // '}'
          this.modeStack.pop()
          this.pos++
          this.skipWhitespace()
          return new Token(Type.break, undefined, 1)
        }
        if (this.ch() !== 44) { // ','
          throw new Error(`${decodeErrPrefix} unexpected character at position ${this.pos}, was expecting object delimiter but found '${String.fromCharCode(this.ch())}'`)
        }
        this.pos++
        this.skipWhitespace()
      case 'obj-start': { // eslint-disable-line no-fallthrough
        // directly after '{' (or after a ',' via the fallthrough above):
        // expect '}' or a string key followed by ':'
        this.modeStack.pop()
        if (this.ch() === 125) { // '}'
          this.pos++
          this.skipWhitespace()
          return new Token(Type.break, undefined, 1)
        }
        const token = this.parseString()
        this.skipWhitespace()
        if (this.ch() !== 58) { // ':'
          throw new Error(`${decodeErrPrefix} unexpected character at position ${this.pos}, was expecting key/value delimiter ':' but found '${String.fromCharCode(this.ch())}'`)
        }
        this.pos++
        this.modeStack.push('obj-value')
        return token
      }
      case 'obj-value': {
        // after ':': read the value, then go back to expecting ',' or '}'
        this.modeStack.pop()
        this.modeStack.push('obj-key')
        this.skipWhitespace()
        return this.parseValue()
      }
      /* c8 ignore next 2 */
      default:
        throw new Error(`${decodeErrPrefix} unexpected parse state at position ${this.pos}; this shouldn't happen`)
    }
  }
}
|
||
|
|
|
||
|
|
/**
 * Decode `data` by handing it to the imported `_decode` with a tokenizer set
 * in the options. A caller-supplied `options.tokenizer` is used as-is;
 * otherwise a `Tokenizer` is created over `data`. The caller's `options`
 * object is not modified.
 *
 * @param {Uint8Array} data
 * @param {DecodeOptions} [options]
 * @returns {any}
 */
function decode (data, options) {
  // only build a tokenizer when the caller has not brought their own, rather
  // than constructing one that is immediately discarded
  const tokenizer = (options && options.tokenizer) || new Tokenizer(data, options)
  return _decode(data, Object.assign({}, options, { tokenizer }))
}
|
||
|
|
|
||
|
|
export { decode, Tokenizer }
|