// compiler/parser.js

// States of the finite-state machine that drives `parse`.
const INIT = 0 // initial state: a new node is about to start
const LEFT_TAG_START = 1 // just read the '<' that may open a tag
const LEFT_TAG = 2 // reading the name of an opening tag
const LEFT_TAG_IN = 3 // inside an opening tag, after its name
const LEFT_TAG_SLASH = 4 // read a '/' inside an opening tag
const LEFT_TAG_END = 5 // the opening tag has been closed by '>'
const RIGHT_TAG_START = 6 // read a '<' right after an opening tag
const RIGHT_TAG = 7 // reading the name of a closing tag
const RIGHT_TAG_SLASH = 8 // read the '/' of a closing tag
const RIGHT_TAG_END = 9 // the closing tag has been closed by '>'
const TEXT = 10 // reading plain text
const ATTR_KEY = 11 // reading an attribute name
const ATTR_SPACE = 12 // reading spaces after an attribute name
const ATTR_EQUAL = 13 // read the '=' after an attribute name
const ATTR_VAL = 14 // reading a quoted attribute value
const NOTE_START = 15 // read the '!' of a comment opener
const NOTE_HYPHEN = 16 // reading the hyphens of a comment delimiter
const NOTE_IN = 17 // inside the body of a comment
const NOTE_END = 18 // the comment has been closed by '>'

// Registry of the tag names the parser treats specially, plus helpers
// for classifying a tag name.
const Tag = {
  // Tags that stand alone and never take a closing tag
  single: ['input', 'img', 'br', 'hr', 'link', 'meta'],

  // No list of paired tags is kept: every name missing from `single`
  // is treated as a paired tag.

  /**
   * @param { string } tag tag name
   * @return { boolean } true when `tag` is listed in `single`
   */
  isSingle (tag) {
    return this.single.indexOf(tag) !== -1
  },

  /**
   * @param { string } tag tag name
   * @return { boolean } true when `tag` is not listed in `single`
   */
  isDouble (tag) {
    return this.single.indexOf(tag) === -1
  }
}

// Character predicates used to decide the state machine's transitions.
// Every predicate takes a single character and returns a boolean.
const Condition = {
  // Is it an ASCII letter?
  isLetter (ch) {
    return /[a-zA-Z]/.test(ch)
  },
  // Is it the less-than sign '<'?
  isLg (ch) {
    return ch === '<'
  },
  // Is it the greater-than sign '>'?
  isGt (ch) {
    return ch === '>'
  },
  // Is it an exclamation mark?
  isExclamatory (ch) {
    return ch === '!'
  },
  // Is it a hyphen?
  isHyphen (ch) {
    return ch === '-'
  },
  // Is it a forward slash?
  isSlash (ch) {
    return ch === '/'
  },
  // Is it whitespace? Besides the plain space this accepts tab, line feed,
  // form feed and carriage return (the HTML "ASCII whitespace" set), so a
  // tag whose attributes are split over several lines or indented with tabs
  // is separated correctly.
  isSpace (ch) {
    return ch === ' ' || ch === '\t' || ch === '\n' || ch === '\f' || ch === '\r'
  },
  // Is it an equals sign?
  isEqual (ch) {
    return ch === '='
  },
  // Is it a single quote?
  isSingleQuote (ch) {
    return ch === "'"
  },
  // Is it a double quote?
  isDoubleQuote (ch) {
    return ch === '"'
  },
  // Is it either kind of quote?
  isQuote (ch) {
    return this.isSingleQuote(ch) || this.isDoubleQuote(ch)
  }
}

/**
 * True when `ch` may continue a tag name after its leading letter:
 * ASCII letters, digits and '-', so names such as `h1` and `my-item` work.
 * @param { string } ch a single character
 * @return { boolean }
 */
function isTagNameChar (ch) {
  return /[a-zA-Z0-9-]/.test(ch)
}

/**
 * Parse an HTML string into an array of node objects with a finite-state
 * machine. Must be called with `this` bound to an object carrying the cursor
 * `p` and the input length `len` (a Parser instance).
 *
 * Produced nodes:
 *   - text:    { type: 'text', text }   (trimmed; empty text is dropped)
 *   - tag:     { type: 'tag', tag, attrs?, children? }
 *   - comment: { type: 'note', text }   (a comment that ends the input
 *              also carries the same value under `note`)
 *
 * An attribute written without a value is stored with the value ''.
 * Children of a tag are parsed by a nested call that shares `this.p`; the
 * nested call returns when it meets the `</` that belongs to its parent.
 * Only the outermost call resets `this.p` to 0, and it does so even when an
 * error is thrown, so the same object can be reused afterwards.
 *
 * @param { string } str html string
 * @return { Array } nodes found at the current nesting level
 * @throws { Error } on an illegal tag name, a closing tag that does not match
 *   its opening tag, a self-closed tag that is not a single tag, or a
 *   malformed comment delimiter
 */
function parse (str) {
  // The outermost call starts at 0; nested calls always start further in
  const isRoot = this.p === 0
  let state = INIT // current state
  const nodes = [] // nodes of this nesting level
  let curNode // node being built
  let tag = '' // name of the opening tag
  let key = '' // attribute name being read
  let val = '' // attribute value being read
  let text = '' // text being read
  let note = '' // comment body being read
  let quote = '' // quote character that opened the attribute value
  let hyphenDirection = '' // which comment delimiter the hyphens belong to
  let tmp = '' // scratch: the pending '<', or the closing tag's name
  // Store one attribute on the current node, creating the map on first use
  const setAttr = (name, value) => {
    if (curNode.attrs === undefined) {
      curNode.attrs = {}
    }
    curNode.attrs[name] = value
  }
  this.len = str.length
  try {
    while (this.p < this.len) {
      const ch = str.charAt(this.p) // current character
      switch (state) {
        case INIT:
          // Every node starts here
          curNode = {}
          nodes.push(curNode)
          if (Condition.isLg(ch)) {
            state = LEFT_TAG_START
            tmp = ch
          } else {
            state = TEXT
            curNode.type = 'text'
            text = ch
          }
          this.p++
          continue
        case LEFT_TAG_START:
          if (Condition.isExclamatory(ch)) {
            state = NOTE_START
          } else if (Condition.isLetter(ch)) {
            state = LEFT_TAG
            curNode.type = 'tag'
            tag = ch
          } else if (Condition.isSpace(ch)) {
            // '<' followed by a space is ordinary text
            state = TEXT
            curNode.type = 'text'
            text = tmp + ch
            tmp = ''
          } else if (!isRoot && Condition.isSlash(ch)) {
            // '</' closes the parent's tag: step back onto the '<', drop the
            // node that was just opened and let the parent carry on
            this.p--
            nodes.pop()
            return nodes
          } else {
            // Anything else is malformed. Throwing here (at every level)
            // keeps the parent from re-reading the same '<' forever.
            throw new Error(`The tag name is illegal in ${this.p}.`)
          }
          this.p++
          continue
        case LEFT_TAG:
          if (isTagNameChar(ch)) {
            tag += ch
          } else if (Condition.isSpace(ch)) {
            state = LEFT_TAG_IN
            curNode.tag = tag
          } else if (Condition.isGt(ch)) {
            state = LEFT_TAG_END
            curNode.tag = tag
          } else if (Condition.isSlash(ch)) {
            // '<br/>' written without a space before the slash
            state = LEFT_TAG_SLASH
            curNode.tag = tag
          } else {
            throw new Error(`The tag name is illegal in ${this.p}.`)
          }
          this.p++
          continue
        case LEFT_TAG_IN:
          if (Condition.isGt(ch)) {
            state = LEFT_TAG_END
          } else if (Condition.isSlash(ch)) {
            state = LEFT_TAG_SLASH
          } else if (!Condition.isSpace(ch)) {
            state = ATTR_KEY
            key = ch
          }
          this.p++
          continue
        case LEFT_TAG_SLASH:
          if (Condition.isGt(ch)) {
            // Checked on the '>' itself so the rule also applies when the
            // tag is the last thing in the input
            if (!Tag.isSingle(tag)) {
              throw new Error(`The ${tag} is not a single tag.`)
            }
            state = LEFT_TAG_END
          }
          this.p++
          continue
        case LEFT_TAG_END:
          if (Tag.isSingle(tag)) {
            // A single tag has neither children nor a closing tag
            state = INIT
            tag = ''
            continue
          }
          if (Condition.isLg(ch)) {
            state = RIGHT_TAG_START
            this.p++
          } else {
            curNode.children = parse.call(this, str)
          }
          continue
        case RIGHT_TAG_START:
          tmp = ''
          if (!Condition.isSlash(ch)) {
            // The '<' opened a child, not the closing tag: rewind onto it
            // and parse the children
            this.p--
            curNode.children = parse.call(this, str)
            // The nested call stops on the '<' of '</'; skip it
            this.p++
          } else {
            state = RIGHT_TAG_SLASH
            this.p++
          }
          continue
        case RIGHT_TAG_SLASH:
          if (Condition.isSpace(ch) || Condition.isGt(ch) || Condition.isSlash(ch)) {
            throw new Error(`A letter is expected in ${this.p} but got ${ch}.`)
          } else {
            state = RIGHT_TAG
            tmp += ch
            this.p++
          }
          continue
        case RIGHT_TAG:
          if (Condition.isSlash(ch)) {
            throw new Error(`A letter is expected in ${this.p} but got ${ch}.`)
          } else if (Condition.isGt(ch)) {
            // Validate on the '>' itself so a closing tag that ends the
            // input is checked as well
            if (tag !== tmp) {
              throw new Error(`The tag name is not similar in ${this.p}`)
            }
            if (!Tag.isDouble(tag)) {
              throw new Error(`The ${tag} is not a double tag in ${this.p}`)
            }
            tag = ''
            state = INIT
          } else {
            tmp += ch
          }
          this.p++
          continue
        case TEXT:
          if (!Condition.isLg(ch)) {
            text += ch
            this.p++
          } else {
            // Leave the '<' for INIT; whitespace-only text is dropped
            curNode.text = text.trim()
            if (curNode.text === '') {
              nodes.pop()
            }
            text = ''
            state = INIT
          }
          continue
        case ATTR_KEY:
          if (Condition.isSpace(ch)) {
            state = ATTR_SPACE
          } else if (Condition.isEqual(ch)) {
            state = ATTR_EQUAL
          } else if (Condition.isGt(ch)) {
            // Valueless attribute directly followed by the end of the tag
            setAttr(key, '')
            state = LEFT_TAG_END
          } else if (Condition.isSlash(ch)) {
            setAttr(key, '')
            state = LEFT_TAG_SLASH
          } else {
            key += ch
          }
          this.p++
          continue
        case ATTR_SPACE:
          if (Condition.isEqual(ch)) {
            state = ATTR_EQUAL
          } else if (!Condition.isSpace(ch)) {
            // No '=' followed the name, so that attribute has no value
            setAttr(key, '')
            if (Condition.isGt(ch)) {
              state = LEFT_TAG_END
            } else if (Condition.isSlash(ch)) {
              state = LEFT_TAG_SLASH
            } else {
              // First character of the next attribute name
              state = ATTR_KEY
              key = ch
            }
          }
          this.p++
          continue
        case ATTR_EQUAL:
          // Everything up to the opening quote is skipped
          if (Condition.isQuote(ch)) {
            state = ATTR_VAL
            quote = ch
          }
          this.p++
          continue
        case ATTR_VAL:
          if (ch !== quote) {
            val += ch
          } else {
            state = LEFT_TAG_IN
            setAttr(key, val)
            val = ''
          }
          this.p++
          continue
        case NOTE_START:
          if (Condition.isHyphen(ch)) {
            curNode.type = 'note'
            state = NOTE_HYPHEN
            hyphenDirection = 'left'
          } else {
            throw new Error(`'-' is expected in ${this.p} but got ${ch}.`)
          }
          this.p++
          continue
        case NOTE_HYPHEN:
          if (Condition.isHyphen(ch)) {
            // further hyphens of the same delimiter
          } else if (Condition.isGt(ch)) {
            state = NOTE_END
          } else if (hyphenDirection === 'left') {
            // First character of the comment body
            state = NOTE_IN
            note += ch
          } else {
            throw new Error(`'-' or '>' is expected in ${this.p} but got ${ch}.`)
          }
          this.p++
          continue
        case NOTE_IN:
          if (Condition.isHyphen(ch)) {
            state = NOTE_HYPHEN
            hyphenDirection = 'right'
          } else if (Condition.isGt(ch)) {
            throw new Error(`'-' is expected in ${this.p} but got ${ch}.`)
          } else {
            note += ch
          }
          this.p++
          continue
        case NOTE_END:
          curNode.text = note
          note = ''
          hyphenDirection = ''
          state = INIT
          continue
      }
    }
    // Flush whatever was still being read when the input ended.
    // `curNode` is undefined only for empty input.
    if (curNode !== undefined) {
      if (curNode.type === 'text') {
        curNode.text = text.trim()
        if (curNode.text === '') {
          nodes.pop()
        }
      }
      if (curNode.type === 'note') {
        // Same property as comments found mid-input; `note` is kept as
        // well for backward compatibility
        curNode.text = note
        curNode.note = note
      }
    }
    return nodes
  } finally {
    // Nested calls share the cursor with their parent, so only the
    // outermost call may rewind it
    if (isRoot) {
      this.p = 0
    }
  }
}

/**
 * Holds the scanning state that `parse` reads and updates.
 * @class Parser
 */
function Parser () {
  Object.assign(this, {
    p: 0, // moving pointer into the input string
    len: 0 // length of the input string
  })
}

/**
 * @description Parse the html string. This installs the module-level `parse`
 *   function on Parser.prototype, so it runs with the Parser instance as
 *   `this` and uses the instance's `p` and `len` fields.
 * @for Parser
 * @param { string } str html string
 * @return { Array } ast, the array of node objects built by `parse`
 */
Parser.prototype.parse = parse

// The Parser constructor is this module's default export
export default Parser