7.lua语法解析-token函数

2024-01-19 14:41:22

1.lua的zio

1.1 zio的定义

/*
 * Reader callback used by a ZIO stream: each call returns a pointer to the
 * next chunk of input and stores that chunk's length in *sz.
 */
typedef const char * (*lua_Reader) (lua_State *L, void *ud, size_t *sz);

/* Buffered input stream that pulls its data from a lua_Reader callback. */
struct Zio {
  size_t n;			    /* bytes still unread in the current chunk */
  const char *p;		/* current read position inside the chunk */
  lua_Reader reader;	/* callback that supplies the next chunk */
  void *data;			/* opaque argument handed to the reader */
  lua_State *L;			/* Lua state (for reader) */
};

/* Bind a reader callback and its argument to stream z; the buffer starts empty. */
LUAI_FUNC void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader,void *data);
/* Ask the reader for the next chunk; returns its first byte, or EOZ when no data is left. */
LUAI_FUNC int luaZ_fill (ZIO *z);
/* Next byte (as unsigned char) from the current chunk; calls luaZ_fill once the chunk is used up. */
#define zgetc(z)  (((z)->n--)>0 ?  cast_uchar(*(z)->p++) : luaZ_fill(z))

1.2 zio的初始化

/*
 * Prepare stream z for reading: remember the owning state, the reader
 * callback and its opaque argument, and mark the buffer as empty so that
 * the first zgetc() goes straight to luaZ_fill().
 */
void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, void *data) {
  /* no chunk has been fetched yet */
  z->p = NULL;
  z->n = 0;
  /* where the data comes from */
  z->reader = reader;
  z->data = data;
  z->L = L;
}

1.3 zio的读取

#define EOZ	(-1)			/* end of stream */

/*
 * Fetch the next chunk from the reader callback and consume its first byte.
 * Returns that byte converted to unsigned char, or EOZ when the reader
 * yields NULL or an empty chunk.
 */
int luaZ_fill (ZIO *z) {
  lua_State *L = z->L;
  const char *chunk;
  size_t nread;
  lua_unlock(L);
  chunk = z->reader(L, z->data, &nread);  /* chunk pointer; its length goes to nread */
  lua_lock(L);
  if (chunk != NULL && nread > 0) {
    z->n = nread - 1;       /* one byte is handed out right now */
    z->p = chunk + 1;       /* continue after that byte */
    return cast_uchar(*chunk);
  }
  return EOZ;               /* nothing more to read */
}
/* Return the next character; when the current chunk is exhausted, refill it through luaZ_fill. */
#define zgetc(z)  (((z)->n--)>0 ?  cast_uchar(*(z)->p++) : luaZ_fill(z))

2.lua语法解析

2.1 EBNF语法

statlist -> { stat [';'] }
fieldsel -> ['.' | ':'] NAME
index -> '[' expr ']'
recfield -> (NAME | '['exp']') = exp
listfield -> exp
field -> listfield | recfield
constructor -> '{' [ field { sep field } [sep] ] '}' 
sep -> ',' | ';'
parlist -> [ {NAME ','} (NAME | '...') ]
body ->  '(' parlist ')' block END
explist -> expr { ',' expr }
funcargs -> '(' [ explist ] ')'
funcargs -> constructor
funcargs -> STRING
primaryexp -> NAME | '(' expr ')'
suffixedexp -> primaryexp { '.' NAME | '[' exp ']' | ':' NAME funcargs | funcargs }
simpleexp -> FLT | INT | STRING | NIL | TRUE | FALSE | ... | constructor | FUNCTION body | suffixedexp
block -> statlist
restassign -> ',' suffixedexp restassign 
restassign -> '=' explist
cond -> exp
label -> '::' NAME '::'
whilestat -> WHILE cond DO block END
repeatstat -> REPEAT block UNTIL cond
forbody -> DO block
fornum -> NAME = exp,exp[,exp] forbody
forlist -> NAME {,NAME} IN explist forbody
forstat -> FOR (fornum | forlist) END
test_then_block -> [IF | ELSEIF] cond THEN block
ifstat -> IF cond THEN block {ELSEIF cond THEN block} [ELSE block] END
ATTRIB -> ['<' Name '>']
stat -> LOCAL NAME ATTRIB { ',' NAME ATTRIB } ['=' explist]
funcname -> NAME {fieldsel} [':' NAME]
funcstat -> FUNCTION funcname body
stat -> func | assignment
stat -> RETURN [explist] [';']

[]中括号包住的部分表示可选
{}大括号包住的部分,表示会有0次或者多次出现
大写字母表示一个终结符

这些语法规则来源于lparser.c文件中的注释,一般每条规则都会有一个独立的函数来处理

2.2 lua中token定义

/*
 * Token codes produced by the lexer for everything that is not a plain
 * single-character token. The enumerators appear in the same order as the
 * strings in luaX_tokens, and the reserved words (TK_AND .. TK_WHILE) come
 * first so that NUM_RESERVED can be computed from TK_WHILE.
 */
enum RESERVED {
  /* terminal symbols denoted by reserved words */
  TK_AND = FIRST_RESERVED, TK_BREAK,
  TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
  TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
  TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
  /* other terminal symbols */
  TK_IDIV, TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE,
  TK_SHL, TK_SHR,
  TK_DBCOLON, TK_EOS,
  TK_FLT, TK_INT, TK_NAME, TK_STRING
};

/*
 * Printable spelling of every multi-character token, indexed by
 * (token code - FIRST_RESERVED). The order mirrors enum RESERVED.
 */
static const char *const luaX_tokens [] = {
    /* reserved words */
    "and", "break", "do", "else",
    "elseif", "end", "false", "for",
    "function", "goto", "if", "in",
    "local", "nil", "not", "or",
    "repeat", "return", "then", "true",
    "until", "while",
    /* multi-character operators */
    "//", "..", "...",
    "==", ">=", "<=", "~=",
    "<<", ">>", "::",
    /* end of stream */
    "<eof>",
    /* tokens that carry a value */
    "<number>", "<integer>", "<name>", "<string>"
};

在这些token定义中,while及其之前的保留字(共NUM_RESERVED个)在虚拟机创建时由luaX_init加入字符串池

/* number of reserved words: TK_AND .. TK_WHILE inclusive */
#define NUM_RESERVED	(cast_int(TK_WHILE-FIRST_RESERVED + 1))

/*
 * Create the LUA_ENV name and one string per reserved word, fix all of them
 * so they are never collected, and tag each reserved word with its 1-based
 * position in luaX_tokens (stored in 'extra'; the lexer later uses it to
 * recognise keywords).
 */
void luaX_init (lua_State *L) {
  TString *envname = luaS_newliteral(L, LUA_ENV);  /* create env name */
  int pos;
  luaC_fix(L, obj2gco(envname));  /* never collect this name */
  for (pos = 1; pos <= NUM_RESERVED; pos++) {
    TString *word = luaS_new(L, luaX_tokens[pos - 1]);
    luaC_fix(L, obj2gco(word));  /* reserved words are never collected */
    word->extra = cast_byte(pos);  /* non-zero 'extra' marks a reserved word */
  }
}

这段代码的一个细节就是ts->extra的赋值,这个值在token分割函数中被用到,用来区分关键字

2.3 token分割函数

#define next(ls)	(ls->current = zgetc(ls->z))					  /* advance: read the next character with zgetc */
#define isreserved(s)	((s)->tt == LUA_VSHRSTR && (s)->extra > 0)    /* keyword test: short string with non-zero 'extra' */

/*
 * Scan the input starting at ls->current and return the next token.
 * Single-character tokens are returned as their character code; the other
 * tokens are returned as TK_* codes. For identifiers the resulting TString
 * is stored in seminfo->ts. Whitespace, line breaks and comments are
 * consumed inside the loop without producing a token.
 */
static int llex (LexState *ls, SemInfo *seminfo) {
  luaZ_resetbuffer(ls->buff);
  for (;;) {
    switch (ls->current) {
      case '\n': case '\r': {  /* line break */
        inclinenumber(ls);
        break;
      }
      case ' ': case '\f': case '\t': case '\v': {  /* whitespace: skip it */
        next(ls);
        break;
      }
      case '-': {  /* either a lone '-' token or the start of a '--' comment */
        next(ls);
        if (ls->current != '-') return '-';
        /* else is a comment */
        next(ls);
        if (ls->current == '[') {  /* possibly a long (multi-line) comment */
          size_t sep = skip_sep(ls);
          luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
          if (sep >= 2) {
            read_long_string(ls, NULL, sep);      /* consume the long comment; seminfo is NULL here */
            luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
            break;
          }
        }
        /* else short comment */
        while (!currIsNewline(ls) && ls->current != EOZ)  /* single-line comment: skip the rest of this line */
          next(ls);  /* skip until end of line (or end of file) */
        break;
      }
      case '[': {  /* long string or simply '[' */
        size_t sep = skip_sep(ls);
        if (sep >= 2) {
          read_long_string(ls, seminfo, sep);   /* read the long string; seminfo is passed to receive it */
          return TK_STRING;
        }
        else if (sep == 0)  /* '[=...' missing second bracket? */
          lexerror(ls, "invalid long string delimiter", TK_STRING);
        return '[';
      }
      case '=': {
        next(ls);
        if (check_next1(ls, '=')) return TK_EQ;  /* '==' */
        else return '=';
      }
      case '<': {
        next(ls);
        if (check_next1(ls, '=')) return TK_LE;  /* '<=' */
        else if (check_next1(ls, '<')) return TK_SHL;  /* '<<' */
        else return '<';
      }
      case '>': {
        next(ls);
        if (check_next1(ls, '=')) return TK_GE;  /* '>=' */
        else if (check_next1(ls, '>')) return TK_SHR;  /* '>>' */
        else return '>';
      }
      case '/': {
        next(ls);
        if (check_next1(ls, '/')) return TK_IDIV;  /* '//' */
        else return '/';
      }
      case '~': {
        next(ls);
        if (check_next1(ls, '=')) return TK_NE;  /* '~=' */
        else return '~';
      }
      case ':': {
        next(ls);
        if (check_next1(ls, ':')) return TK_DBCOLON;  /* '::' */
        else return ':';
      }
      case '"': case '\'': {  /* quoted string */
        read_string(ls, ls->current, seminfo);  /* read the string; the opening quote is passed as delimiter */
        return TK_STRING;
      }
      case '.': {  /* '.', '..', '...', or number */
        save_and_next(ls);
        if (check_next1(ls, '.')) {
          if (check_next1(ls, '.'))
            return TK_DOTS;   /* '...' */
          else return TK_CONCAT;   /* '..' */
        }
        else if (!lisdigit(ls->current)) return '.';
        else return read_numeral(ls, seminfo);
      }
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9': {
        return read_numeral(ls, seminfo);
      }
      case EOZ: {       /* end of input */
        return TK_EOS;
      }
      default: {
        if (lislalpha(ls->current)) {  /* identifier or reserved word */
          TString *ts;
          do {
            save_and_next(ls);
          } while (lislalnum(ls->current));
          ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
                                  luaZ_bufflen(ls->buff));
          seminfo->ts = ts;
          if (isreserved(ts))  /* reserved word */
            return ts->extra - 1 + FIRST_RESERVED;  /* 'extra' was set to index+1 in luaX_init; map it back to the token code */
          else {
            return TK_NAME;    /* ordinary identifier */
          }
        }
        else {  /* single-char tokens ('+', '*', '%', '{', '}', ...) */
          int c = ls->current;
          next(ls);
          return c;
        }
      }
    }
  }
}

2.4 gdb调试脚本token输出

为了方便调试,简单地在源码里复制了一份代码

# plua_tokens LS
# Print the string returned by print_tokens() for the given LexState pointer.
# Takes exactly one argument; with any other count only the usage message
# is shown.
define plua_tokens
if $argc != 1
    p "参数数目错误,{plua_tokens LexState *}"
else
    # Evaluate $arg0 only when it was actually supplied.
    printf "%s",print_tokens($arg0)
end
end

在gdb_print.c实现print_tokens然后封装一个调试脚本

-- Sample script used as lexer input: global assignments of a number, a
-- string and a table constructor, followed by a function definition.
width = 10
height = 30
str = "hello lua"

-- table constructor with three named fields
tab = {a=12,b=14,c=30}

-- Return the larger of a and b (b when they compare equal).
function number_max(a,b)
    if a > b then
        return a
    end

    return b
end

gdb执行

b statlist
plua_tokens ls

ls类型是 LexState

输出结果

具体的做法可以看 https://github.com/huoyang11/read_lua/blob/main/src/gdb_print.c

2.5 gdb调试脚本code输出

plua_lscode ls
ls类型是 LexState

在lua语法解析时可以通过这个脚本看解析的指令

码农公寓