From 0c21be8d6743d36356ba8461d0aa35c37babdad1 Mon Sep 17 00:00:00 2001 From: Max Wash Date: Mon, 11 May 2026 23:02:02 +0100 Subject: [PATCH] parse: lex: add proper data-driven state-machine functionality movement between lexer states is now defined (almost) exclusively by a table of outgoing links defined for each state type. the main lexer system uses this table to determine when, how, and to where the state should be changed. also add a dedicated lexer state for scanning hashtables, due to the particularly unique rules that apply within. --- bshell/parse/lex/arithmetic.c | 81 +++----- bshell/parse/lex/command.c | 53 ++--- bshell/parse/lex/hashtable.c | 160 ++++++++++++++ bshell/parse/lex/lex-internal.h | 73 +++++++ bshell/parse/lex/lex.c | 357 ++++++++++++++++++++++++++++---- bshell/parse/lex/statement.c | 224 ++++---------------- bshell/parse/lex/string.c | 8 - bshell/parse/lex/word.c | 21 +- 8 files changed, 646 insertions(+), 331 deletions(-) create mode 100644 bshell/parse/lex/hashtable.c diff --git a/bshell/parse/lex/arithmetic.c b/bshell/parse/lex/arithmetic.c index 66aea73..dcad905 100644 --- a/bshell/parse/lex/arithmetic.c +++ b/bshell/parse/lex/arithmetic.c @@ -5,7 +5,7 @@ static enum bshell_status arithmetic_hyphen(struct lex_ctx *ctx) fx_wchar c = peek_char(ctx); if (!fx_wchar_is_alnum(c)) { push_symbol(ctx, SYM_HYPHEN); - lex_state_change(ctx, LEX_STATE_ARITHMETIC); + handle_lex_state_transition(ctx, SYM_HYPHEN); return BSHELL_SUCCESS; } @@ -18,10 +18,10 @@ static enum bshell_status arithmetic_hyphen(struct lex_ctx *ctx) return status; } - bool converted = convert_word_to_int(tok); + unsigned int token_type = TOK_WORD; + if (convert_word_to_int(tok)) { + token_type = TOK_INT; - if (converted) { - lex_state_change(ctx, LEX_STATE_ARITHMETIC); /* because of APPEND_HYPHEN (which is needed to ensure operator * tokens are detected properly), the resulting number will be * negative. 
@@ -29,15 +29,8 @@ static enum bshell_status arithmetic_hyphen(struct lex_ctx *ctx) * must be positive */ tok->tok_int *= -1; push_symbol(ctx, SYM_HYPHEN); - enqueue_token(ctx, tok); - return BSHELL_SUCCESS; - } - - converted = convert_word_to_operator(ctx, tok); - if (converted) { - lex_state_change(ctx, LEX_STATE_ARITHMETIC); - } else { - lex_state_change(ctx, LEX_STATE_COMMAND); + } else if (convert_word_to_operator(ctx, tok)) { + token_type = TOK_OPERATOR; } enqueue_token(ctx, tok); @@ -53,6 +46,8 @@ static enum bshell_status arithmetic_symbol(struct lex_ctx *ctx) return status; } + handle_lex_state_transition(ctx, sym->id); + struct lex_token *tok = NULL; switch (sym->id) { case SYM_SQUOTE: @@ -66,12 +61,6 @@ static enum bshell_status arithmetic_symbol(struct lex_ctx *ctx) return arithmetic_hyphen(ctx); case SYM_HASH: return read_line_comment(ctx); - case SYM_DQUOTE: - if (!lex_state_push(ctx, LEX_STATE_STRING, 0)) { - return BSHELL_ERR_NO_MEMORY; - } - - return BSHELL_SUCCESS; case SYM_DOLLAR: status = read_var(ctx, TOK_VAR, &tok); if (status != BSHELL_SUCCESS) { @@ -94,14 +83,6 @@ static enum bshell_status arithmetic_symbol(struct lex_ctx *ctx) return status; } - enqueue_token(ctx, tok); - return status; - case SYM_AT_LEFT_BRACE: - status = read_braced_var(ctx, TOK_VAR_SPLAT, &tok); - if (status != BSHELL_SUCCESS) { - return status; - } - enqueue_token(ctx, tok); return status; default: @@ -110,26 +91,6 @@ static enum bshell_status arithmetic_symbol(struct lex_ctx *ctx) push_symbol(ctx, sym->id); - switch (sym->id) { - case SYM_LEFT_PAREN: - lex_state_push( - ctx, - LEX_STATE_STATEMENT, - STATEMENT_F_DISABLE_KEYWORDS); - return BSHELL_SUCCESS; - case SYM_DOLLAR_LEFT_PAREN: - lex_state_push(ctx, LEX_STATE_STATEMENT, 0); - return BSHELL_SUCCESS; - case SYM_RIGHT_PAREN: - lex_state_pop(ctx); - return BSHELL_SUCCESS; - case SYM_SEMICOLON: - lex_state_change(ctx, LEX_STATE_STATEMENT); - return BSHELL_SUCCESS; - default: - break; - } - return BSHELL_SUCCESS; } @@ 
-141,11 +102,16 @@ static enum bshell_status arithmetic_word(struct lex_ctx *ctx) return status; } - bool converted = convert_word_to_keyword(word); - if (!converted) { - converted = convert_word_to_int(word); + unsigned int token_type = TOK_WORD; + bool kw = false, number = false; + if (convert_word_to_keyword(word)) { + token_type = word->tok_keyword; + } else if (convert_word_to_int(word)) { + token_type = TOK_INT; } + handle_lex_state_transition(ctx, token_type); + enqueue_token(ctx, word); return BSHELL_SUCCESS; } @@ -180,7 +146,22 @@ static enum bshell_status arithmetic_pump_token(struct lex_ctx *ctx) return arithmetic_word(ctx); } +static const struct lex_state_link links[] = { + LINK_CHANGE(TOK_WORD, LEX_STATE_COMMAND), + LINK_PUSH(SYM_DQUOTE, LEX_STATE_STRING, 0), + LINK_PUSH(SYM_DOLLAR_LEFT_PAREN, LEX_STATE_STATEMENT, 0), + LINK_POP(SYM_RIGHT_PAREN), + LINK_CHANGE(SYM_SEMICOLON, LEX_STATE_STATEMENT), + LINK_PUSH(SYM_AT_LEFT_BRACE, LEX_STATE_HASHTABLE, 0), + LINK_PUSH( + SYM_LEFT_PAREN, + LEX_STATE_STATEMENT, + STATEMENT_F_DISABLE_KEYWORDS), + LINK_END, +}; + const struct lex_state_type lex_arithmetic_state = { .s_id = LEX_STATE_ARITHMETIC, .s_pump_token = arithmetic_pump_token, + .s_links = links, }; diff --git a/bshell/parse/lex/command.c b/bshell/parse/lex/command.c index 3178e81..0984acb 100644 --- a/bshell/parse/lex/command.c +++ b/bshell/parse/lex/command.c @@ -30,6 +30,8 @@ static enum bshell_status command_symbol(struct lex_ctx *ctx) return status; } + handle_lex_state_transition(ctx, sym->id); + struct lex_token *tok = NULL; switch (sym->id) { case SYM_SQUOTE: @@ -42,12 +44,6 @@ static enum bshell_status command_symbol(struct lex_ctx *ctx) case SYM_HASH: return read_line_comment(ctx); - case SYM_DQUOTE: - if (!lex_state_push(ctx, LEX_STATE_STRING, 0)) { - return BSHELL_ERR_NO_MEMORY; - } - - return BSHELL_SUCCESS; case SYM_DOLLAR: status = read_var(ctx, TOK_VAR, &tok); if (status != BSHELL_SUCCESS) { @@ -78,14 +74,6 @@ static enum bshell_status 
command_symbol(struct lex_ctx *ctx) lex_state_push(ctx, LEX_STATE_WORD, 0); } - enqueue_token(ctx, tok); - return status; - case SYM_AT_LEFT_BRACE: - status = read_braced_var(ctx, TOK_VAR_SPLAT, &tok); - if (status != BSHELL_SUCCESS) { - return status; - } - enqueue_token(ctx, tok); return status; default: @@ -94,26 +82,6 @@ static enum bshell_status command_symbol(struct lex_ctx *ctx) push_symbol(ctx, sym->id); - switch (sym->id) { - case SYM_LEFT_PAREN: - lex_state_push( - ctx, - LEX_STATE_STATEMENT, - STATEMENT_F_DISABLE_KEYWORDS); - return BSHELL_SUCCESS; - case SYM_DOLLAR_LEFT_PAREN: - lex_state_push(ctx, LEX_STATE_STATEMENT, 0); - return BSHELL_SUCCESS; - case SYM_RIGHT_PAREN: - lex_state_pop(ctx); - return BSHELL_SUCCESS; - case SYM_SEMICOLON: - lex_state_change(ctx, LEX_STATE_STATEMENT); - return BSHELL_SUCCESS; - default: - break; - } - return BSHELL_SUCCESS; } @@ -164,7 +132,7 @@ enum bshell_status command_pump_token(struct lex_ctx *ctx) if (newline) { struct lex_token *tok = lex_token_create(TOK_LINEFEED); enqueue_token(ctx, tok); - lex_state_change(ctx, LEX_STATE_STATEMENT); + handle_lex_state_transition(ctx, TOK_LINEFEED); return BSHELL_SUCCESS; } @@ -175,7 +143,22 @@ enum bshell_status command_pump_token(struct lex_ctx *ctx) return command_word(ctx); } +static const struct lex_state_link links[] = { + LINK_PUSH(SYM_DQUOTE, LEX_STATE_STRING, 0), + LINK_PUSH( + SYM_LEFT_PAREN, + LEX_STATE_STATEMENT, + STATEMENT_F_DISABLE_KEYWORDS), + LINK_PUSH(SYM_DOLLAR_LEFT_PAREN, LEX_STATE_STATEMENT, 0), + LINK_POP(SYM_RIGHT_PAREN), + LINK_CHANGE(SYM_SEMICOLON, LEX_STATE_STATEMENT), + LINK_PUSH(SYM_AT_LEFT_BRACE, LEX_STATE_HASHTABLE, 0), + LINK_CHANGE(TOK_LINEFEED, LEX_STATE_STATEMENT), + LINK_END, +}; + const struct lex_state_type lex_command_state = { .s_id = LEX_STATE_COMMAND, .s_pump_token = command_pump_token, + .s_links = links, }; diff --git a/bshell/parse/lex/hashtable.c b/bshell/parse/lex/hashtable.c new file mode 100644 index 0000000..8bf018b --- /dev/null +++ 
b/bshell/parse/lex/hashtable.c @@ -0,0 +1,160 @@ +#include "lex-internal.h" + +static enum bshell_status hashtable_hyphen(struct lex_ctx *ctx) +{ + fx_wchar c = peek_char(ctx); + if (!fx_wchar_is_alnum(c)) { + push_symbol(ctx, SYM_HYPHEN); + handle_lex_state_transition(ctx, SYM_HYPHEN); + return BSHELL_SUCCESS; + } + + struct lex_token *tok = NULL; + enum bshell_status status = read_word( + ctx, + READ_NO_SET_TOKEN_START | READ_APPEND_HYPHEN, + &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + unsigned int token_type = TOK_WORD; + if (convert_word_to_int(tok)) { + token_type = TOK_INT; + /* because of APPEND_HYPHEN (which is needed to ensure operator + * tokens are detected properly), the resulting number will be + * negative. + * this token will be preceded by a HYPHEN token, so the number + * must be positive */ + tok->tok_int *= -1; + push_symbol(ctx, SYM_HYPHEN); + } else if (convert_word_to_operator(ctx, tok)) { + token_type = tok->tok_operator; + } + + handle_lex_state_transition(ctx, token_type); + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; +} + +static enum bshell_status hashtable_symbol(struct lex_ctx *ctx) +{ + const struct lex_token_def *sym = NULL; + enum bshell_status status = read_symbol(ctx, &sym); + + if (status != BSHELL_SUCCESS) { + return status; + } + + handle_lex_state_transition(ctx, sym->id); + + struct lex_token *tok = NULL; + switch (sym->id) { + case SYM_SQUOTE: + status = read_literal_string(ctx, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; + case SYM_HYPHEN: + return hashtable_hyphen(ctx); + case SYM_HASH: + return read_line_comment(ctx); + case SYM_DOLLAR: + status = read_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_AT: + status = read_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + 
return status; + case SYM_DOLLAR_LEFT_BRACE: + status = read_braced_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + default: + break; + } + + push_symbol(ctx, sym->id); + + return BSHELL_SUCCESS; +} + +static enum bshell_status hashtable_word(struct lex_ctx *ctx) +{ + struct lex_token *word = NULL; + enum bshell_status status = read_word(ctx, 0, &word); + if (status != BSHELL_SUCCESS) { + return status; + } + + convert_word_to_int(word); + + handle_lex_state_transition(ctx, word->tok_type); + enqueue_token(ctx, word); + return BSHELL_SUCCESS; +} + +static enum bshell_status hashtable_pump_token(struct lex_ctx *ctx) +{ + fx_wchar c = peek_char(ctx); + bool newline = false; + + set_token_start(ctx); + while (fx_wchar_is_space(c)) { + if (c == '\n') { + newline = true; + } + + set_token_end(ctx); + advance_char_noread(ctx); + c = peek_char_noread(ctx); + } + +#if 0 + if (newline) { + struct lex_token *tok = lex_token_create(TOK_LINEFEED); + enqueue_token(ctx, tok); + lex_state_change(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + } +#endif + + if (char_can_begin_symbol(ctx, c)) { + return hashtable_symbol(ctx); + } + + return hashtable_word(ctx); +} + +static const struct lex_state_link links[] = { + LINK_PUSH_WITH_TERM(SYM_EQUAL, LEX_STATE_STATEMENT, 0, SYM_SEMICOLON), + LINK_PUSH(SYM_DQUOTE, LEX_STATE_STRING, 0), + LINK_PUSH( + SYM_LEFT_PAREN, + LEX_STATE_STATEMENT, + STATEMENT_F_DISABLE_KEYWORDS), + LINK_PUSH(SYM_DOLLAR_LEFT_PAREN, LEX_STATE_STATEMENT, 0), + LINK_POP(SYM_RIGHT_BRACE), + LINK_END, +}; + +const struct lex_state_type lex_hashtable_state = { + .s_id = LEX_STATE_HASHTABLE, + .s_pump_token = hashtable_pump_token, + .s_links = links, +}; diff --git a/bshell/parse/lex/lex-internal.h b/bshell/parse/lex/lex-internal.h index 90ed2d7..16612e3 100644 --- a/bshell/parse/lex/lex-internal.h +++ b/bshell/parse/lex/lex-internal.h @@ -8,7 +8,11 @@ struct lex_ctx; enum state_flags { 
+ /* statement: don't convert matching words to keywords */ STATEMENT_F_DISABLE_KEYWORDS = 0x01u, + /* arithmetic: don't switch back to statement mode even when + * encountering a token that would otherwise require it. */ + ARITHMETIC_F_DISABLE_STATEMENTS = 0x01u, }; enum read_flags { @@ -17,6 +21,52 @@ enum read_flags { READ_NO_NUMBER_RECOGNITION = 0x04u, }; +#define LINK_PUSH(tok, target, flags) \ + ((struct lex_state_link) { \ + .l_token = (tok), \ + .l_type = LEX_STATE_LINK_PUSH, \ + .l_target = (target), \ + .l_target_flags = (flags), \ + }) +#define LINK_PUSH_WITH_TERM(tok, target, flags, ...) \ + ((struct lex_state_link) { \ + .l_token = (tok), \ + .l_type = LEX_STATE_LINK_PUSH, \ + .l_target = (target), \ + .l_target_flags = (flags), \ + .l_terminators = {__VA_ARGS__, TOK_NONE}, \ + }) +#define LINK_CHANGE(tok, target) \ + ((struct lex_state_link) { \ + .l_token = (tok), \ + .l_type = LEX_STATE_LINK_CHANGE, \ + .l_target = (target), \ + }) +#define LINK_POP(tok) \ + ((struct lex_state_link) { \ + .l_token = (tok), \ + .l_type = LEX_STATE_LINK_POP, \ + }) +#define LINK_NONE(tok) \ + ((struct lex_state_link) { \ + .l_token = (tok), \ + .l_type = LEX_STATE_LINK_NONE, \ + }) +#define LINK_END ((struct lex_state_link) {}) + +struct lex_state_link { + unsigned int l_token; + enum { + LEX_STATE_LINK_NONE, + LEX_STATE_LINK_PUSH, + LEX_STATE_LINK_CHANGE, + LEX_STATE_LINK_POP, + } l_type; + enum lex_state_type_id l_target; + enum state_flags l_target_flags; + unsigned int l_terminators[LEX_STATE_MAX_TERMINATORS]; +}; + typedef enum bshell_status (*lex_state_pump_token)(struct lex_ctx *); typedef enum bshell_status (*lex_state_begin)(struct lex_ctx *); typedef enum bshell_status (*lex_state_end)(struct lex_ctx *); @@ -26,6 +76,7 @@ struct lex_state_type { lex_state_pump_token s_pump_token; lex_state_begin s_begin; lex_state_end s_end; + const struct lex_state_link *s_links; }; extern enum bshell_status pump_token_statement(struct lex_ctx *ctx); @@ -45,6 +96,10 @@ 
extern void lex_state_pop(struct lex_ctx *ctx); extern struct lex_state *lex_state_get(struct lex_ctx *ctx); extern void lex_state_change(struct lex_ctx *ctx, enum lex_state_type_id type); extern fx_string *lex_state_get_tempstr(struct lex_ctx *ctx); +extern void lex_state_add_terminator(struct lex_state *state, unsigned int tok); +extern bool lex_state_terminates_at_token( + struct lex_ctx *ctx, + unsigned int tok); extern fx_wchar peek_char(struct lex_ctx *ctx); extern fx_wchar peek_char_noread(struct lex_ctx *ctx); @@ -100,8 +155,26 @@ extern bool char_has_flags( struct lex_ctx *ctx, char c, enum lex_token_flags flags); +extern bool keyword_has_flags( + struct lex_ctx *ctx, + enum token_keyword kw, + enum lex_token_flags flags); +extern enum lex_token_flags keyword_get_flags( + struct lex_ctx *ctx, + enum token_keyword kw); +extern bool symbol_has_flags( + struct lex_ctx *ctx, + enum token_symbol sym, + enum lex_token_flags flags); +extern enum lex_token_flags symbol_get_flags( + struct lex_ctx *ctx, + enum token_symbol sym); extern enum token_operator get_operator_with_string( struct lex_ctx *ctx, const char *s); +extern void handle_lex_state_transition( + struct lex_ctx *ctx, + unsigned int token); + #endif diff --git a/bshell/parse/lex/lex.c b/bshell/parse/lex/lex.c index c17278a..434233c 100644 --- a/bshell/parse/lex/lex.c +++ b/bshell/parse/lex/lex.c @@ -5,6 +5,8 @@ #include "../token.h" #include "lex-internal.h" +#include <assert.h> + #define LEX_TOKEN_DEF(i, n, s) {.id = (i), .name = (n), .enabled_states = (s)} #define LEX_TOKEN_DEF2(i, n, s, f) \ {.id = (i), .name = (n), .enabled_states = (s), .flags = (f)} @@ -13,54 +15,81 @@ ((flags) & (LEX_ENABLE_INT | LEX_ENABLE_KEYWORD)) static struct lex_token_def keywords[] = { - LEX_TOKEN_DEF(KW_FUNC, "func", LEX_STATE_STATEMENT), + LEX_TOKEN_DEF2( + KW_FUNC, + "func", + LEX_STATE_STATEMENT, + LEX_TOKEN_COMMAND_MODE), LEX_TOKEN_DEF(KW_IF, "if", LEX_STATE_STATEMENT), + LEX_TOKEN_DEF(KW_ELSEIF, "elseif", LEX_STATE_STATEMENT), 
LEX_TOKEN_DEF(KW_ELSE, "else", LEX_STATE_STATEMENT), }; static const size_t nr_keywords = sizeof keywords / sizeof keywords[0]; static struct lex_token_def operators[] = { - LEX_TOKEN_DEF(OP_BAND, "-band", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_BOR, "-bor", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_BXOR, "-bxor", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_BAND, "-band", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_BOR, "-bor", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_BXOR, "-bxor", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF( - OP_BNOT, + TKOP_BNOT, "-bnot", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_SHL, "-shl", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_SHR, "-shr", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_EQ, "-eq", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_NE, "-ne", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_GT, "-gt", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_LT, "-lt", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_GE, "-ge", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_LE, "-le", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_MATCH, "-match", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_NOTMATCH, "-notmatch", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_REPLACE, "-replace", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_LIKE, "-like", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_NOTLIKE, "-notlike", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_CONTAINS, "-contains", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_NOTCONTAINS, "-notcontains", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_AND, "-and", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_OR, "-OR", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_XOR, "-xor", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_SHL, "-shl", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_SHR, "-shr", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF( - OP_NOT, + TKOP_EQ, + "-eq", + LEX_STATE_STATEMENT | 
LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF( + TKOP_NE, + "-ne", + LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF( + TKOP_GT, + "-gt", + LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF( + TKOP_LT, + "-lt", + LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF( + TKOP_GE, + "-ge", + LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF( + TKOP_LE, + "-le", + LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_MATCH, "-match", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_NOTMATCH, "-notmatch", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_REPLACE, "-replace", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_LIKE, "-like", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_NOTLIKE, "-notlike", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_CONTAINS, "-contains", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_NOTCONTAINS, "-notcontains", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_AND, "-and", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF( + TKOP_OR, + "-OR", + LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_XOR, "-xor", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF( + TKOP_NOT, "-not", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_SPLIT, "-split", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_JOIN, "-join", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_IS, "-is", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_ISNOT, "-isnot", LEX_STATE_ARITHMETIC), - LEX_TOKEN_DEF(OP_AS, "-as", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_SPLIT, "-split", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_JOIN, "-join", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_IS, "-is", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_ISNOT, "-isnot", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_AS, "-as", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(TKOP_F, "-f", LEX_STATE_ARITHMETIC), }; static const size_t nr_operators = sizeof operators / sizeof operators[0]; #define LEX_STATES(states) (LEX_STATE_STATEMENT | states) #define LEX_STATE_ALL \ (LEX_STATE_ARITHMETIC | 
LEX_STATE_STATEMENT | LEX_STATE_COMMAND \ - | LEX_STATE_STRING | LEX_STATE_WORD) + | LEX_STATE_STRING | LEX_STATE_WORD | LEX_STATE_HASHTABLE) static struct lex_token_def symbols[] = { LEX_TOKEN_DEF2( @@ -71,7 +100,7 @@ static struct lex_token_def symbols[] = { LEX_TOKEN_DEF2( SYM_HYPHEN, "-", - LEX_STATES(LEX_STATE_ARITHMETIC), + LEX_STATE_ARITHMETIC, LEX_TOKEN_UNARY_ARITHMETIC), LEX_TOKEN_DEF(SYM_FORWARD_SLASH, "/", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF(SYM_ASTERISK, "*", LEX_STATE_ARITHMETIC), @@ -112,18 +141,28 @@ static struct lex_token_def symbols[] = { | LEX_STATE_WORD, LEX_TOKEN_UNARY_ARITHMETIC), LEX_TOKEN_DEF(SYM_AT, "@", LEX_STATE_ALL), - LEX_TOKEN_DEF2(SYM_PIPE, "|", LEX_STATE_ALL, LEX_TOKEN_TERMINATES_WORD), + LEX_TOKEN_DEF2( + SYM_PIPE, + "|", + LEX_STATE_ALL, + LEX_TOKEN_TERMINATES_WORD | LEX_TOKEN_COMMAND_MODE), LEX_TOKEN_DEF2( SYM_COMMA, ",", LEX_STATE_ALL, - LEX_TOKEN_TERMINATES_WORD), + LEX_TOKEN_TERMINATES_WORD | LEX_TOKEN_STATEMENT_MODE), LEX_TOKEN_DEF2( SYM_SEMICOLON, ";", LEX_STATE_ALL, - LEX_TOKEN_TERMINATES_WORD), - LEX_TOKEN_DEF(SYM_AT_LEFT_BRACE, "@{", LEX_STATE_ALL), + LEX_TOKEN_TERMINATES_WORD | LEX_TOKEN_STATEMENT_MODE), + LEX_TOKEN_DEF2( + SYM_AT_LEFT_BRACE, + "@{", + LEX_STATE_ARITHMETIC | LEX_STATE_COMMAND | LEX_STATE_STRING + | LEX_STATE_WORD | LEX_STATE_STATEMENT, + LEX_TOKEN_UNARY_ARITHMETIC), + LEX_TOKEN_DEF(SYM_AT_LEFT_PAREN, "@(", LEX_STATE_ALL), LEX_TOKEN_DEF2( SYM_LEFT_BRACE, "{", @@ -136,6 +175,7 @@ static struct lex_token_def symbols[] = { LEX_TOKEN_TERMINATES_WORD), LEX_TOKEN_DEF(SYM_LEFT_BRACKET, "[", LEX_STATES(LEX_STATE_ARITHMETIC)), LEX_TOKEN_DEF(SYM_RIGHT_BRACKET, "]", LEX_STATES(LEX_STATE_ARITHMETIC)), + LEX_TOKEN_DEF(SYM_QUESTION_LEFT_BRACKET, "?[", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF2( SYM_LEFT_PAREN, "(", @@ -146,12 +186,19 @@ static struct lex_token_def symbols[] = { ")", LEX_STATE_ALL, LEX_TOKEN_TERMINATES_WORD), - LEX_TOKEN_DEF(SYM_EQUAL, "=", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF( + SYM_EQUAL, + "=", + 
LEX_STATE_ARITHMETIC | LEX_STATE_HASHTABLE), LEX_TOKEN_DEF(SYM_PLUS_EQUAL, "+=", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF(SYM_HYPHEN_EQUAL, "-=", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF(SYM_FORWARD_SLASH_EQUAL, "/=", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF(SYM_ASTERISK_EQUAL, "*=", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF(SYM_PERCENT_EQUAL, "%=", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(SYM_DOT, ".", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(SYM_COLON_COLON, "::", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(SYM_DOT_DOT, "..", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(SYM_QUESTION_DOT, "?.", LEX_STATE_ARITHMETIC), }; static const size_t nr_symbols = sizeof symbols / sizeof symbols[0]; @@ -160,6 +207,7 @@ extern const struct lex_state_type lex_command_state; extern const struct lex_state_type lex_arithmetic_state; extern const struct lex_state_type lex_string_state; extern const struct lex_state_type lex_word_state; +extern const struct lex_state_type lex_hashtable_state; static const struct lex_state_type *state_types[] = { [LEX_STATE_STATEMENT] = &lex_statement_state, @@ -167,6 +215,7 @@ static const struct lex_state_type *state_types[] = { [LEX_STATE_ARITHMETIC] = &lex_arithmetic_state, [LEX_STATE_STRING] = &lex_string_state, [LEX_STATE_WORD] = &lex_word_state, + [LEX_STATE_HASHTABLE] = &lex_hashtable_state, }; void set_token_start(struct lex_ctx *ctx) @@ -179,6 +228,24 @@ void set_token_end(struct lex_ctx *ctx) memcpy(&ctx->lex_end, &ctx->lex_cursor, sizeof ctx->lex_cursor); } +static const char *lex_state_type_id_to_string(enum lex_state_type_id id) +{ +#define ENUM_STR(v) \ + case v: \ + return #v + switch (id) { + ENUM_STR(LEX_STATE_STATEMENT); + ENUM_STR(LEX_STATE_COMMAND); + ENUM_STR(LEX_STATE_ARITHMETIC); + ENUM_STR(LEX_STATE_STRING); + ENUM_STR(LEX_STATE_WORD); + ENUM_STR(LEX_STATE_HASHTABLE); + default: + return ""; + } +#undef ENUM_STR +} + struct lex_state *lex_state_push( struct lex_ctx *ctx, enum lex_state_type_id state_type, @@ -189,6 +256,11 @@ struct lex_state 
*lex_state_push( return NULL; } +#if defined(VERBOSE) + printf("push(%s, 0x%04x)\n", + lex_state_type_id_to_string(state_type), + flags); +#endif memset(state, 0x0, sizeof *state); state->s_type = state_types[state_type]; @@ -212,6 +284,10 @@ void lex_state_pop(struct lex_ctx *ctx) struct lex_state *state = fx_unbox(struct lex_state, entry, s_entry); +#if defined(VERBOSE) + printf("pop(%s)\n", lex_state_type_id_to_string(state->s_type->s_id)); +#endif + if (state->s_type->s_end) { state->s_type->s_end(ctx); } @@ -242,6 +318,12 @@ void lex_state_change(struct lex_ctx *ctx, enum lex_state_type_id type) return; } +#if defined(VERBOSE) + printf("change(%s -> %s)\n", + lex_state_type_id_to_string(state->s_type->s_id), + lex_state_type_id_to_string(type)); +#endif + if (state->s_type->s_end) { state->s_type->s_end(ctx); } @@ -271,6 +353,13 @@ fx_string *lex_state_get_tempstr(struct lex_ctx *ctx) return state->s_tempstr; } +void lex_state_add_terminator(struct lex_state *state, unsigned int tok) +{ + if (state->s_nr_terminators < LEX_STATE_MAX_TERMINATORS) { + state->s_terminators[state->s_nr_terminators++] = tok; + } +} + static struct lex_symbol_node *get_symbol_node( struct lex_symbol_node *node, char c) @@ -504,7 +593,7 @@ bool convert_word_to_operator(struct lex_ctx *ctx, struct lex_token *tok) } enum token_operator op = get_operator_with_string(ctx, tok->tok_str); - if (op == OP_NONE) { + if (op == TKOP_NONE) { return false; } @@ -624,8 +713,15 @@ extern void enqueue_token_with_coordinates( const struct char_cell *start, const struct char_cell *end) { + if (tok->tok_type == TOK_LINEFEED + && ctx->lex_prev_token == TOK_LINEFEED) { + lex_token_destroy(tok); + return; + } + tok->tok_start = *start; tok->tok_end = *end; + ctx->lex_prev_token = tok->tok_type; if (tok && (ctx->lex_flags & LEX_PRINT_TOKENS)) { print_lex_token(tok); @@ -813,7 +909,7 @@ enum bshell_status read_word( bool number_recog = !(flags & READ_NO_NUMBER_RECOGNITION); - enum token_operator op = 
OP_NONE; + enum token_operator op = TKOP_NONE; bool done = false; while (!done) { fx_wchar c = peek_char(ctx); @@ -844,7 +940,7 @@ enum bshell_status read_word( if (!fx_wchar_is_alpha(c)) { op = get_operator_with_string(ctx, s); - if (op != OP_NONE) { + if (op != TKOP_NONE) { done = true; break; } @@ -889,7 +985,9 @@ enum bshell_status read_symbol( struct lex_symbol_node *next = get_symbol_node(node, c); if (!next - || !(next->s_def->enabled_states & state->s_type->s_id)) { + || (next->s_def + && !(next->s_def->enabled_states + & state->s_type->s_id))) { prev = c; break; } @@ -932,10 +1030,7 @@ bool char_can_begin_symbol(struct lex_ctx *ctx, char c) return char_can_begin_symbol_in_state(ctx, c, state->s_type->s_id); } -extern bool char_has_flags( - struct lex_ctx *ctx, - char c, - enum lex_token_flags flags) +bool char_has_flags(struct lex_ctx *ctx, char c, enum lex_token_flags flags) { for (size_t i = 0; i < nr_symbols; i++) { if (symbols[i].name[0] != c) { @@ -948,6 +1043,60 @@ extern bool char_has_flags( return false; } +bool keyword_has_flags( + struct lex_ctx *ctx, + enum token_keyword kw, + enum lex_token_flags flags) +{ + for (size_t i = 0; i < nr_keywords; i++) { + if (keywords[i].id == kw) { + return (keywords[i].flags & flags) == flags; + } + } + + return false; +} + +enum lex_token_flags keyword_get_flags( + struct lex_ctx *ctx, + enum token_keyword kw) +{ + for (size_t i = 0; i < nr_keywords; i++) { + if (keywords[i].id == kw) { + return keywords[i].flags; + } + } + + return 0; +} + +bool symbol_has_flags( + struct lex_ctx *ctx, + enum token_symbol sym, + enum lex_token_flags flags) +{ + for (size_t i = 0; i < nr_symbols; i++) { + if (symbols[i].id == sym) { + return (symbols[i].flags & flags) == flags; + } + } + + return false; +} + +enum lex_token_flags symbol_get_flags( + struct lex_ctx *ctx, + enum token_symbol sym) +{ + for (size_t i = 0; i < nr_symbols; i++) { + if (symbols[i].id == sym) { + return symbols[i].flags; + } + } + + return 0; +} 
+ enum token_operator get_operator_with_string(struct lex_ctx *ctx, const char *s) { struct lex_state *state = lex_state_get(ctx); @@ -968,6 +1117,126 @@ enum token_operator get_operator_with_string(struct lex_ctx *ctx, const char *s) return false; } +int compare_token_types(unsigned int a, unsigned int b) +{ + if (a == b) { + return 2; + } + +#define BETWEEN(v, lo, hi) ((v) >= (lo) && (v) <= (hi)) + enum token_type a_type = TOK_NONE, b_type = TOK_NONE; + + if (BETWEEN(a, __KW_INDEX_BASE, __KW_INDEX_LIMIT)) { + a_type = TOK_KEYWORD; + } else if (BETWEEN(a, __TKOP_INDEX_BASE, __TKOP_INDEX_LIMIT)) { + a_type = TOK_OPERATOR; + } else if (BETWEEN(a, __SYM_INDEX_BASE, __SYM_INDEX_LIMIT)) { + a_type = TOK_SYMBOL; + } else { + a_type = a; + } + + if (BETWEEN(b, __KW_INDEX_BASE, __KW_INDEX_LIMIT)) { + b_type = TOK_KEYWORD; + } else if (BETWEEN(b, __TKOP_INDEX_BASE, __TKOP_INDEX_LIMIT)) { + b_type = TOK_OPERATOR; + } else if (BETWEEN(b, __SYM_INDEX_BASE, __SYM_INDEX_LIMIT)) { + b_type = TOK_SYMBOL; + } else { + b_type = b; + } +#undef BETWEEN + + int result = 0; + if (a_type == b_type) { + if (a != a_type && b != b_type) { + result = 0; + } else { + result = a == b ? 
2 : 1; + } + } + + if (result < 0) { + result = 0; + } + + return result; +} + +void handle_lex_state_transition(struct lex_ctx *ctx, unsigned int token) +{ + struct lex_state *state = lex_state_get(ctx); + for (unsigned int i = 0; i < LEX_STATE_MAX_TERMINATORS; i++) { + if (state->s_terminators[i] == TOK_NONE) { + break; + } + + if (state->s_terminators[i] == token) { + lex_state_pop(ctx); + return; + } + } + + const struct lex_state_link *table = state->s_type->s_links; + if (!table) { + return; + } + +#define MAX_MATCHES 8 + const struct lex_state_link *best_matches[MAX_MATCHES] = {0}; + unsigned int match_count = 0; + int best_score = 0; + + for (unsigned int i = 0; table[i].l_token != TOK_NONE; i++) { + int score = compare_token_types(table[i].l_token, token); + if (score == 0) { + continue; + } + + assert(match_count < MAX_MATCHES + && "lex state has too many matches"); + if (score == best_score) { + best_matches[match_count++] = &table[i]; + } else if (score > best_score) { + match_count = 0; + best_matches[match_count++] = &table[i]; + best_score = score; + } + } +#undef MAX_MATCHES + + if (!match_count) { + return; + } + + for (unsigned int i = 0; i < match_count; i++) { + const struct lex_state_link *link = best_matches[i]; + switch (link->l_type) { + case LEX_STATE_LINK_POP: + lex_state_pop(ctx); + break; + case LEX_STATE_LINK_PUSH: { + struct lex_state *pushed = lex_state_push( + ctx, + link->l_target, + link->l_target_flags); + for (unsigned int t = 0; pushed && t < LEX_STATE_MAX_TERMINATORS && link->l_terminators[t]; t++) { + lex_state_add_terminator( + pushed, + link->l_terminators[t]); + } + break; + } + + case LEX_STATE_LINK_CHANGE: + lex_state_change(ctx, link->l_target); + break; + default: + break; + } + } +} + static enum bshell_status read_string_content(struct lex_ctx *ctx) { fx_wchar c = FX_WCHAR_INVALID; diff --git a/bshell/parse/lex/statement.c b/bshell/parse/lex/statement.c index a7911c2..6e295aa 100644 --- a/bshell/parse/lex/statement.c +++ b/bshell/parse/lex/statement.c @@ -1,88 
+1,11 @@ #include "lex-internal.h" -#if 0 -#define APPEND_HYPHEN 0x8000u - -static enum bshell_status __read_word( - struct lex_ctx *ctx, - int flags, - struct lex_token **out) -{ - fx_string *tmp = lex_state_get_tempstr(ctx); - fx_string_clear(tmp); - bool word_is_number = false; - - if (flags & APPEND_HYPHEN) { - fx_string_append_c(tmp, '-'); - } - - if (!(flags & READ_NO_SET_TOKEN_START)) { - set_token_start(ctx); - } - - enum token_operator op = OP_NONE; - - bool done = false; - while (!done) { - fx_wchar c = peek_char(ctx); - if (c == FX_WCHAR_INVALID) { - break; - } - - if (fx_wchar_is_space(c)) { - done = true; - break; - } - - if (char_has_flags(ctx, c, LEX_TOKEN_TERMINATES_WORD)) { - done = true; - break; - } - - if (char_can_begin_symbol(ctx, c)) { - op = get_operator_with_string( - ctx, - fx_string_get_cstr(tmp)); - if (op != OP_NONE) { - done = true; - break; - } - } - - fx_string_append_wc(tmp, c); - set_token_end(ctx); - advance_char(ctx); - } - - if (fx_string_get_size(tmp, FX_STRLEN_NORMAL) == 0) { - if (ctx->lex_status == BSHELL_SUCCESS) { - return BSHELL_ERR_BAD_SYNTAX; - } - - return ctx->lex_status; - } - - struct lex_token *tok = NULL; - if (op != OP_NONE) { - tok = lex_token_create(TOK_OPERATOR); - tok->tok_operator = op; - } else { - tok = lex_token_create_with_string( - TOK_WORD, - fx_string_get_cstr(tmp)); - } - - *out = tok; - return BSHELL_SUCCESS; -} -#endif - static enum bshell_status statement_hyphen(struct lex_ctx *ctx) { fx_wchar c = peek_char(ctx); if (!fx_wchar_is_alnum(c)) { push_symbol(ctx, SYM_HYPHEN); - lex_state_change(ctx, LEX_STATE_ARITHMETIC); + handle_lex_state_transition(ctx, SYM_HYPHEN); return BSHELL_SUCCESS; } @@ -95,10 +18,11 @@ static enum bshell_status statement_hyphen(struct lex_ctx *ctx) return status; } - bool converted = convert_word_to_int(tok); + unsigned int token_type = TOK_WORD; + + if (convert_word_to_int(tok)) { + token_type = TOK_INT; - if (converted) { - lex_state_change(ctx, LEX_STATE_ARITHMETIC); /* 
because of APPEND_HYPHEN (which is needed to ensure operator * tokens are detected properly), the resulting number will be * negative. @@ -106,18 +30,13 @@ static enum bshell_status statement_hyphen(struct lex_ctx *ctx) * must be positive */ tok->tok_int *= -1; push_symbol(ctx, SYM_HYPHEN); - enqueue_token(ctx, tok); - return BSHELL_SUCCESS; - } - - converted = convert_word_to_operator(ctx, tok); - if (converted) { - lex_state_change(ctx, LEX_STATE_ARITHMETIC); - } else { - lex_state_change(ctx, LEX_STATE_COMMAND); + } else if (convert_word_to_operator(ctx, tok)) { + token_type = TOK_OPERATOR; } + handle_lex_state_transition(ctx, token_type); enqueue_token(ctx, tok); + return BSHELL_SUCCESS; } @@ -130,6 +49,8 @@ static enum bshell_status statement_symbol(struct lex_ctx *ctx) return status; } + handle_lex_state_transition(ctx, sym->id); + struct lex_token *tok = NULL; switch (sym->id) { case SYM_HYPHEN: @@ -144,17 +65,7 @@ static enum bshell_status statement_symbol(struct lex_ctx *ctx) case SYM_HASH: return read_line_comment(ctx); - case SYM_DQUOTE: - if (!lex_state_push(ctx, LEX_STATE_STRING, 0)) { - return BSHELL_ERR_NO_MEMORY; - } - - return BSHELL_SUCCESS; case SYM_DOLLAR: - if (!lex_state_push(ctx, LEX_STATE_ARITHMETIC, 0)) { - return BSHELL_ERR_NO_MEMORY; - } - status = read_var(ctx, TOK_VAR, &tok); if (status != BSHELL_SUCCESS) { return status; @@ -163,10 +74,6 @@ static enum bshell_status statement_symbol(struct lex_ctx *ctx) enqueue_token(ctx, tok); return status; case SYM_AT: - if (!lex_state_push(ctx, LEX_STATE_ARITHMETIC, 0)) { - return BSHELL_ERR_NO_MEMORY; - } - status = read_var(ctx, TOK_VAR_SPLAT, &tok); if (status != BSHELL_SUCCESS) { return status; @@ -175,27 +82,11 @@ static enum bshell_status statement_symbol(struct lex_ctx *ctx) enqueue_token(ctx, tok); return status; case SYM_DOLLAR_LEFT_BRACE: - if (!lex_state_push(ctx, LEX_STATE_ARITHMETIC, 0)) { - return BSHELL_ERR_NO_MEMORY; - } - status = read_braced_var(ctx, TOK_VAR, &tok); if (status != 
BSHELL_SUCCESS) { return status; } - enqueue_token(ctx, tok); - return status; - case SYM_AT_LEFT_BRACE: - if (!lex_state_push(ctx, LEX_STATE_ARITHMETIC, 0)) { - return BSHELL_ERR_NO_MEMORY; - } - - status = read_braced_var(ctx, TOK_VAR_SPLAT, &tok); - if (status != BSHELL_SUCCESS) { - return status; - } - enqueue_token(ctx, tok); return status; default: @@ -203,32 +94,6 @@ static enum bshell_status statement_symbol(struct lex_ctx *ctx) } push_symbol(ctx, sym->id); - - switch (sym->id) { - case SYM_LEFT_PAREN: - lex_state_push( - ctx, - LEX_STATE_STATEMENT, - STATEMENT_F_DISABLE_KEYWORDS); - return BSHELL_SUCCESS; - case SYM_LEFT_BRACE: - case SYM_DOLLAR_LEFT_PAREN: - lex_state_push(ctx, LEX_STATE_STATEMENT, 0); - return BSHELL_SUCCESS; - case SYM_RIGHT_PAREN: - case SYM_RIGHT_BRACE: - lex_state_pop(ctx); - return BSHELL_SUCCESS; - default: - break; - } - - if (sym->enabled_states & LEX_STATE_COMMAND) { - lex_state_change(ctx, LEX_STATE_COMMAND); - } else if (sym->enabled_states & LEX_STATE_ARITHMETIC) { - lex_state_change(ctx, LEX_STATE_ARITHMETIC); - } - return BSHELL_SUCCESS; } @@ -242,21 +107,16 @@ static enum bshell_status statement_word(struct lex_ctx *ctx) struct lex_state *state = lex_state_get(ctx); - bool converted = false; + bool enable_keywords = !(state->s_flags & STATEMENT_F_DISABLE_KEYWORDS); + unsigned int token = TOK_WORD; - if (!(state->s_flags & STATEMENT_F_DISABLE_KEYWORDS)) { - converted = convert_word_to_keyword(word); + if (enable_keywords && convert_word_to_keyword(word)) { + token = word->tok_keyword; + } else if (convert_word_to_int(word)) { + token = TOK_INT; } - if (!converted) { - converted = convert_word_to_int(word); - } - - if (converted) { - lex_state_change(ctx, LEX_STATE_ARITHMETIC); - } else { - lex_state_change(ctx, LEX_STATE_COMMAND); - } + handle_lex_state_transition(ctx, token); enqueue_token(ctx, word); return BSHELL_SUCCESS; @@ -284,18 +144,6 @@ static enum bshell_status statement_pump_token(struct lex_ctx *ctx) return 
/*
 * Outgoing links of the statement lexer state.
 *
 * The table is read by handle_lex_state_transition() through
 * lex_statement_state.s_links: each entry names a token and what should
 * happen to the state stack when that token is seen while in this state.
 * Entries that match a token equally well are all applied, in the order they
 * appear here, so the order of entries for the same token is significant.
 */
static const struct lex_state_link links[] = {
	LINK_PUSH(SYM_DQUOTE, LEX_STATE_STRING, 0),
	/* arithmetic tokens */
	LINK_CHANGE(TOK_KEYWORD, LEX_STATE_ARITHMETIC),
	LINK_CHANGE(TOK_INT, LEX_STATE_ARITHMETIC),
	LINK_PUSH(SYM_DOLLAR, LEX_STATE_ARITHMETIC, 0),
	LINK_PUSH(SYM_DOLLAR_LEFT_BRACE, LEX_STATE_ARITHMETIC, 0),
	/* two links for the same token: presumably the current state becomes
	 * arithmetic first, then the hashtable state is pushed on top of it
	 * -- confirm against the LINK_* macro definitions */
	LINK_CHANGE(SYM_AT_LEFT_BRACE, LEX_STATE_ARITHMETIC),
	LINK_PUSH(SYM_AT_LEFT_BRACE, LEX_STATE_HASHTABLE, 0),
	LINK_PUSH(SYM_AT, LEX_STATE_ARITHMETIC, 0),
	/* same change-then-push pairing as above; the pushed statement state
	 * is created with STATEMENT_F_DISABLE_KEYWORDS */
	LINK_CHANGE(SYM_LEFT_PAREN, LEX_STATE_ARITHMETIC),
	LINK_PUSH(
		SYM_LEFT_PAREN,
		LEX_STATE_STATEMENT,
		STATEMENT_F_DISABLE_KEYWORDS),

	/* statement tokens */
	LINK_PUSH(SYM_LEFT_BRACE, LEX_STATE_STATEMENT, 0),
	LINK_PUSH(SYM_DOLLAR_LEFT_PAREN, LEX_STATE_STATEMENT, 0),

	/* command tokens */
	/* NOTE(review): KW_FUNC also falls under the TOK_KEYWORD link above;
	 * presumably compare_token_types() scores the exact keyword higher so
	 * that this entry wins -- confirm */
	LINK_CHANGE(KW_FUNC, LEX_STATE_COMMAND),
	LINK_CHANGE(SYM_AMPERSAND, LEX_STATE_COMMAND),
	LINK_CHANGE(TOK_WORD, LEX_STATE_COMMAND),
	LINK_END,
};

/*
 * State type descriptor for LEX_STATE_STATEMENT: tokens are produced by
 * statement_pump_token() and state changes are driven by the link table
 * above.
 */
const struct lex_state_type lex_statement_state = {
	.s_id = LEX_STATE_STATEMENT,
	.s_pump_token = statement_pump_token,
	.s_links = links,
};
/*
 * Outgoing links of the word lexer state. The table holds only the end
 * marker, so no token triggers a table-driven transition out of this state
 * (assuming LINK_END expands to an entry whose token is TOK_NONE -- confirm
 * in lex-internal.h).
 */
static const struct lex_state_link links[] = {
	LINK_END,
};

/*
 * State type descriptor for LEX_STATE_WORD: word_begin()/word_end() are
 * registered as the begin/end hooks and word_pump_token() produces tokens.
 */
const struct lex_state_type lex_word_state = {
	.s_id = LEX_STATE_WORD,
	.s_begin = word_begin,
	.s_end = word_end,
	.s_pump_token = word_pump_token,
	.s_links = links,
};