From ffdb28ba2216d41b3103052e8424ac403c2186ba Mon Sep 17 00:00:00 2001 From: Max Wash Date: Sun, 10 May 2026 19:14:24 +0100 Subject: [PATCH] parse: lex: replace expression scanner with statement; implement complex-word scanner also fix a bunch of scanning edge-cases --- bshell/parse/lex.h | 21 +- bshell/parse/lex/arithmetic.c | 47 +++- bshell/parse/lex/command.c | 47 +++- bshell/parse/lex/expression.c | 134 ----------- bshell/parse/lex/lex-internal.h | 19 ++ bshell/parse/lex/lex.c | 398 ++++++++++++++++---------------- bshell/parse/lex/statement.c | 157 ++++++++++++- bshell/parse/lex/word.c | 161 +++++++++++++ bshell/parse/token.c | 54 +++++ bshell/parse/token.h | 44 +++- 10 files changed, 737 insertions(+), 345 deletions(-) delete mode 100644 bshell/parse/lex/expression.c create mode 100644 bshell/parse/lex/word.c diff --git a/bshell/parse/lex.h b/bshell/parse/lex.h index 5154e1b..e05232e 100644 --- a/bshell/parse/lex.h +++ b/bshell/parse/lex.h @@ -2,24 +2,34 @@ #define LEX_H_ #include "../status.h" +#include "token.h" #include #include #include -struct lex_token; struct line_source; enum lex_flags { LEX_PRINT_TOKENS = 0x01u, }; +enum lex_token_flags { + /* a token with this flag not only interrupts the word currently being + * scanned, but also stops multi-words */ + LEX_TOKEN_TERMINATES_WORD = 0x01u, + /* a token with this flag can appear at the start of an arithmetic + * expression. 
a statement that encounters this token as its first char + * will switch to arithmetic mode */ + LEX_TOKEN_UNARY_ARITHMETIC = 0x02u, +}; + enum lex_state_type_id { LEX_STATE_STATEMENT = 0x01u, - LEX_STATE_EXPRESSION = 0x02u, - LEX_STATE_COMMAND = 0x04u, - LEX_STATE_ARITHMETIC = 0x08u, - LEX_STATE_STRING = 0x10u, + LEX_STATE_COMMAND = 0x02u, + LEX_STATE_ARITHMETIC = 0x04u, + LEX_STATE_STRING = 0x08u, + LEX_STATE_WORD = 0x10u, }; struct lex_token_def { @@ -27,6 +37,7 @@ struct lex_token_def { const char *name; uint64_t name_hash; enum lex_state_type_id enabled_states; + enum lex_token_flags flags; }; struct lex_symbol_node { diff --git a/bshell/parse/lex/arithmetic.c b/bshell/parse/lex/arithmetic.c index 5279f03..66aea73 100644 --- a/bshell/parse/lex/arithmetic.c +++ b/bshell/parse/lex/arithmetic.c @@ -1,5 +1,49 @@ #include "lex-internal.h" +static enum bshell_status arithmetic_hyphen(struct lex_ctx *ctx) +{ + fx_wchar c = peek_char(ctx); + if (!fx_wchar_is_alnum(c)) { + push_symbol(ctx, SYM_HYPHEN); + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + return BSHELL_SUCCESS; + } + + struct lex_token *tok = NULL; + enum bshell_status status = read_word( + ctx, + READ_NO_SET_TOKEN_START | READ_APPEND_HYPHEN, + &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + bool converted = convert_word_to_int(tok); + + if (converted) { + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + /* because of APPEND_HYPHEN (which is needed to ensure operator + * tokens are detected properly), the resulting number will be + * negative. 
+ * this token will be preceded by a HYPHEN token, so the number + * must be positive */ + tok->tok_int *= -1; + push_symbol(ctx, SYM_HYPHEN); + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; + } + + converted = convert_word_to_operator(ctx, tok); + if (converted) { + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + } else { + lex_state_change(ctx, LEX_STATE_COMMAND); + } + + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; +} + static enum bshell_status arithmetic_symbol(struct lex_ctx *ctx) { const struct lex_token_def *sym = NULL; @@ -18,7 +62,8 @@ static enum bshell_status arithmetic_symbol(struct lex_ctx *ctx) } enqueue_token(ctx, tok); return BSHELL_SUCCESS; - + case SYM_HYPHEN: + return arithmetic_hyphen(ctx); case SYM_HASH: return read_line_comment(ctx); case SYM_DQUOTE: diff --git a/bshell/parse/lex/command.c b/bshell/parse/lex/command.c index d524f1c..3178e81 100644 --- a/bshell/parse/lex/command.c +++ b/bshell/parse/lex/command.c @@ -1,5 +1,26 @@ #include "lex-internal.h" +static bool char_can_continue_word(struct lex_ctx *ctx, fx_wchar c) +{ + if (fx_wchar_is_alnum(c)) { + return true; + } + + if (fx_wchar_is_space(c)) { + return false; + } + + if (c == '$') { + return true; + } + + if (char_can_begin_symbol_in_state(ctx, c, LEX_STATE_WORD)) { + return false; + } + + return true; +} + static enum bshell_status command_symbol(struct lex_ctx *ctx) { const struct lex_token_def *sym = NULL; @@ -33,6 +54,10 @@ static enum bshell_status command_symbol(struct lex_ctx *ctx) return status; } + if (char_can_continue_word(ctx, peek_char(ctx))) { + lex_state_push(ctx, LEX_STATE_WORD, 0); + } + enqueue_token(ctx, tok); return status; case SYM_AT: @@ -49,6 +74,10 @@ static enum bshell_status command_symbol(struct lex_ctx *ctx) return status; } + if (char_can_continue_word(ctx, peek_char(ctx))) { + lex_state_push(ctx, LEX_STATE_WORD, 0); + } + enqueue_token(ctx, tok); return status; case SYM_AT_LEFT_BRACE: @@ -91,11 +120,27 @@ static enum bshell_status 
command_symbol(struct lex_ctx *ctx) static enum bshell_status command_word(struct lex_ctx *ctx) { struct lex_token *word = NULL; - enum bshell_status status = read_word(ctx, &word); + enum bshell_status status + = read_word(ctx, READ_NO_NUMBER_RECOGNITION, &word); if (status != BSHELL_SUCCESS) { return status; } + bool continue_word = false; + + fx_wchar c = peek_char(ctx); + if (char_can_begin_symbol_in_state(ctx, c, LEX_STATE_WORD)) { + continue_word = true; + } + + if (char_has_flags(ctx, c, LEX_TOKEN_TERMINATES_WORD)) { + continue_word = false; + } + + if (continue_word) { + lex_state_push(ctx, LEX_STATE_WORD, 0); + } + enqueue_token(ctx, word); return BSHELL_SUCCESS; } diff --git a/bshell/parse/lex/expression.c b/bshell/parse/lex/expression.c deleted file mode 100644 index 9b37e99..0000000 --- a/bshell/parse/lex/expression.c +++ /dev/null @@ -1,134 +0,0 @@ -#include "lex-internal.h" - -static enum bshell_status expression_symbol(struct lex_ctx *ctx) -{ - const struct lex_token_def *sym = NULL; - enum bshell_status status = read_symbol(ctx, &sym); - - if (status != BSHELL_SUCCESS) { - return status; - } - - struct lex_token *tok = NULL; - - switch (sym->id) { - case SYM_DQUOTE: - if (!lex_state_push(ctx, LEX_STATE_STRING)) { - return BSHELL_ERR_NO_MEMORY; - } - - return BSHELL_SUCCESS; - case SYM_DOLLAR: - status = read_var(ctx, TOK_VAR, &tok); - if (status != BSHELL_SUCCESS) { - return status; - } - - enqueue_token(ctx, tok); - lex_state_change(ctx, LEX_STATE_ARITHMETIC); - return status; - case SYM_AT: - status = read_var(ctx, TOK_VAR_SPLAT, &tok); - if (status != BSHELL_SUCCESS) { - return status; - } - - enqueue_token(ctx, tok); - lex_state_change(ctx, LEX_STATE_ARITHMETIC); - return status; - case SYM_DOLLAR_LEFT_BRACE: - status = read_braced_var(ctx, TOK_VAR, &tok); - if (status != BSHELL_SUCCESS) { - return status; - } - - enqueue_token(ctx, tok); - lex_state_change(ctx, LEX_STATE_ARITHMETIC); - return status; - case SYM_AT_LEFT_BRACE: - status = 
read_braced_var(ctx, TOK_VAR_SPLAT, &tok); - if (status != BSHELL_SUCCESS) { - return status; - } - - enqueue_token(ctx, tok); - lex_state_change(ctx, LEX_STATE_ARITHMETIC); - return status; - default: - break; - } - - push_symbol(ctx, sym->id); - - switch (sym->id) { - case SYM_LEFT_PAREN: - lex_state_push(ctx, LEX_STATE_EXPRESSION); - return BSHELL_SUCCESS; - case SYM_DOLLAR_LEFT_PAREN: - lex_state_push(ctx, LEX_STATE_STATEMENT); - return BSHELL_SUCCESS; - case SYM_RIGHT_PAREN: - lex_state_pop(ctx); - return BSHELL_SUCCESS; - case SYM_SEMICOLON: - lex_state_change(ctx, LEX_STATE_STATEMENT); - return BSHELL_SUCCESS; - default: - break; - } - - return BSHELL_SUCCESS; -} - -static enum bshell_status expression_word(struct lex_ctx *ctx) -{ - struct lex_token *word = NULL; - enum bshell_status status = read_word(ctx, &word); - if (status != BSHELL_SUCCESS) { - return status; - } - - bool converted = convert_word_to_int(word); - - if (converted) { - lex_state_change(ctx, LEX_STATE_ARITHMETIC); - } else { - lex_state_change(ctx, LEX_STATE_COMMAND); - } - - enqueue_token(ctx, word); - return BSHELL_SUCCESS; -} - -static enum bshell_status expression_pump_token(struct lex_ctx *ctx) -{ - fx_wchar c = peek_char(ctx); - bool newline = false; - - while (fx_wchar_is_space(c)) { - if (c == '\n') { - newline = true; - } - - advance_char_noread(ctx); - c = peek_char_noread(ctx); - } - - if (newline) { - struct lex_token *tok = lex_token_create(TOK_LINEFEED); - enqueue_token(ctx, tok); - lex_state_change(ctx, LEX_STATE_STATEMENT); - return BSHELL_SUCCESS; - } - - if (char_can_begin_symbol(ctx, c)) { - return expression_symbol(ctx); - } - - return expression_word(ctx); -} - -const struct lex_state_type lex_expression_state = { - .s_id = LEX_STATE_EXPRESSION, - .s_pump_token = expression_pump_token, -}; diff --git a/bshell/parse/lex/lex-internal.h b/bshell/parse/lex/lex-internal.h index 5405a04..90ed2d7 100644 --- a/bshell/parse/lex/lex-internal.h +++ 
b/bshell/parse/lex/lex-internal.h @@ -11,6 +11,12 @@ enum state_flags { STATEMENT_F_DISABLE_KEYWORDS = 0x01u, }; +enum read_flags { + READ_APPEND_HYPHEN = 0x01u, + READ_NO_SET_TOKEN_START = 0x02u, + READ_NO_NUMBER_RECOGNITION = 0x04u, +}; + typedef enum bshell_status (*lex_state_pump_token)(struct lex_ctx *); typedef enum bshell_status (*lex_state_begin)(struct lex_ctx *); typedef enum bshell_status (*lex_state_end)(struct lex_ctx *); @@ -42,12 +48,17 @@ extern fx_string *lex_state_get_tempstr(struct lex_ctx *ctx); extern fx_wchar peek_char(struct lex_ctx *ctx); extern fx_wchar peek_char_noread(struct lex_ctx *ctx); +extern fx_wchar peek2_char(struct lex_ctx *ctx); +extern fx_wchar peek2_char_noread(struct lex_ctx *ctx); extern void advance_char(struct lex_ctx *ctx); extern void advance_char_noread(struct lex_ctx *ctx); extern bool string_is_valid_number(const char *s, long long *out); extern bool convert_word_to_int(struct lex_token *tok); extern bool convert_word_to_keyword(struct lex_token *tok); +extern bool convert_word_to_operator( + struct lex_ctx *ctx, + struct lex_token *tok); extern void enqueue_token(struct lex_ctx *ctx, struct lex_token *tok); extern void enqueue_token_with_coordinates( @@ -58,6 +69,7 @@ extern void enqueue_token_with_coordinates( extern enum bshell_status read_word( struct lex_ctx *ctx, + enum read_flags flags, struct lex_token **out); extern enum bshell_status read_symbol( struct lex_ctx *ctx, @@ -84,5 +96,12 @@ extern bool char_can_begin_symbol_in_state( struct lex_ctx *ctx, char c, enum lex_state_type_id state_type); +extern bool char_has_flags( + struct lex_ctx *ctx, + char c, + enum lex_token_flags flags); +extern enum token_operator get_operator_with_string( + struct lex_ctx *ctx, + const char *s); #endif diff --git a/bshell/parse/lex/lex.c b/bshell/parse/lex/lex.c index 4a0edda..c17278a 100644 --- a/bshell/parse/lex/lex.c +++ b/bshell/parse/lex/lex.c @@ -6,6 +6,8 @@ #include "lex-internal.h" #define LEX_TOKEN_DEF(i, n, s) {.id = 
(i), .name = (n), .enabled_states = (s)} +#define LEX_TOKEN_DEF2(i, n, s, f) \ + {.id = (i), .name = (n), .enabled_states = (s), .flags = (f)} #define CONVERSION_REQUESTED(flags) \ ((flags) & (LEX_ENABLE_INT | LEX_ENABLE_KEYWORD)) @@ -17,20 +19,68 @@ static struct lex_token_def keywords[] = { }; static const size_t nr_keywords = sizeof keywords / sizeof keywords[0]; -#define LEX_STATES(states) (LEX_STATE_STATEMENT | LEX_STATE_EXPRESSION | states) +static struct lex_token_def operators[] = { + LEX_TOKEN_DEF(OP_BAND, "-band", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_BOR, "-bor", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_BXOR, "-bxor", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF( + OP_BNOT, + "-bnot", + LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_SHL, "-shl", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_SHR, "-shr", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_EQ, "-eq", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_NE, "-ne", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_GT, "-gt", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_LT, "-lt", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_GE, "-ge", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_LE, "-le", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_MATCH, "-match", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_NOTMATCH, "-notmatch", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_REPLACE, "-replace", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_LIKE, "-like", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_NOTLIKE, "-notlike", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_CONTAINS, "-contains", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_NOTCONTAINS, "-notcontains", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_AND, "-and", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_OR, "-or", LEX_STATE_STATEMENT | LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_XOR, "-xor", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF( + OP_NOT, + "-not", + LEX_STATE_STATEMENT | 
LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_SPLIT, "-split", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_JOIN, "-join", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_IS, "-is", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_ISNOT, "-isnot", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(OP_AS, "-as", LEX_STATE_ARITHMETIC), +}; +static const size_t nr_operators = sizeof operators / sizeof operators[0]; + +#define LEX_STATES(states) (LEX_STATE_STATEMENT | states) #define LEX_STATE_ALL \ (LEX_STATE_ARITHMETIC | LEX_STATE_STATEMENT | LEX_STATE_COMMAND \ - | LEX_STATE_STRING | LEX_STATE_EXPRESSION) + | LEX_STATE_STRING | LEX_STATE_WORD) static struct lex_token_def symbols[] = { - LEX_TOKEN_DEF(SYM_PLUS, "+", LEX_STATES(LEX_STATE_ARITHMETIC)), - LEX_TOKEN_DEF(SYM_HYPHEN, "-", LEX_STATES(LEX_STATE_ARITHMETIC)), - LEX_TOKEN_DEF(SYM_FORWARD_SLASH, "/", LEX_STATES(LEX_STATE_ARITHMETIC)), - LEX_TOKEN_DEF(SYM_ASTERISK, "*", LEX_STATES(LEX_STATE_ARITHMETIC)), + LEX_TOKEN_DEF2( + SYM_PLUS, + "+", + LEX_STATE_ARITHMETIC, + LEX_TOKEN_UNARY_ARITHMETIC), + LEX_TOKEN_DEF2( + SYM_HYPHEN, + "-", + LEX_STATES(LEX_STATE_ARITHMETIC), + LEX_TOKEN_UNARY_ARITHMETIC), + LEX_TOKEN_DEF(SYM_FORWARD_SLASH, "/", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(SYM_ASTERISK, "*", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF( SYM_AMPERSAND, "&", - LEX_STATES(LEX_STATE_ARITHMETIC | LEX_STATE_COMMAND)), + LEX_STATES( + LEX_STATE_ARITHMETIC | LEX_STATE_COMMAND + | LEX_STATE_WORD)), LEX_TOKEN_DEF(SYM_PERCENT, "%", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF( SYM_SQUOTE, @@ -40,26 +90,62 @@ static struct lex_token_def symbols[] = { LEX_TOKEN_DEF( SYM_HASH, "#", - LEX_STATES(LEX_STATE_ARITHMETIC | LEX_STATE_COMMAND)), - LEX_TOKEN_DEF( - SYM_DOLLAR, - "$", LEX_STATES( LEX_STATE_ARITHMETIC | LEX_STATE_COMMAND - | LEX_STATE_STRING)), - LEX_TOKEN_DEF(SYM_DOLLAR_LEFT_PAREN, "$(", LEX_STATE_ALL), - LEX_TOKEN_DEF(SYM_DOLLAR_LEFT_BRACE, "${", LEX_STATE_ALL), + | LEX_STATE_WORD)), + LEX_TOKEN_DEF2( + SYM_DOLLAR, + "$", + LEX_STATE_ARITHMETIC | 
LEX_STATE_COMMAND | LEX_STATE_STRING + | LEX_STATE_WORD, + LEX_TOKEN_UNARY_ARITHMETIC), + LEX_TOKEN_DEF2( + SYM_DOLLAR_LEFT_PAREN, + "$(", + LEX_STATE_ARITHMETIC | LEX_STATE_COMMAND | LEX_STATE_STRING + | LEX_STATE_WORD, + LEX_TOKEN_UNARY_ARITHMETIC), + LEX_TOKEN_DEF2( + SYM_DOLLAR_LEFT_BRACE, + "${", + LEX_STATE_ARITHMETIC | LEX_STATE_COMMAND | LEX_STATE_STRING + | LEX_STATE_WORD, + LEX_TOKEN_UNARY_ARITHMETIC), LEX_TOKEN_DEF(SYM_AT, "@", LEX_STATE_ALL), - LEX_TOKEN_DEF(SYM_PIPE, "|", LEX_STATE_ALL), - LEX_TOKEN_DEF(SYM_COMMA, ",", LEX_STATE_ALL), - LEX_TOKEN_DEF(SYM_SEMICOLON, ";", LEX_STATE_ALL), + LEX_TOKEN_DEF2(SYM_PIPE, "|", LEX_STATE_ALL, LEX_TOKEN_TERMINATES_WORD), + LEX_TOKEN_DEF2( + SYM_COMMA, + ",", + LEX_STATE_ALL, + LEX_TOKEN_TERMINATES_WORD), + LEX_TOKEN_DEF2( + SYM_SEMICOLON, + ";", + LEX_STATE_ALL, + LEX_TOKEN_TERMINATES_WORD), LEX_TOKEN_DEF(SYM_AT_LEFT_BRACE, "@{", LEX_STATE_ALL), - LEX_TOKEN_DEF(SYM_LEFT_BRACE, "{", LEX_STATE_ALL), - LEX_TOKEN_DEF(SYM_RIGHT_BRACE, "}", LEX_STATE_ALL), + LEX_TOKEN_DEF2( + SYM_LEFT_BRACE, + "{", + LEX_STATE_ALL, + LEX_TOKEN_TERMINATES_WORD), + LEX_TOKEN_DEF2( + SYM_RIGHT_BRACE, + "}", + LEX_STATE_ALL, + LEX_TOKEN_TERMINATES_WORD), LEX_TOKEN_DEF(SYM_LEFT_BRACKET, "[", LEX_STATES(LEX_STATE_ARITHMETIC)), LEX_TOKEN_DEF(SYM_RIGHT_BRACKET, "]", LEX_STATES(LEX_STATE_ARITHMETIC)), - LEX_TOKEN_DEF(SYM_LEFT_PAREN, "(", LEX_STATE_ALL), - LEX_TOKEN_DEF(SYM_RIGHT_PAREN, ")", LEX_STATE_ALL), + LEX_TOKEN_DEF2( + SYM_LEFT_PAREN, + "(", + LEX_STATE_ALL, + LEX_TOKEN_TERMINATES_WORD), + LEX_TOKEN_DEF2( + SYM_RIGHT_PAREN, + ")", + LEX_STATE_ALL, + LEX_TOKEN_TERMINATES_WORD), LEX_TOKEN_DEF(SYM_EQUAL, "=", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF(SYM_PLUS_EQUAL, "+=", LEX_STATE_ARITHMETIC), LEX_TOKEN_DEF(SYM_HYPHEN_EQUAL, "-=", LEX_STATE_ARITHMETIC), @@ -70,17 +156,17 @@ static struct lex_token_def symbols[] = { static const size_t nr_symbols = sizeof symbols / sizeof symbols[0]; extern const struct lex_state_type lex_statement_state; -extern 
const struct lex_state_type lex_expression_state; extern const struct lex_state_type lex_command_state; extern const struct lex_state_type lex_arithmetic_state; extern const struct lex_state_type lex_string_state; +extern const struct lex_state_type lex_word_state; static const struct lex_state_type *state_types[] = { [LEX_STATE_STATEMENT] = &lex_statement_state, - [LEX_STATE_EXPRESSION] = &lex_expression_state, [LEX_STATE_COMMAND] = &lex_command_state, [LEX_STATE_ARITHMETIC] = &lex_arithmetic_state, [LEX_STATE_STRING] = &lex_string_state, + [LEX_STATE_WORD] = &lex_word_state, }; void set_token_start(struct lex_ctx *ctx) @@ -411,6 +497,22 @@ bool convert_word_to_keyword(struct lex_token *tok) return false; } +bool convert_word_to_operator(struct lex_ctx *ctx, struct lex_token *tok) +{ + if (!lex_token_has_string_value(tok)) { + return false; + } + + enum token_operator op = get_operator_with_string(ctx, tok->tok_str); + if (op == OP_NONE) { + return false; + } + + lex_token_change_type(tok, TOK_OPERATOR); + tok->tok_operator = op; + return true; +} + static int get_int_base_by_prefix(const char **s) { #define CH(x) (tolower(value[x])) @@ -461,6 +563,10 @@ static size_t get_int_multiplier_by_suffix(const char *suffix) bool string_is_valid_number(const char *s, long long *out) { + if (s[0] == '\0') { + return false; + } + int base = get_int_base_by_prefix(&s); char *ep = NULL; @@ -636,93 +742,6 @@ enum bshell_status read_braced_var( return BSHELL_SUCCESS; } -#if 0 -static enum bshell_status read_flag(struct lex_ctx *ctx) -{ - fx_string *tmp = get_temp_string(ctx); - - bool done = false; - while (!done) { - fx_wchar c = peek_char(ctx); - if (c == FX_WCHAR_INVALID) { - break; - } - - if (fx_wchar_is_space(c)) { - break; - } - - switch (c) { - case '{': - case '}': - case '(': - case ')': - case ';': - case ',': - case '|': - case '&': - case '$': - done = true; - break; - default: - break; - } - - if (done) { - break; - } - - fx_string_append_wc(tmp, c); - 
advance_char(ctx); - } - - struct lex_token *tok = NULL; - if (fx_string_get_size(tmp, FX_STRLEN_NORMAL) == 1) { - tok = lex_token_create(TOK_SYMBOL); - tok->tok_symbol = SYM_HYPHEN; - } else { - tok = lex_token_create_with_string( - TOK_FLAG, - fx_string_get_cstr(tmp)); - } - - if (!tok) { - return BSHELL_ERR_NO_MEMORY; - } - -#if 0 - if (convert_word_to_int(tok)) { - tok->tok_int *= -1; - struct lex_token *prefix = lex_token_create(TOK_SYMBOL); - prefix->tok_symbol = SYM_HYPHEN; - enqueue_token(ctx, prefix); - } -#endif - - enqueue_token(ctx, tok); - return BSHELL_SUCCESS; -} - -static enum bshell_status read_interpolation_marker(struct lex_ctx *ctx) -{ - enum bshell_status status = BSHELL_SUCCESS; - struct lex_state *state = lex_state_get(ctx); - - struct lex_token *tok = NULL; - - if (state->s_type != LEX_STATE_STRING) { - return BSHELL_ERR_INTERNAL_FAILURE; - } - - /* start of a new interpolation */ - if (!lex_state_push(ctx, LEX_STATE_STATEMENT)) { - return BSHELL_ERR_NO_MEMORY; - } - - return BSHELL_SUCCESS; -} -#endif - enum bshell_status read_literal_string( struct lex_ctx *ctx, struct lex_token **out) @@ -776,39 +795,25 @@ enum bshell_status read_line_comment(struct lex_ctx *lex) return BSHELL_SUCCESS; } -#if 0 -enum bshell_status read_dquote_marker(struct lex_ctx *ctx) -{ - enum bshell_status status = BSHELL_SUCCESS; - struct lex_state *state = lex_state_get(ctx); - - struct lex_token *tok = NULL; - - if (state->s_type == LEX_STATE_STRING) { - /* already within an fstring */ - lex_state_pop(ctx); - tok = lex_token_create(TOK_STR_END); - enqueue_token(ctx, tok); - return BSHELL_SUCCESS; - } - - /* start of a new fstring */ - tok = lex_token_create(TOK_STR_START); - enqueue_token(ctx, tok); - - if (!lex_state_push(ctx, LEX_STATE_STRING)) { - return BSHELL_ERR_NO_MEMORY; - } - - return BSHELL_SUCCESS; -} -#endif - -enum bshell_status read_word(struct lex_ctx *ctx, struct lex_token **out) +enum bshell_status read_word( + struct lex_ctx *ctx, + enum 
read_flags flags, + struct lex_token **out) { fx_string *tmp = get_temp_string(ctx); bool word_is_number = false; + if (!(flags & READ_NO_SET_TOKEN_START)) { + set_token_start(ctx); + } + + if (flags & READ_APPEND_HYPHEN) { + fx_string_append_c(tmp, '-'); + } + + bool number_recog = !(flags & READ_NO_NUMBER_RECOGNITION); + + enum token_operator op = OP_NONE; bool done = false; while (!done) { fx_wchar c = peek_char(ctx); @@ -821,39 +826,32 @@ enum bshell_status read_word(struct lex_ctx *ctx, struct lex_token **out) break; } - if (word_is_number && char_can_begin_symbol(ctx, c)) { - done = true; - break; - } - if (char_can_begin_symbol(ctx, c)) { done = true; break; } - switch (c) { - case '{': - case '}': - case '(': - case ')': - case ';': - case ',': - case '|': - case '&': - case '$': - done = true; - break; - default: - break; + const char *s = fx_string_get_cstr(tmp); + if (number_recog && string_is_valid_number(s, NULL)) { + if (char_can_begin_symbol_in_state( + ctx, + c, + LEX_STATE_ARITHMETIC)) { + done = true; + break; + } } - if (done) { - break; + if (!fx_wchar_is_alpha(c)) { + op = get_operator_with_string(ctx, s); + if (op != OP_NONE) { + done = true; + break; + } } fx_string_append_wc(tmp, c); - word_is_number - = string_is_valid_number(fx_string_get_cstr(tmp), NULL); + set_token_end(ctx); advance_char(ctx); } @@ -868,12 +866,6 @@ enum bshell_status read_word(struct lex_ctx *ctx, struct lex_token **out) struct lex_token *tok = lex_token_create_with_string( TOK_WORD, fx_string_get_cstr(tmp)); -#if 0 - bool converted = convert_word_to_keyword(tok); - if (!converted) { - converted = convert_word_to_int(tok); - } -#endif *out = tok; return BSHELL_SUCCESS; @@ -912,40 +904,6 @@ enum bshell_status read_symbol( return BSHELL_ERR_BAD_SYNTAX; } -#if 0 - struct lex_token *tok = NULL; - switch (node->s_def->id) { - case SYM_SQUOTE: - return read_literal_string(ctx); - case SYM_DQUOTE: - return read_dquote_marker(ctx); - case SYM_DOLLAR_LEFT_PAREN: - 
push_symbol(ctx, SYM_DOLLAR_LEFT_PAREN); - if (state->s_type == LEX_STATE_STRING) { - lex_state_push(ctx, LEX_STATE_STRING); - } - break; - case SYM_DOLLAR_LEFT_BRACE: - return read_braced_var(ctx, TOK_VAR); - case SYM_HASH: - return read_line_comment(ctx); - case SYM_LEFT_PAREN: - push_symbol(ctx, SYM_LEFT_PAREN); - lex_state_push(ctx, LEX_STATE_EXPRESSION); - break; - case SYM_RIGHT_PAREN: - push_symbol(ctx, SYM_RIGHT_PAREN); - lex_state_pop(ctx); - break; - case SYM_DOLLAR: - return read_var(ctx, TOK_VAR); - case SYM_AT: - return read_var(ctx, TOK_VAR_SPLAT); - default: - push_symbol(ctx, node->s_def->id); - break; - } -#endif *out = node->s_def; return BSHELL_SUCCESS; } @@ -974,6 +932,42 @@ bool char_can_begin_symbol(struct lex_ctx *ctx, char c) return char_can_begin_symbol_in_state(ctx, c, state->s_type->s_id); } +bool char_has_flags( + struct lex_ctx *ctx, + char c, + enum lex_token_flags flags) +{ + for (size_t i = 0; i < nr_symbols; i++) { + if (symbols[i].name[0] != c) { + continue; + } + + return (symbols[i].flags & flags) == flags; + } + + return false; +} + +enum token_operator get_operator_with_string(struct lex_ctx *ctx, const char *s) +{ + struct lex_state *state = lex_state_get(ctx); + + for (size_t i = 0; i < nr_operators; i++) { + const char *op_str = operators[i].name; + if (strcmp(op_str, s) != 0) { + continue; + } + + if (!(operators[i].enabled_states & state->s_type->s_id)) { + continue; + } + + return operators[i].id; + } + + return OP_NONE; +} + static enum bshell_status read_string_content(struct lex_ctx *ctx) { fx_wchar c = FX_WCHAR_INVALID; diff --git a/bshell/parse/lex/statement.c b/bshell/parse/lex/statement.c index 9b35133..a7911c2 100644 --- a/bshell/parse/lex/statement.c +++ b/bshell/parse/lex/statement.c @@ -1,5 +1,126 @@ #include "lex-internal.h" +#if 0 +#define APPEND_HYPHEN 0x8000u + +static enum bshell_status __read_word( + struct lex_ctx *ctx, + int flags, + struct lex_token **out) +{ + fx_string *tmp = 
lex_state_get_tempstr(ctx); + fx_string_clear(tmp); + bool word_is_number = false; + + if (flags & APPEND_HYPHEN) { + fx_string_append_c(tmp, '-'); + } + + if (!(flags & READ_NO_SET_TOKEN_START)) { + set_token_start(ctx); + } + + enum token_operator op = OP_NONE; + + bool done = false; + while (!done) { + fx_wchar c = peek_char(ctx); + if (c == FX_WCHAR_INVALID) { + break; + } + + if (fx_wchar_is_space(c)) { + done = true; + break; + } + + if (char_has_flags(ctx, c, LEX_TOKEN_TERMINATES_WORD)) { + done = true; + break; + } + + if (char_can_begin_symbol(ctx, c)) { + op = get_operator_with_string( + ctx, + fx_string_get_cstr(tmp)); + if (op != OP_NONE) { + done = true; + break; + } + } + + fx_string_append_wc(tmp, c); + set_token_end(ctx); + advance_char(ctx); + } + + if (fx_string_get_size(tmp, FX_STRLEN_NORMAL) == 0) { + if (ctx->lex_status == BSHELL_SUCCESS) { + return BSHELL_ERR_BAD_SYNTAX; + } + + return ctx->lex_status; + } + + struct lex_token *tok = NULL; + if (op != OP_NONE) { + tok = lex_token_create(TOK_OPERATOR); + tok->tok_operator = op; + } else { + tok = lex_token_create_with_string( + TOK_WORD, + fx_string_get_cstr(tmp)); + } + + *out = tok; + return BSHELL_SUCCESS; +} +#endif + +static enum bshell_status statement_hyphen(struct lex_ctx *ctx) +{ + fx_wchar c = peek_char(ctx); + if (!fx_wchar_is_alnum(c)) { + push_symbol(ctx, SYM_HYPHEN); + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + return BSHELL_SUCCESS; + } + + struct lex_token *tok = NULL; + enum bshell_status status = read_word( + ctx, + READ_NO_SET_TOKEN_START | READ_APPEND_HYPHEN, + &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + bool converted = convert_word_to_int(tok); + + if (converted) { + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + /* because of APPEND_HYPHEN (which is needed to ensure operator + * tokens are detected properly), the resulting number will be + * negative. 
+ * this token will be preceded by a HYPHEN token, so the number + * must be positive */ + tok->tok_int *= -1; + push_symbol(ctx, SYM_HYPHEN); + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; + } + + converted = convert_word_to_operator(ctx, tok); + if (converted) { + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + } else { + lex_state_change(ctx, LEX_STATE_COMMAND); + } + + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; +} + static enum bshell_status statement_symbol(struct lex_ctx *ctx) { const struct lex_token_def *sym = NULL; @@ -11,6 +132,8 @@ static enum bshell_status statement_symbol(struct lex_ctx *ctx) struct lex_token *tok = NULL; switch (sym->id) { + case SYM_HYPHEN: + return statement_hyphen(ctx); case SYM_SQUOTE: status = read_literal_string(ctx, &tok); if (status != BSHELL_SUCCESS) { @@ -117,7 +240,14 @@ static enum bshell_status statement_word(struct lex_ctx *ctx) return status; } - bool converted = convert_word_to_keyword(word); + struct lex_state *state = lex_state_get(ctx); + + bool converted = false; + + if (!(state->s_flags & STATEMENT_F_DISABLE_KEYWORDS)) { + converted = convert_word_to_keyword(word); + } + if (!converted) { converted = convert_word_to_int(word); } @@ -154,10 +284,35 @@ static enum bshell_status statement_pump_token(struct lex_ctx *ctx) return BSHELL_SUCCESS; } +#if 0 + if (char_can_begin_symbol_in_state(ctx, c, LEX_STATE_ARITHMETIC)) { + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + return BSHELL_SUCCESS; + } + + if (char_can_begin_symbol_in_state(ctx, c, LEX_STATE_COMMAND)) { + lex_state_change(ctx, LEX_STATE_COMMAND); + return BSHELL_SUCCESS; + } +#endif + if (char_can_begin_symbol(ctx, c)) { return statement_symbol(ctx); } + if (char_has_flags(ctx, c, LEX_TOKEN_UNARY_ARITHMETIC)) { + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + return BSHELL_SUCCESS; + } + +#if 0 + if (fx_wchar_is_number(c)) { + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + } else { + lex_state_change(ctx, LEX_STATE_COMMAND); + } +#endif + return 
statement_word(ctx); } diff --git a/bshell/parse/lex/word.c b/bshell/parse/lex/word.c new file mode 100644 index 0000000..63d5b4e --- /dev/null +++ b/bshell/parse/lex/word.c @@ -0,0 +1,162 @@ +#include "lex-internal.h" + +static enum bshell_status word_symbol(struct lex_ctx *ctx) +{ + const struct lex_token_def *sym = NULL; + enum bshell_status status = read_symbol(ctx, &sym); + + if (status != BSHELL_SUCCESS) { + return status; + } + + struct lex_token *tok = NULL; + + switch (sym->id) { + case SYM_DOLLAR_LEFT_PAREN: + status = push_symbol(ctx, sym->id); + if (status != BSHELL_SUCCESS) { + return status; + } + + lex_state_push(ctx, LEX_STATE_STATEMENT, 0); + return BSHELL_SUCCESS; + case SYM_RIGHT_PAREN: + lex_state_pop(ctx); + + status = push_symbol(ctx, sym->id); + if (status != BSHELL_SUCCESS) { + return status; + } + return BSHELL_SUCCESS; + case SYM_DOLLAR: + status = read_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_AT: + status = read_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_DOLLAR_LEFT_BRACE: + status = read_braced_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_AT_LEFT_BRACE: + status = read_braced_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + default: + break; + } + + return BSHELL_ERR_BAD_SYNTAX; +} + +static enum bshell_status word_content(struct lex_ctx *ctx) +{ + fx_wchar c = FX_WCHAR_INVALID; + fx_string *temp = lex_state_get_tempstr(ctx); + set_token_start(ctx); + fx_string_clear(temp); + + while (1) { + c = peek_char(ctx); + if (c == FX_WCHAR_INVALID) { + /* EOF without end of word: flag the error and stop scanning */ + ctx->lex_status = BSHELL_ERR_BAD_SYNTAX; + break; + } + + if (fx_wchar_is_space(c)) { + break; + } + + if 
(char_can_begin_symbol(ctx, c)) { + break; + } + + fx_string_append_wc(temp, c); + set_token_end(ctx); + advance_char(ctx); + } + + if (fx_string_get_size(temp, FX_STRLEN_NORMAL) == 0) { + return BSHELL_SUCCESS; + } + + struct lex_token *tok = lex_token_create_with_string( + TOK_WORD, + fx_string_get_cstr(temp)); + enqueue_token(ctx, tok); + + return BSHELL_SUCCESS; +} + +static enum bshell_status word_begin(struct lex_ctx *ctx) +{ + struct lex_token *tok = lex_token_create(TOK_WORD_START); + if (!tok) { + return BSHELL_ERR_NO_MEMORY; + } + + enqueue_token_with_coordinates( + ctx, + tok, + &ctx->lex_start, + &ctx->lex_start); + return BSHELL_SUCCESS; +} + +static enum bshell_status word_end(struct lex_ctx *ctx) +{ + struct lex_token *tok = lex_token_create(TOK_WORD_END); + if (!tok) { + return BSHELL_ERR_NO_MEMORY; + } + + enqueue_token_with_coordinates(ctx, tok, &ctx->lex_end, &ctx->lex_end); + return BSHELL_SUCCESS; +} + +static enum bshell_status word_pump_token(struct lex_ctx *ctx) +{ + fx_wchar c = peek_char(ctx); + + if (fx_wchar_is_space(c)) { + lex_state_pop(ctx); + return BSHELL_SUCCESS; + } + + if (char_has_flags(ctx, c, LEX_TOKEN_TERMINATES_WORD)) { + lex_state_pop(ctx); + return BSHELL_SUCCESS; + } + + if (char_can_begin_symbol(ctx, c)) { + return word_symbol(ctx); + } + + return word_content(ctx); +} + +const struct lex_state_type lex_word_state = { + .s_id = LEX_STATE_WORD, + .s_begin = word_begin, + .s_end = word_end, + .s_pump_token = word_pump_token, +}; diff --git a/bshell/parse/token.c b/bshell/parse/token.c index f3934bb..197822a 100644 --- a/bshell/parse/token.c +++ b/bshell/parse/token.c @@ -75,6 +75,19 @@ struct lex_token *lex_token_change_type( return tok; } +void lex_token_change_string(struct lex_token *tok, const char *s) +{ + if (!lex_token_has_string_value(tok)) { + return; + } + + if (tok->tok_str) { + free(tok->tok_str); + } + + tok->tok_str = fx_strdup(s); +} + #define ENUM_STR(x) \ case x: \ return #x @@ -88,6 +101,9 @@ const char 
*token_type_to_string(enum token_type type) ENUM_STR(TOK_INT); ENUM_STR(TOK_DOUBLE); ENUM_STR(TOK_WORD); + ENUM_STR(TOK_WORD_START); + ENUM_STR(TOK_WORD_END); + ENUM_STR(TOK_OPERATOR); ENUM_STR(TOK_VAR); ENUM_STR(TOK_VAR_SPLAT); ENUM_STR(TOK_FLAG); @@ -148,3 +164,41 @@ const char *token_symbol_to_string(enum token_symbol sym) return ""; } } + +const char *token_operator_to_string(enum token_operator op) +{ + switch (op) { + ENUM_STR(OP_BAND); + ENUM_STR(OP_BOR); + ENUM_STR(OP_BXOR); + ENUM_STR(OP_BNOT); + ENUM_STR(OP_SHL); + ENUM_STR(OP_SHR); + ENUM_STR(OP_EQ); + ENUM_STR(OP_NE); + ENUM_STR(OP_GT); + ENUM_STR(OP_LT); + ENUM_STR(OP_GE); + ENUM_STR(OP_LE); + ENUM_STR(OP_MATCH); + ENUM_STR(OP_NOTMATCH); + ENUM_STR(OP_REPLACE); + ENUM_STR(OP_LIKE); + ENUM_STR(OP_NOTLIKE); + ENUM_STR(OP_IN); + ENUM_STR(OP_NOTIN); + ENUM_STR(OP_CONTAINS); + ENUM_STR(OP_NOTCONTAINS); + ENUM_STR(OP_AND); + ENUM_STR(OP_OR); + ENUM_STR(OP_XOR); + ENUM_STR(OP_NOT); + ENUM_STR(OP_SPLIT); + ENUM_STR(OP_JOIN); + ENUM_STR(OP_IS); + ENUM_STR(OP_ISNOT); + ENUM_STR(OP_AS); + default: + return ""; + } +} diff --git a/bshell/parse/token.h b/bshell/parse/token.h index dc3ac6b..c1898ea 100644 --- a/bshell/parse/token.h +++ b/bshell/parse/token.h @@ -16,7 +16,10 @@ enum token_type { TOK_INT, TOK_DOUBLE, TOK_WORD, + TOK_WORD_START, + TOK_WORD_END, TOK_FLAG, + TOK_OPERATOR, TOK_VAR, TOK_VAR_SPLAT, TOK_STRING, @@ -35,9 +38,45 @@ enum token_keyword { __KW_INDEX_LIMIT, }; +enum token_operator { + OP_NONE = 0, + __OP_INDEX_BASE = 300, + OP_BAND, + OP_BOR, + OP_BXOR, + OP_BNOT, + OP_SHL, + OP_SHR, + OP_EQ, + OP_NE, + OP_GT, + OP_LT, + OP_GE, + OP_LE, + OP_MATCH, + OP_NOTMATCH, + OP_REPLACE, + OP_LIKE, + OP_NOTLIKE, + OP_IN, + OP_NOTIN, + OP_CONTAINS, + OP_NOTCONTAINS, + OP_AND, + OP_OR, + OP_XOR, + OP_NOT, + OP_SPLIT, + OP_JOIN, + OP_IS, + OP_ISNOT, + OP_AS, + __OP_INDEX_LIMIT, +}; + enum token_symbol { SYM_NONE = 0, - __SYM_INDEX_BASE = 300, + __SYM_INDEX_BASE = 400, SYM_PLUS, SYM_HYPHEN, SYM_FORWARD_SLASH, @@ 
-80,6 +119,7 @@ struct lex_token { union { enum token_keyword tok_keyword; enum token_symbol tok_symbol; + enum token_operator tok_operator; long long tok_int; double tok_double; char *tok_str; @@ -95,6 +135,7 @@ extern void lex_token_destroy(struct lex_token *tok); extern struct lex_token *lex_token_change_type( struct lex_token *tok, enum token_type new_type); +extern void lex_token_change_string(struct lex_token *tok, const char *s); static inline bool lex_token_is_symbol( struct lex_token *tok, @@ -129,5 +170,6 @@ static inline bool lex_token_has_string_value(const struct lex_token *tok) extern const char *token_type_to_string(enum token_type type); extern const char *token_keyword_to_string(enum token_keyword keyword); extern const char *token_symbol_to_string(enum token_symbol sym); +extern const char *token_operator_to_string(enum token_operator op); #endif