From 5ea41fcc6ed1e477065465e9cc9b975e21e5d1da Mon Sep 17 00:00:00 2001 From: Max Wash Date: Sun, 10 May 2026 14:18:46 +0100 Subject: [PATCH] parse: lex: re-implement lexer as a state machine to allow more complex scanning behaviour --- bshell/parse/lex.h | 40 +- bshell/parse/lex/arithmetic.c | 136 +++++++ bshell/parse/lex/command.c | 131 +++++++ bshell/parse/lex/expression.c | 134 +++++++ bshell/parse/lex/lex-internal.h | 75 ++++ bshell/parse/{ => lex}/lex.c | 627 ++++++++++++++------------------ bshell/parse/lex/statement.c | 162 +++++++++ bshell/parse/lex/string.c | 136 +++++++ bshell/parse/token.c | 2 + bshell/parse/token.h | 2 + 10 files changed, 1066 insertions(+), 379 deletions(-) create mode 100644 bshell/parse/lex/arithmetic.c create mode 100644 bshell/parse/lex/command.c create mode 100644 bshell/parse/lex/expression.c create mode 100644 bshell/parse/lex/lex-internal.h rename bshell/parse/{ => lex}/lex.c (65%) create mode 100644 bshell/parse/lex/statement.c create mode 100644 bshell/parse/lex/string.c diff --git a/bshell/parse/lex.h b/bshell/parse/lex.h index b974bdd..115240a 100644 --- a/bshell/parse/lex.h +++ b/bshell/parse/lex.h @@ -12,23 +12,21 @@ struct line_source; enum lex_flags { LEX_PRINT_TOKENS = 0x01u, - - /* these flags are for lex_ctx_peek and lex_ctx_claim */ - LEX_ENABLE_KEYWORD = 0x0100u, - LEX_ENABLE_INT = 0x0200u, - LEX_ENABLE_SYMBOL = 0x0400u, }; -enum lex_token_flags { - LEX_TOKEN_ENABLE_IN_STRING = 0x01u, - LEX_TOKEN_ENABLE_IN_WORD = 0x02u, +enum lex_state_type_id { + LEX_STATE_STATEMENT = 0x01u, + LEX_STATE_EXPRESSION = 0x02u, + LEX_STATE_COMMAND = 0x04u, + LEX_STATE_ARITHMETIC = 0x08u, + LEX_STATE_STRING = 0x10u, }; struct lex_token_def { int id; const char *name; uint64_t name_hash; - enum lex_token_flags flags; + enum lex_state_type_id enabled_states; }; struct lex_symbol_node { @@ -39,33 +37,19 @@ struct lex_symbol_node { fx_queue s_children; }; -enum lex_state_type { - LEX_STATE_NORMAL = 0, - LEX_STATE_WORD, - LEX_STATE_STRING, - LEX_STATE_INTERPOLATION, -}; - struct lex_state { - enum lex_state_type s_type; + const struct lex_state_type *s_type; unsigned int s_paren_depth; fx_queue_entry s_entry; + fx_string *s_tempstr; }; struct lex_ctx { enum lex_flags lex_flags; - /* lex_ctx maintains two queues of tokens. - * lex_words is a simple queue of all WORDS scanned by the lexer, - * without any further parsing applied to potentially convert the words - * into TOK_INT, TOK_SYMBOL, etc. - * lex_tokens represent all of the tokens generated by applying the - * aforementioned parsing. - * the two queues are kept in sync such that, as tokens are dequeued - * from one queue, the other queue is moved forward too. */ - fx_queue lex_tokens, lex_words; + fx_queue lex_tokens; struct line_source *lex_src; fx_stringstream *lex_buf; - fx_string *lex_tmp, *lex_wordbuf; + fx_string *lex_tmp; fx_wchar lex_ch; fx_queue lex_state; struct lex_symbol_node *lex_sym_tree; @@ -79,9 +63,7 @@ extern enum bshell_status lex_ctx_init( extern enum bshell_status lex_ctx_cleanup(struct lex_ctx *ctx); extern struct lex_token *lex_ctx_peek(struct lex_ctx *ctx); -extern struct lex_token *lex_ctx_peek_word(struct lex_ctx *ctx); extern struct lex_token *lex_ctx_claim(struct lex_ctx *ctx); -extern struct lex_token *lex_ctx_claim_word(struct lex_ctx *ctx); extern void lex_ctx_discard(struct lex_ctx *ctx); #endif diff --git a/bshell/parse/lex/arithmetic.c b/bshell/parse/lex/arithmetic.c new file mode 100644 index 0000000..7bcf56d --- /dev/null +++ b/bshell/parse/lex/arithmetic.c @@ -0,0 +1,136 @@ +#include "lex-internal.h" + +static enum bshell_status arithmetic_symbol(struct lex_ctx *ctx) +{ + const struct lex_token_def *sym = NULL; + enum bshell_status status = read_symbol(ctx, &sym); + + if (status != BSHELL_SUCCESS) { + return status; + } + + struct lex_token *tok = NULL; + switch (sym->id) { + case SYM_SQUOTE: + status = read_literal_string(ctx, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; + + case SYM_HASH: + return read_line_comment(ctx); + case SYM_DQUOTE: + if (!lex_state_push(ctx, LEX_STATE_STRING)) { + return BSHELL_ERR_NO_MEMORY; + } + + return BSHELL_SUCCESS; + case SYM_DOLLAR: + status = read_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_AT: + status = read_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_DOLLAR_LEFT_BRACE: + status = read_braced_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_AT_LEFT_BRACE: + status = read_braced_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + default: + break; + } + + push_symbol(ctx, sym->id); + + switch (sym->id) { + case SYM_LEFT_PAREN: + lex_state_push(ctx, LEX_STATE_EXPRESSION); + return BSHELL_SUCCESS; + case SYM_DOLLAR_LEFT_PAREN: + lex_state_push(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + case SYM_RIGHT_PAREN: + lex_state_pop(ctx); + return BSHELL_SUCCESS; + case SYM_SEMICOLON: + lex_state_change(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + default: + break; + } + + return BSHELL_SUCCESS; +} + +static enum bshell_status arithmetic_word(struct lex_ctx *ctx) +{ + struct lex_token *word = NULL; + enum bshell_status status = read_word(ctx, &word); + if (status != BSHELL_SUCCESS) { + return status; + } + + bool converted = convert_word_to_keyword(word); + if (!converted) { + converted = convert_word_to_int(word); + } + + enqueue_token(ctx, word); + return BSHELL_SUCCESS; +} + +static enum bshell_status arithmetic_pump_token(struct lex_ctx *ctx) +{ + fx_wchar c = peek_char(ctx); + bool newline = false; + + while (fx_wchar_is_space(c)) { + if (c == '\n') { + newline = true; + } + + advance_char_noread(ctx); + c = peek_char_noread(ctx); + } + + if (newline) { + struct lex_token *tok = lex_token_create(TOK_LINEFEED); + enqueue_token(ctx, tok); + lex_state_change(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + } + + if (char_can_begin_symbol(ctx, c)) { + return arithmetic_symbol(ctx); + } + + return arithmetic_word(ctx); +} + +const struct lex_state_type lex_arithmetic_state = { + .s_id = LEX_STATE_ARITHMETIC, + .s_pump_token = arithmetic_pump_token, +}; diff --git a/bshell/parse/lex/command.c b/bshell/parse/lex/command.c new file mode 100644 index 0000000..62394a8 --- /dev/null +++ b/bshell/parse/lex/command.c @@ -0,0 +1,131 @@ +#include "lex-internal.h" + +static enum bshell_status command_symbol(struct lex_ctx *ctx) +{ + const struct lex_token_def *sym = NULL; + enum bshell_status status = read_symbol(ctx, &sym); + + if (status != BSHELL_SUCCESS) { + return status; + } + + struct lex_token *tok = NULL; + switch (sym->id) { + case SYM_SQUOTE: + status = read_literal_string(ctx, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; + + case SYM_HASH: + return read_line_comment(ctx); + case SYM_DQUOTE: + if (!lex_state_push(ctx, LEX_STATE_STRING)) { + return BSHELL_ERR_NO_MEMORY; + } + + return BSHELL_SUCCESS; + case SYM_DOLLAR: + status = read_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_AT: + status = read_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_DOLLAR_LEFT_BRACE: + status = read_braced_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_AT_LEFT_BRACE: + status = read_braced_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + default: + break; + } + + push_symbol(ctx, sym->id); + + switch (sym->id) { + case SYM_LEFT_PAREN: + lex_state_push(ctx, LEX_STATE_EXPRESSION); + return BSHELL_SUCCESS; + case SYM_DOLLAR_LEFT_PAREN: + lex_state_push(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + case SYM_RIGHT_PAREN: + lex_state_pop(ctx); + return BSHELL_SUCCESS; + case SYM_SEMICOLON: + lex_state_change(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + default: + break; + } + + return BSHELL_SUCCESS; +} + +static enum bshell_status command_word(struct lex_ctx *ctx) +{ + struct lex_token *word = NULL; + enum bshell_status status = read_word(ctx, &word); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, word); + return BSHELL_SUCCESS; +} + +enum bshell_status command_pump_token(struct lex_ctx *ctx) +{ + fx_wchar c = peek_char(ctx); + bool newline = false; + + while (fx_wchar_is_space(c)) { + if (c == '\n') { + newline = true; + } + + advance_char_noread(ctx); + c = peek_char_noread(ctx); + } + + if (newline) { + struct lex_token *tok = lex_token_create(TOK_LINEFEED); + enqueue_token(ctx, tok); + lex_state_change(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + } + + if (char_can_begin_symbol(ctx, c)) { + return command_symbol(ctx); + } + + return command_word(ctx); +} + +const struct lex_state_type lex_command_state = { + .s_id = LEX_STATE_COMMAND, + .s_pump_token = command_pump_token, +}; diff --git a/bshell/parse/lex/expression.c b/bshell/parse/lex/expression.c new file mode 100644 index 0000000..9b37e99 --- /dev/null +++ b/bshell/parse/lex/expression.c @@ -0,0 +1,134 @@ +#include "lex-internal.h" + +static enum bshell_status expression_symbol(struct lex_ctx *ctx) +{ + const struct lex_token_def *sym = NULL; + enum bshell_status status = read_symbol(ctx, &sym); + + if (status != BSHELL_SUCCESS) { + return status; + } + + struct lex_token *tok = NULL; + + switch (sym->id) { + case SYM_DQUOTE: + if (!lex_state_push(ctx, LEX_STATE_STRING)) { + return BSHELL_ERR_NO_MEMORY; + } + + return BSHELL_SUCCESS; + case SYM_DOLLAR: + status = read_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + return status; + case SYM_AT: + status = read_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + return status; + case SYM_DOLLAR_LEFT_BRACE: + status = read_braced_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + return status; + case SYM_AT_LEFT_BRACE: + status = read_braced_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + return status; + default: + break; + } + + push_symbol(ctx, sym->id); + + switch (sym->id) { + case SYM_LEFT_PAREN: + lex_state_push(ctx, LEX_STATE_EXPRESSION); + return BSHELL_SUCCESS; + case SYM_DOLLAR_LEFT_PAREN: + lex_state_push(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + case SYM_RIGHT_PAREN: + lex_state_pop(ctx); + return BSHELL_SUCCESS; + case SYM_SEMICOLON: + lex_state_change(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + default: + break; + } + + return BSHELL_SUCCESS; +} + +static enum bshell_status expression_word(struct lex_ctx *ctx) +{ + struct lex_token *word = NULL; + enum bshell_status status = read_word(ctx, &word); + if (status != BSHELL_SUCCESS) { + return status; + } + + bool converted = convert_word_to_int(word); + + if (converted) { + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + } else { + lex_state_change(ctx, LEX_STATE_COMMAND); + } + + enqueue_token(ctx, word); + return BSHELL_SUCCESS; +} + +static enum bshell_status expression_pump_token(struct lex_ctx *ctx) +{ + fx_wchar c = peek_char(ctx); + bool newline = false; + + while (fx_wchar_is_space(c)) { + if (c == '\n') { + newline = true; + } + + advance_char_noread(ctx); + c = peek_char_noread(ctx); + } + + if (newline) { + struct lex_token *tok = lex_token_create(TOK_LINEFEED); + enqueue_token(ctx, tok); + lex_state_change(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + } + + if (char_can_begin_symbol(ctx, c)) { + return expression_symbol(ctx); + } + + return expression_word(ctx); +} + +const struct lex_state_type lex_expression_state = { + .s_id = LEX_STATE_EXPRESSION, + .s_pump_token = expression_pump_token, +}; diff --git a/bshell/parse/lex/lex-internal.h b/bshell/parse/lex/lex-internal.h new file mode 100644 index 0000000..76afa0b --- /dev/null +++ b/bshell/parse/lex/lex-internal.h @@ -0,0 +1,75 @@ +#ifndef PARSE_LEX_INTERNAL_H_ +#define PARSE_LEX_INTERNAL_H_ + +#include "../../status.h" +#include "../lex.h" +#include "../token.h" + +struct lex_ctx; + +typedef enum bshell_status (*lex_state_pump_token)(struct lex_ctx *); +typedef enum bshell_status (*lex_state_begin)(struct lex_ctx *); +typedef enum bshell_status (*lex_state_end)(struct lex_ctx *); + +struct lex_state_type { + enum lex_state_type_id s_id; + lex_state_pump_token s_pump_token; + lex_state_begin s_begin; + lex_state_end s_end; +}; + +extern enum bshell_status pump_token_statement(struct lex_ctx *ctx); +extern enum bshell_status pump_token_expression(struct lex_ctx *ctx); +extern enum bshell_status pump_token_command(struct lex_ctx *ctx); +extern enum bshell_status pump_token_arithmetic(struct lex_ctx *ctx); +extern enum bshell_status pump_token_string(struct lex_ctx *ctx); + +extern struct lex_state *lex_state_push( + struct lex_ctx *ctx, + enum lex_state_type_id state_type); +extern void lex_state_pop(struct lex_ctx *ctx); +extern struct lex_state *lex_state_get(struct lex_ctx *ctx); +extern void lex_state_change(struct lex_ctx *ctx, enum lex_state_type_id type); +extern fx_string *lex_state_get_tempstr(struct lex_ctx *ctx); + +extern fx_wchar peek_char(struct lex_ctx *ctx); +extern fx_wchar peek_char_noread(struct lex_ctx *ctx); +extern void advance_char(struct lex_ctx *ctx); +extern void advance_char_noread(struct lex_ctx *ctx); + +extern bool string_is_valid_number(const char *s, long long *out); +extern bool convert_word_to_int(struct lex_token *tok); +extern bool convert_word_to_keyword(struct lex_token *tok); + +extern void enqueue_token(struct lex_ctx *ctx, struct lex_token *tok); + +extern enum bshell_status read_word( + struct lex_ctx *ctx, + struct lex_token **out); +extern enum bshell_status read_symbol( + struct lex_ctx *ctx, + const struct lex_token_def **out); +extern enum bshell_status read_literal_string( + struct lex_ctx *ctx, + struct lex_token **out); +extern enum bshell_status read_line_comment(struct lex_ctx *lex); +extern enum bshell_status read_var( + struct lex_ctx *ctx, + enum token_type type, + struct lex_token **out); +extern enum bshell_status read_braced_var( + struct lex_ctx *ctx, + enum token_type type, + struct lex_token **out); + +extern enum bshell_status push_symbol( + struct lex_ctx *ctx, + enum token_symbol sym); + +extern bool char_can_begin_symbol(struct lex_ctx *ctx, char c); +extern bool char_can_begin_symbol_in_state( + struct lex_ctx *ctx, + char c, + enum lex_state_type_id state_type); + +#endif diff --git a/bshell/parse/lex.c b/bshell/parse/lex/lex.c similarity index 65% rename from bshell/parse/lex.c rename to bshell/parse/lex/lex.c index e9a3d12..945f094 100644 --- a/bshell/parse/lex.c +++ b/bshell/parse/lex/lex.c @@ -1,72 +1,91 @@ -#include "lex.h" +#include "../lex.h" -#include "../debug.h" -#include "../line-source.h" -#include "token.h" +#include "../../debug.h" +#include "../../line-source.h" +#include "../token.h" +#include "lex-internal.h" -#define LEX_TOKEN_DEF(i, n) {.id = (i), .name = (n)} -#define LEX_TOKEN_DEF2(i, n, f) {.id = (i), .name = (n), .flags = (f)} +#define LEX_TOKEN_DEF(i, n, s) {.id = (i), .name = (n), .enabled_states = (s)} #define CONVERSION_REQUESTED(flags) \ ((flags) & (LEX_ENABLE_INT | LEX_ENABLE_KEYWORD)) static struct lex_token_def keywords[] = { - LEX_TOKEN_DEF(KW_FUNC, "func"), + LEX_TOKEN_DEF(KW_FUNC, "func", LEX_STATE_STATEMENT), + LEX_TOKEN_DEF(KW_IF, "if", LEX_STATE_STATEMENT), + LEX_TOKEN_DEF(KW_ELSE, "else", LEX_STATE_STATEMENT), }; static const size_t nr_keywords = sizeof keywords / sizeof keywords[0]; +#define LEX_STATES(states) (LEX_STATE_STATEMENT | LEX_STATE_EXPRESSION | states) +#define LEX_STATE_ALL \ + (LEX_STATE_ARITHMETIC | LEX_STATE_STATEMENT | LEX_STATE_COMMAND \ + | LEX_STATE_STRING | LEX_STATE_EXPRESSION) + static struct lex_token_def symbols[] = { - LEX_TOKEN_DEF(SYM_PLUS, "+"), - LEX_TOKEN_DEF(SYM_HYPHEN, "-"), - LEX_TOKEN_DEF(SYM_FORWARD_SLASH, "/"), - LEX_TOKEN_DEF(SYM_ASTERISK, "*"), - LEX_TOKEN_DEF2(SYM_AMPERSAND, "&", LEX_TOKEN_ENABLE_IN_WORD), - LEX_TOKEN_DEF(SYM_PERCENT, "%"), - LEX_TOKEN_DEF(SYM_SQUOTE, "'"), - LEX_TOKEN_DEF2(SYM_DQUOTE, "\"", LEX_TOKEN_ENABLE_IN_STRING), - LEX_TOKEN_DEF(SYM_HASH, "#"), - LEX_TOKEN_DEF2(SYM_DOLLAR, "$", LEX_TOKEN_ENABLE_IN_STRING), - LEX_TOKEN_DEF2(SYM_DOLLAR_LEFT_PAREN, "$(", LEX_TOKEN_ENABLE_IN_STRING), - LEX_TOKEN_DEF2(SYM_DOLLAR_LEFT_BRACE, "${", LEX_TOKEN_ENABLE_IN_STRING), - LEX_TOKEN_DEF(SYM_AT, "@"), - LEX_TOKEN_DEF2(SYM_PIPE, "|", LEX_TOKEN_ENABLE_IN_WORD), - LEX_TOKEN_DEF2(SYM_COMMA, ",", LEX_TOKEN_ENABLE_IN_WORD), - LEX_TOKEN_DEF2(SYM_SEMICOLON, ";", LEX_TOKEN_ENABLE_IN_WORD), - LEX_TOKEN_DEF(SYM_AT_LEFT_BRACE, "@{"), - LEX_TOKEN_DEF2(SYM_LEFT_BRACE, "{", LEX_TOKEN_ENABLE_IN_WORD), - LEX_TOKEN_DEF2(SYM_RIGHT_BRACE, "}", LEX_TOKEN_ENABLE_IN_WORD), - LEX_TOKEN_DEF(SYM_LEFT_BRACKET, "["), - LEX_TOKEN_DEF(SYM_RIGHT_BRACKET, "]"), - LEX_TOKEN_DEF2(SYM_LEFT_PAREN, "(", LEX_TOKEN_ENABLE_IN_WORD), - LEX_TOKEN_DEF2(SYM_RIGHT_PAREN, ")", LEX_TOKEN_ENABLE_IN_WORD), - LEX_TOKEN_DEF(SYM_EQUAL, "="), - LEX_TOKEN_DEF(SYM_PLUS_EQUAL, "+="), - LEX_TOKEN_DEF(SYM_HYPHEN_EQUAL, "-="), - LEX_TOKEN_DEF(SYM_FORWARD_SLASH_EQUAL, "/="), - LEX_TOKEN_DEF(SYM_ASTERISK_EQUAL, "*="), - LEX_TOKEN_DEF(SYM_PERCENT_EQUAL, "%="), + LEX_TOKEN_DEF(SYM_PLUS, "+", LEX_STATES(LEX_STATE_ARITHMETIC)), + LEX_TOKEN_DEF(SYM_HYPHEN, "-", LEX_STATES(LEX_STATE_ARITHMETIC)), + LEX_TOKEN_DEF(SYM_FORWARD_SLASH, "/", LEX_STATES(LEX_STATE_ARITHMETIC)), + LEX_TOKEN_DEF(SYM_ASTERISK, "*", LEX_STATES(LEX_STATE_ARITHMETIC)), + LEX_TOKEN_DEF( + SYM_AMPERSAND, + "&", + LEX_STATES(LEX_STATE_ARITHMETIC | LEX_STATE_COMMAND)), + LEX_TOKEN_DEF(SYM_PERCENT, "%", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF( + SYM_SQUOTE, + "'", + LEX_STATES(LEX_STATE_ARITHMETIC | LEX_STATE_COMMAND)), + LEX_TOKEN_DEF(SYM_DQUOTE, "\"", LEX_STATE_ALL), + LEX_TOKEN_DEF( + SYM_HASH, + "#", + LEX_STATES(LEX_STATE_ARITHMETIC | LEX_STATE_COMMAND)), + LEX_TOKEN_DEF( + SYM_DOLLAR, + "$", + LEX_STATES( + LEX_STATE_ARITHMETIC | LEX_STATE_COMMAND + | LEX_STATE_STRING)), + LEX_TOKEN_DEF(SYM_DOLLAR_LEFT_PAREN, "$(", LEX_STATE_ALL), + LEX_TOKEN_DEF(SYM_DOLLAR_LEFT_BRACE, "${", LEX_STATE_ALL), + LEX_TOKEN_DEF(SYM_AT, "@", LEX_STATE_ALL), + LEX_TOKEN_DEF(SYM_PIPE, "|", LEX_STATE_ALL), + LEX_TOKEN_DEF(SYM_COMMA, ",", LEX_STATE_ALL), + LEX_TOKEN_DEF(SYM_SEMICOLON, ";", LEX_STATE_ALL), + LEX_TOKEN_DEF(SYM_AT_LEFT_BRACE, "@{", LEX_STATE_ALL), + LEX_TOKEN_DEF(SYM_LEFT_BRACE, "{", LEX_STATE_ALL), + LEX_TOKEN_DEF(SYM_RIGHT_BRACE, "}", LEX_STATE_ALL), + LEX_TOKEN_DEF(SYM_LEFT_BRACKET, "[", LEX_STATES(LEX_STATE_ARITHMETIC)), + LEX_TOKEN_DEF(SYM_RIGHT_BRACKET, "]", LEX_STATES(LEX_STATE_ARITHMETIC)), + LEX_TOKEN_DEF(SYM_LEFT_PAREN, "(", LEX_STATE_ALL), + LEX_TOKEN_DEF(SYM_RIGHT_PAREN, ")", LEX_STATE_ALL), + LEX_TOKEN_DEF(SYM_EQUAL, "=", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(SYM_PLUS_EQUAL, "+=", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(SYM_HYPHEN_EQUAL, "-=", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(SYM_FORWARD_SLASH_EQUAL, "/=", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(SYM_ASTERISK_EQUAL, "*=", LEX_STATE_ARITHMETIC), + LEX_TOKEN_DEF(SYM_PERCENT_EQUAL, "%=", LEX_STATE_ARITHMETIC), }; static const size_t nr_symbols = sizeof symbols / sizeof symbols[0]; -typedef enum bshell_status (*pump_token_impl)(struct lex_ctx *); +extern const struct lex_state_type lex_statement_state; +extern const struct lex_state_type lex_expression_state; +extern const struct lex_state_type lex_command_state; +extern const struct lex_state_type lex_arithmetic_state; +extern const struct lex_state_type lex_string_state; -static enum bshell_status do_pump_token_normal(struct lex_ctx *); -static enum bshell_status do_pump_token_string(struct lex_ctx *ctx); -static const pump_token_impl token_pump_functions[] = { - [LEX_STATE_NORMAL] = do_pump_token_normal, - [LEX_STATE_STRING] = do_pump_token_string, - [LEX_STATE_INTERPOLATION] = do_pump_token_normal, +static const struct lex_state_type *state_types[] = { + [LEX_STATE_STATEMENT] = &lex_statement_state, + [LEX_STATE_EXPRESSION] = &lex_expression_state, + [LEX_STATE_COMMAND] = &lex_command_state, + [LEX_STATE_ARITHMETIC] = &lex_arithmetic_state, + [LEX_STATE_STRING] = &lex_string_state, }; -static bool char_can_begin_symbol(struct lex_ctx *ctx, char c); -static bool char_can_begin_symbol_in_context( +struct lex_state *lex_state_push( struct lex_ctx *ctx, - char c, - enum token_type context); - -static struct lex_state *push_lex_state( - struct lex_ctx *ctx, - enum lex_state_type state_type) + enum lex_state_type_id state_type) { struct lex_state *state = malloc(sizeof *state); if (!state) { @@ -75,24 +94,40 @@ static struct lex_state *push_lex_state( memset(state, 0x0, sizeof *state); - state->s_type = state_type; + state->s_type = state_types[state_type]; fx_queue_push_back(&ctx->lex_state, &state->s_entry); + if (state->s_type->s_begin) { + state->s_type->s_begin(ctx); + } + return state; } -static void pop_lex_state(struct lex_ctx *ctx) +void lex_state_pop(struct lex_ctx *ctx) { - fx_queue_entry *entry = fx_queue_pop_back(&ctx->lex_state); - if (!entry) { + fx_queue_entry *entry = fx_queue_last(&ctx->lex_state); + if (!entry || !fx_queue_prev(entry)) { + /* don't pop if this is the root state */ return; } struct lex_state *state = fx_unbox(struct lex_state, entry, s_entry); + + if (state->s_type->s_end) { + state->s_type->s_end(ctx); + } + + fx_queue_pop_back(&ctx->lex_state); + + if (state->s_tempstr) { + fx_string_unref(state->s_tempstr); + } + free(state); } -static struct lex_state *get_lex_state(struct lex_ctx *ctx) +struct lex_state *lex_state_get(struct lex_ctx *ctx) { fx_queue_entry *entry = fx_queue_last(&ctx->lex_state); if (!entry) { @@ -102,6 +137,42 @@ static struct lex_state *get_lex_state(struct lex_ctx *ctx) return fx_unbox(struct lex_state, entry, s_entry); } +void lex_state_change(struct lex_ctx *ctx, enum lex_state_type_id type) +{ + struct lex_state *state = lex_state_get(ctx); + if (!state) { + return; + } + + if (state->s_type->s_end) { + state->s_type->s_end(ctx); + } + + state->s_type = state_types[type]; + + if (state->s_type->s_begin) { + state->s_type->s_begin(ctx); + } +} + +fx_string *lex_state_get_tempstr(struct lex_ctx *ctx) +{ + struct lex_state *state = lex_state_get(ctx); + if (!state) { + return NULL; + } + + if (!state->s_tempstr) { + state->s_tempstr = fx_string_create(); + } + + if (!state->s_tempstr) { + return NULL; + } + + return state->s_tempstr; +} + static struct lex_symbol_node *get_symbol_node( struct lex_symbol_node *node, char c) @@ -201,8 +272,7 @@ enum bshell_status lex_ctx_init( ctx->lex_status = BSHELL_SUCCESS; ctx->lex_buf = fx_stringstream_create(); ctx->lex_sym_tree = build_symbol_tree(); - ctx->lex_wordbuf = fx_string_create(); - push_lex_state(ctx, LEX_STATE_NORMAL); + lex_state_push(ctx, LEX_STATE_STATEMENT); ctx->lex_src = src; ctx->lex_ch = FX_WCHAR_INVALID; @@ -259,22 +329,18 @@ static fx_wchar __peek_char(struct lex_ctx *ctx, bool noread) return ctx->lex_ch; } -static fx_wchar peek_char(struct lex_ctx *ctx) +fx_wchar peek_char(struct lex_ctx *ctx) { return __peek_char(ctx, false); } -static fx_wchar peek_char_noread(struct lex_ctx *ctx) +fx_wchar peek_char_noread(struct lex_ctx *ctx) { return __peek_char(ctx, true); } static void __advance_char(struct lex_ctx *ctx, bool noread) { - if (!fx_wchar_is_space(ctx->lex_ch)) { - fx_string_append_wc(ctx->lex_wordbuf, ctx->lex_ch); - } - if (ctx->lex_ch != FX_WCHAR_INVALID) { ctx->lex_ch = FX_WCHAR_INVALID; return; @@ -296,19 +362,17 @@ static void __advance_char(struct lex_ctx *ctx, bool noread) } } -static void advance_char(struct lex_ctx *ctx) +void advance_char(struct lex_ctx *ctx) { return __advance_char(ctx, false); } -static void advance_char_noread(struct lex_ctx *ctx) +void advance_char_noread(struct lex_ctx *ctx) { return __advance_char(ctx, true); } -static bool convert_word_to_keyword( - const struct lex_token *tok, - struct lex_token *out) +bool convert_word_to_keyword(struct lex_token *tok) { if (!lex_token_has_string_value(tok)) { return false; @@ -320,10 +384,8 @@ static bool convert_word_to_keyword( continue; } - memcpy(out, tok, sizeof *out); - memset(&out->tok_entry, 0x0, sizeof out->tok_entry); - out->tok_type = TOK_KEYWORD; - out->tok_keyword = keywords[i].id; + lex_token_change_type(tok, TOK_KEYWORD); + tok->tok_keyword = keywords[i].id; return true; } @@ -378,7 +440,7 @@ static size_t get_int_multiplier_by_suffix(const char *suffix) return 0; } -static bool string_is_valid_number(const char *s, long long *out) +bool string_is_valid_number(const char *s, long long *out) { int base = get_int_base_by_prefix(&s); @@ -398,9 +460,7 @@ static bool string_is_valid_number(const char *s, long long *out) return false; } -static bool convert_word_to_int( - const struct lex_token *tok, - struct lex_token *out) +bool convert_word_to_int(struct lex_token *tok) { if (!lex_token_has_string_value(tok)) { return false; @@ -408,15 +468,14 @@ static bool convert_word_to_int( const char *s = tok->tok_str; long long value = 0; - bool ok = string_is_valid_number(s, &value); - if (ok) { - memcpy(out, tok, sizeof *out); - memset(&out->tok_entry, 0x0, sizeof out->tok_entry); - out->tok_type = TOK_INT; - out->tok_int = value; + if (!string_is_valid_number(s, &value)) { + return false; } - return ok; + lex_token_change_type(tok, TOK_INT); + tok->tok_int = value; + + return true; } static struct lex_token *get_next_token(struct lex_ctx *ctx) @@ -425,13 +484,7 @@ static struct lex_token *get_next_token(struct lex_ctx *ctx) return fx_unbox(struct lex_token, entry, tok_entry); } -static struct lex_token *get_next_word(struct lex_ctx *ctx) -{ - fx_queue_entry *entry = fx_queue_first(&ctx->lex_words); - return fx_unbox(struct lex_token, entry, tok_entry); -} - -static void enqueue_token(struct lex_ctx *ctx, struct lex_token *tok) +void enqueue_token(struct lex_ctx *ctx, struct lex_token *tok) { if (tok && (ctx->lex_flags & LEX_PRINT_TOKENS)) { print_lex_token(tok); @@ -446,12 +499,6 @@ static struct lex_token *dequeue_next_token(struct lex_ctx *ctx) return fx_unbox(struct lex_token, entry, tok_entry); } -static struct lex_token *dequeue_next_word(struct lex_ctx *ctx) -{ - fx_queue_entry *entry = fx_queue_pop_front(&ctx->lex_words); - return fx_unbox(struct lex_token, entry, tok_entry); -} - static fx_string *get_temp_string(struct lex_ctx *ctx) { if (!ctx->lex_tmp) { @@ -462,9 +509,7 @@ static fx_string *get_temp_string(struct lex_ctx *ctx) return ctx->lex_tmp; } -static enum bshell_status push_symbol( - struct lex_ctx *ctx, - enum token_symbol sym) +enum bshell_status push_symbol(struct lex_ctx *ctx, enum token_symbol sym) { struct lex_token *tok = lex_token_create(TOK_SYMBOL); if (!tok) { @@ -476,82 +521,10 @@ static enum bshell_status push_symbol( return BSHELL_SUCCESS; } -static enum bshell_status read_word(struct lex_ctx *ctx) -{ - fx_string *tmp = get_temp_string(ctx); - bool word_is_number = false; - - bool done = false; - while (!done) { - fx_wchar c = peek_char(ctx); - if (c == FX_WCHAR_INVALID) { - break; - } - - if (fx_wchar_is_space(c)) { - done = true; - break; - } - - if (word_is_number && char_can_begin_symbol(ctx, c)) { - done = true; - break; - } - - if (char_can_begin_symbol_in_context(ctx, c, TOK_WORD)) { - done = true; - break; - } - - switch (c) { - case '{': - case '}': - case '(': - case ')': - case ';': - case ',': - case '|': - case '&': - case '$': - done = true; - break; - default: - break; - } - - if (done) { - break; - } - - fx_string_append_wc(tmp, c); - word_is_number - = string_is_valid_number(fx_string_get_cstr(tmp), NULL); - advance_char(ctx); - } - - if (fx_string_get_size(tmp, FX_STRLEN_NORMAL) == 0) { - if (ctx->lex_status == BSHELL_SUCCESS) { - return BSHELL_ERR_BAD_SYNTAX; - } - - return ctx->lex_status; - } - - struct lex_token *tok = lex_token_create_with_string( - TOK_WORD, - fx_string_get_cstr(tmp)); -#if 0 - bool converted = convert_word_to_keyword(tok); - if (!converted) { - converted = convert_word_to_int(tok); - } -#endif - - enqueue_token(ctx, tok); - return BSHELL_SUCCESS; -} - -static enum bshell_status read_var(struct lex_ctx *ctx, enum token_type type) +enum bshell_status read_var( + struct lex_ctx *ctx, + enum token_type type, + struct lex_token **out) { fx_string *tmp = get_temp_string(ctx); @@ -583,13 +556,14 @@ static enum bshell_status read_var(struct lex_ctx *ctx, enum token_type type) struct lex_token *tok = lex_token_create_with_string(type, fx_string_get_cstr(tmp)); - enqueue_token(ctx, tok); + *out = tok; return BSHELL_SUCCESS; } -static enum bshell_status read_braced_var( +enum bshell_status read_braced_var( struct lex_ctx *ctx, - enum token_type type) + enum token_type type, + struct lex_token **out) { fx_string *tmp = get_temp_string(ctx); bool ok = false; @@ -621,10 +595,11 @@ static enum bshell_status read_braced_var( struct lex_token *tok = lex_token_create_with_string(type, fx_string_get_cstr(tmp)); - enqueue_token(ctx, tok); + *out = tok; return BSHELL_SUCCESS; } +#if 0 static enum bshell_status read_flag(struct lex_ctx *ctx) { fx_string *tmp = get_temp_string(ctx); @@ -691,7 +666,29 @@ static enum bshell_status read_flag(struct lex_ctx *ctx) return BSHELL_SUCCESS; } -static enum bshell_status read_literal_string(struct lex_ctx *ctx) +static enum bshell_status read_interpolation_marker(struct lex_ctx *ctx) +{ + enum bshell_status status = BSHELL_SUCCESS; + struct lex_state *state = lex_state_get(ctx); + + struct lex_token *tok = NULL; + + if (state->s_type != LEX_STATE_STRING) { + return BSHELL_ERR_INTERNAL_FAILURE; + } + + /* start of a new interpolation */ + if (!lex_state_push(ctx, LEX_STATE_STATEMENT)) { + return BSHELL_ERR_NO_MEMORY; + } + + return BSHELL_SUCCESS; +} +#endif + +enum bshell_status read_literal_string( + struct lex_ctx *ctx, + struct lex_token **out) { fx_string *tmp = get_temp_string(ctx); @@ -717,12 +714,12 @@ static enum bshell_status read_literal_string(struct lex_ctx *ctx) struct lex_token *tok = lex_token_create_with_string( TOK_STRING, fx_string_get_cstr(tmp)); - enqueue_token(ctx, tok); + *out = tok; return BSHELL_SUCCESS; } -static enum bshell_status read_line_comment(struct lex_ctx *lex) +enum bshell_status read_line_comment(struct lex_ctx *lex) { while (true) { fx_wchar c = peek_char(lex); @@ -741,16 +738,17 @@ static enum bshell_status read_line_comment(struct lex_ctx *lex) return BSHELL_SUCCESS; } -static enum bshell_status read_dquote_marker(struct lex_ctx *ctx) +#if 0 +enum bshell_status read_dquote_marker(struct lex_ctx *ctx) { enum bshell_status status = BSHELL_SUCCESS; - struct lex_state *state = get_lex_state(ctx); + struct lex_state *state = lex_state_get(ctx); struct lex_token *tok = NULL; if (state->s_type == LEX_STATE_STRING) { /* already within an fstring */ - pop_lex_state(ctx); + lex_state_pop(ctx); tok = lex_token_create(TOK_STR_END); enqueue_token(ctx, tok); return BSHELL_SUCCESS; @@ -760,39 +758,94 @@ static enum bshell_status read_dquote_marker(struct lex_ctx *ctx) tok = lex_token_create(TOK_STR_START); enqueue_token(ctx, tok); - if (!push_lex_state(ctx, LEX_STATE_STRING)) { + if (!lex_state_push(ctx, LEX_STATE_STRING)) { return BSHELL_ERR_NO_MEMORY; } return BSHELL_SUCCESS; } +#endif -static enum bshell_status read_interpolation_marker(struct lex_ctx *ctx) +enum bshell_status read_word(struct lex_ctx *ctx, struct lex_token **out) { - enum bshell_status status = BSHELL_SUCCESS; - struct lex_state *state = get_lex_state(ctx); + fx_string *tmp = get_temp_string(ctx); + bool word_is_number = false; - struct lex_token *tok = NULL; + bool done = false; + while (!done) { + fx_wchar c = peek_char(ctx); + if (c == FX_WCHAR_INVALID) { + break; + } - if (state->s_type != LEX_STATE_STRING) { - return BSHELL_ERR_INTERNAL_FAILURE; + if (fx_wchar_is_space(c)) { + done = true; + break; + } + + if (word_is_number && char_can_begin_symbol(ctx, c)) { + done = true; + break; + } + + if (char_can_begin_symbol(ctx, c)) { + done = true; + break; + } + + switch (c) { + case '{': + case '}': + case '(': + case ')': + case ';': + case ',': + case '|': + case '&': + case '$': + done = true; + break; + default: + break; + } + + if (done) { + break; + } + + fx_string_append_wc(tmp, c); + word_is_number + = string_is_valid_number(fx_string_get_cstr(tmp), NULL); + advance_char(ctx); } - /* start of a new interpolation */ - if (!push_lex_state(ctx, LEX_STATE_INTERPOLATION)) { - return BSHELL_ERR_NO_MEMORY; + if (fx_string_get_size(tmp, FX_STRLEN_NORMAL) == 0) { + if (ctx->lex_status == BSHELL_SUCCESS) { + return BSHELL_ERR_BAD_SYNTAX; + } + + return ctx->lex_status; } + struct lex_token *tok = lex_token_create_with_string( + TOK_WORD, + fx_string_get_cstr(tmp)); +#if 0 + bool converted = convert_word_to_keyword(tok); + if (!converted) { + converted = convert_word_to_int(tok); + } +#endif + + *out = tok; return BSHELL_SUCCESS; } -static enum bshell_status read_symbol(struct lex_ctx *ctx) +enum bshell_status read_symbol( + struct lex_ctx *ctx, + const struct lex_token_def **out) { - struct lex_state *state = get_lex_state(ctx); - enum lex_token_flags required_flags = 0; - if (state->s_type == LEX_STATE_STRING) { - required_flags |= LEX_TOKEN_ENABLE_IN_STRING; - } + struct lex_state *state = lex_state_get(ctx); struct lex_symbol_node *node = ctx->lex_sym_tree; char prev = 0; @@ -805,8 +858,7 @@ static enum bshell_status read_symbol(struct lex_ctx *ctx) struct lex_symbol_node *next = get_symbol_node(node, c); if (!next - || (next->s_def->flags & required_flags) - != required_flags) { + || !(next->s_def->enabled_states & state->s_type->s_id)) { prev = c; break; } @@ -820,6 +872,7 @@ static enum bshell_status read_symbol(struct lex_ctx *ctx) return BSHELL_ERR_BAD_SYNTAX; } +#if 0 struct lex_token *tok = NULL; switch (node->s_def->id) { case SYM_SQUOTE: @@ -829,7 +882,7 @@ static enum bshell_status read_symbol(struct lex_ctx *ctx) case SYM_DOLLAR_LEFT_PAREN: push_symbol(ctx, SYM_DOLLAR_LEFT_PAREN); if (state->s_type == LEX_STATE_STRING) { - push_lex_state(ctx, LEX_STATE_INTERPOLATION); + lex_state_push(ctx, LEX_STATE_STRING); } break; case SYM_DOLLAR_LEFT_BRACE: @@ -838,18 +891,11 @@ static enum bshell_status read_symbol(struct lex_ctx *ctx) return read_line_comment(ctx); case SYM_LEFT_PAREN: push_symbol(ctx, SYM_LEFT_PAREN); - state->s_paren_depth++; + lex_state_push(ctx, LEX_STATE_EXPRESSION); break; case SYM_RIGHT_PAREN: push_symbol(ctx, SYM_RIGHT_PAREN); - - if (state->s_type == LEX_STATE_INTERPOLATION - && state->s_paren_depth == 0) { - pop_lex_state(ctx); - } else { - state->s_paren_depth--; - } - + lex_state_pop(ctx); break; case SYM_DOLLAR: return read_var(ctx, TOK_VAR); @@ -859,70 +905,40 @@ static enum bshell_status read_symbol(struct lex_ctx *ctx) push_symbol(ctx, node->s_def->id); break; } - +#endif + *out = node->s_def; return BSHELL_SUCCESS; } -static bool char_can_begin_symbol(struct lex_ctx *ctx, char c) +bool char_can_begin_symbol_in_state( + struct lex_ctx *ctx, + char c, + enum lex_state_type_id state_type) { - struct lex_state *state = get_lex_state(ctx); - enum lex_token_flags required_flags = 0; - if (state->s_type == LEX_STATE_STRING) { - required_flags |= LEX_TOKEN_ENABLE_IN_STRING; - } - for (size_t i = 0; i < nr_symbols; i++) { if (symbols[i].name[0] != c) { continue; } - if ((symbols[i].flags & required_flags) != required_flags) { - continue; + if (symbols[i].enabled_states & state_type) { + return true; } - - return true; } return false; } -static bool char_can_begin_symbol_in_context( - struct lex_ctx *ctx, - char c, - enum token_type context) +bool char_can_begin_symbol(struct lex_ctx *ctx, char c) { - enum lex_token_flags required_flags = 0; - switch (context) { - case TOK_WORD: - required_flags = LEX_TOKEN_ENABLE_IN_WORD; - break; - case TOK_STRING: - required_flags = LEX_TOKEN_ENABLE_IN_STRING; - break; - default: - break; - } - - for (size_t i = 0; i < nr_symbols; i++) { - if (symbols[i].name[0] != c) { - continue; - } - - if ((symbols[i].flags & required_flags) != required_flags) { - continue; - } - - return true; - } - - return false; + struct lex_state *state = lex_state_get(ctx); + return char_can_begin_symbol_in_state(ctx, c, state->s_type->s_id); } static enum bshell_status read_string_content(struct lex_ctx *ctx) { fx_wchar c = FX_WCHAR_INVALID; fx_string *str = get_temp_string(ctx); - struct lex_state *state = get_lex_state(ctx); + struct lex_state *state = lex_state_get(ctx); if (!str) { return BSHELL_ERR_NO_MEMORY; @@ -955,6 +971,7 @@ static enum bshell_status read_string_content(struct lex_ctx *ctx) return BSHELL_SUCCESS; } +#if 0 static enum bshell_status do_pump_token_string(struct lex_ctx *ctx) { fx_wchar c = peek_char(ctx); @@ -973,32 +990,13 @@ static enum bshell_status do_pump_token_string(struct lex_ctx *ctx) return status; } -static void flush_wordbuf(struct lex_ctx *ctx) -{ - if (fx_string_get_size(ctx->lex_wordbuf, FX_STRLEN_NORMAL) == 0) { - return; - } - - struct lex_token *tok = lex_token_create_with_string( - TOK_WORD, - fx_string_get_cstr(ctx->lex_wordbuf)); - fx_queue_push_back(&ctx->lex_words, &tok->tok_entry); - fx_string_clear(ctx->lex_wordbuf); -} - static enum bshell_status do_pump_token_normal(struct lex_ctx *ctx) { enum bshell_status status = BSHELL_SUCCESS; fx_wchar c = peek_char(ctx); - bool whitespace = false; bool newline = false; - if (fx_wchar_is_space(c)) { - flush_wordbuf(ctx); - whitespace = true; - } - while (fx_wchar_is_space(c)) { if (c == '\n') { newline = true; @@ -1011,9 +1009,6 @@ static enum bshell_status do_pump_token_normal(struct lex_ctx *ctx) if (newline) { struct lex_token *tok = lex_token_create(TOK_LINEFEED); enqueue_token(ctx, tok); - } - - if (whitespace) { return BSHELL_SUCCESS; } @@ -1027,26 +1022,19 @@ static enum bshell_status do_pump_token_normal(struct lex_ctx *ctx) return read_word(ctx); } +#endif static enum bshell_status pump_tokens(struct lex_ctx *ctx) { enum bshell_status status = BSHELL_SUCCESS; - while (fx_queue_empty(&ctx->lex_words) && status == BSHELL_SUCCESS) { - struct lex_state *state = get_lex_state(ctx); - pump_token_impl impl = token_pump_functions[state->s_type]; - - status = impl(ctx); + while (fx_queue_empty(&ctx->lex_tokens) && status == BSHELL_SUCCESS) { + struct lex_state *state = lex_state_get(ctx); + status = state->s_type->s_pump_token(ctx); } return status; } -static bool any_tokens_available(struct lex_ctx *ctx) -{ - return !fx_queue_empty(&ctx->lex_tokens) - || !fx_queue_empty(&ctx->lex_words); -} - static void discard_all_tokens(struct lex_ctx *ctx) { fx_queue_entry *cur = fx_queue_first(&ctx->lex_tokens); @@ -1063,17 +1051,6 @@ static void discard_all_tokens(struct lex_ctx *ctx) } } -static void discard_all_words(struct lex_ctx *ctx) -{ - fx_queue_entry *cur = fx_queue_pop_front(&ctx->lex_words); - while (cur) { - struct lex_token *tok - = fx_unbox(struct lex_token, cur, tok_entry); - lex_token_destroy(tok); - cur = fx_queue_pop_front(&ctx->lex_words); - } -} - struct lex_token *lex_ctx_peek(struct lex_ctx *ctx) { struct lex_token *tok = get_next_token(ctx); @@ -1081,21 +1058,6 @@ struct lex_token *lex_ctx_peek(struct lex_ctx *ctx) return tok; } - discard_all_words(ctx); - pump_tokens(ctx); - tok = get_next_token(ctx); - - return tok; -} - -struct lex_token *lex_ctx_peek_word(struct lex_ctx *ctx) -{ - struct lex_token *tok = get_next_word(ctx); - if (tok) { - return tok; - } - - discard_all_tokens(ctx); pump_tokens(ctx); tok = get_next_token(ctx); @@ -1106,13 +1068,6 @@ struct lex_token *lex_ctx_claim(struct lex_ctx *ctx) { struct lex_token *tok = dequeue_next_token(ctx); if (tok) { - struct lex_token *tmp = get_next_token(ctx); - - if (tmp && tmp->tok_type == TOK_LINEFEED) { - tmp = dequeue_next_word(ctx); - lex_token_destroy(tmp); - } - return tok; } @@ -1125,39 +1080,11 @@ struct lex_token *lex_ctx_claim(struct lex_ctx *ctx) return dequeue_next_token(ctx); } -struct lex_token *lex_ctx_claim_word(struct lex_ctx *ctx) -{ - /* since we're claiming the whole word, discard any sub-tokens already - * generated up to the next linefeed */ - discard_all_tokens(ctx); - struct lex_token *tok = dequeue_next_word(ctx); - if (tok) { - return tok; - } - - if (fx_queue_empty(&ctx->lex_words)) { - pump_tokens(ctx); - - tok = get_next_token(ctx); - } - - return dequeue_next_word(ctx); -} - void lex_ctx_discard(struct lex_ctx *ctx) { struct lex_token *tok = dequeue_next_token(ctx); if (tok) { lex_token_destroy(tok); - tok = get_next_token(ctx); - - /* if the next token is a linefeed, we've reached the end - * of the current word, and should discard it */ - if (tok && tok->tok_type == TOK_LINEFEED) { - tok = dequeue_next_word(ctx); - lex_token_destroy(tok); - } - return; } diff --git a/bshell/parse/lex/statement.c b/bshell/parse/lex/statement.c new file mode 100644 index 0000000..7e8fad7 --- /dev/null +++ b/bshell/parse/lex/statement.c @@ -0,0 +1,162 @@ +#include "lex-internal.h" + +static enum bshell_status statement_symbol(struct lex_ctx *ctx) +{ + const struct lex_token_def *sym = NULL; + enum bshell_status status = read_symbol(ctx, &sym); + + if (status != BSHELL_SUCCESS) { + return status; + } + + struct lex_token *tok = NULL; + switch (sym->id) { + case SYM_SQUOTE: + status = read_literal_string(ctx, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; + + case SYM_HASH: + return read_line_comment(ctx); + case SYM_DQUOTE: + if (!lex_state_push(ctx, LEX_STATE_STRING)) { + return BSHELL_ERR_NO_MEMORY; + } + + return BSHELL_SUCCESS; + case SYM_DOLLAR: + if (!lex_state_push(ctx, LEX_STATE_ARITHMETIC)) { + return BSHELL_ERR_NO_MEMORY; + } + + status = read_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_AT: + if (!lex_state_push(ctx, LEX_STATE_ARITHMETIC)) { + return BSHELL_ERR_NO_MEMORY; + } + + status = read_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_DOLLAR_LEFT_BRACE: + if (!lex_state_push(ctx, LEX_STATE_ARITHMETIC)) { + return BSHELL_ERR_NO_MEMORY; + } + + status = read_braced_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_AT_LEFT_BRACE: + if (!lex_state_push(ctx, LEX_STATE_ARITHMETIC)) { + return BSHELL_ERR_NO_MEMORY; + } + + status = read_braced_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + default: + break; + } + + push_symbol(ctx, sym->id); + + switch (sym->id) { + case SYM_LEFT_PAREN: + lex_state_push(ctx, LEX_STATE_EXPRESSION); + return BSHELL_SUCCESS; + case SYM_LEFT_BRACE: + case SYM_DOLLAR_LEFT_PAREN: + lex_state_push(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + case SYM_RIGHT_PAREN: + case SYM_RIGHT_BRACE: + lex_state_pop(ctx); + return BSHELL_SUCCESS; + default: + break; + } + + if (sym->enabled_states & LEX_STATE_COMMAND) { + lex_state_change(ctx, LEX_STATE_COMMAND); + } else if (sym->enabled_states & LEX_STATE_ARITHMETIC) { + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + } + + return BSHELL_SUCCESS; +} + +static enum bshell_status statement_word(struct lex_ctx *ctx) +{ + struct lex_token *word = NULL; + enum bshell_status status = read_word(ctx, &word); + if (status != BSHELL_SUCCESS) { + return status; + } + + bool converted = convert_word_to_keyword(word); + if (!converted) { + converted = convert_word_to_int(word); + } + + if (converted) { + lex_state_change(ctx, LEX_STATE_ARITHMETIC); + } else { + lex_state_change(ctx, LEX_STATE_COMMAND); + } + + enqueue_token(ctx, word); + return BSHELL_SUCCESS; +} + +static enum bshell_status statement_pump_token(struct lex_ctx *ctx) +{ + fx_wchar c = peek_char(ctx); + bool newline = false; + + while (fx_wchar_is_space(c)) { + if (c == '\n') { + newline = true; + } + + advance_char_noread(ctx); + c = peek_char_noread(ctx); + } + + if (newline) { + struct lex_token *tok = lex_token_create(TOK_LINEFEED); + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; + } + + if (char_can_begin_symbol(ctx, c)) { + return statement_symbol(ctx); + } + + return statement_word(ctx); +} + +const struct lex_state_type lex_statement_state = { + .s_id = LEX_STATE_STATEMENT, + .s_pump_token = statement_pump_token, +}; diff --git a/bshell/parse/lex/string.c b/bshell/parse/lex/string.c new file mode 100644 index 0000000..1b986bc --- /dev/null +++ b/bshell/parse/lex/string.c @@ -0,0 +1,136 @@ +#include "lex-internal.h" + +static enum bshell_status string_symbol(struct lex_ctx *ctx) +{ + const struct lex_token_def *sym = NULL; + enum bshell_status status = read_symbol(ctx, &sym); + + if (status != BSHELL_SUCCESS) { + return status; + } + + struct lex_token *tok = NULL; + + switch (sym->id) { + case SYM_DOLLAR_LEFT_PAREN: + status = push_symbol(ctx, sym->id); + if (status != BSHELL_SUCCESS) { + return status; + } + + lex_state_push(ctx, LEX_STATE_STATEMENT); + return BSHELL_SUCCESS; + case SYM_DQUOTE: + lex_state_pop(ctx); + return BSHELL_SUCCESS; + case SYM_DOLLAR: + status = read_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_AT: + status = read_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_DOLLAR_LEFT_BRACE: + status = read_braced_var(ctx, TOK_VAR, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + case SYM_AT_LEFT_BRACE: + status = read_braced_var(ctx, TOK_VAR_SPLAT, &tok); + if (status != BSHELL_SUCCESS) { + return status; + } + + enqueue_token(ctx, tok); + return status; + default: + break; + } + + return BSHELL_ERR_BAD_SYNTAX; +} + +static enum bshell_status string_content(struct lex_ctx *ctx) +{ + fx_wchar c = FX_WCHAR_INVALID; + fx_string *temp = lex_state_get_tempstr(ctx); + fx_string_clear(temp); + + while (1) { + c = peek_char(ctx); + if (c == FX_WCHAR_INVALID) { + /* EOF without end of string */ + ctx->lex_status = BSHELL_ERR_BAD_SYNTAX; + } + + if (char_can_begin_symbol(ctx, c)) { + break; + } + + fx_string_append_wc(temp, c); + advance_char(ctx); + } + + if (fx_string_get_size(temp, FX_STRLEN_NORMAL) == 0) { + return BSHELL_SUCCESS; + } + + struct lex_token *tok = lex_token_create_with_string( + TOK_STRING, + fx_string_get_cstr(temp)); + enqueue_token(ctx, tok); + + return BSHELL_SUCCESS; +} + +static enum bshell_status string_begin(struct lex_ctx *ctx) +{ + struct lex_token *tok = lex_token_create(TOK_STR_START); + if (!tok) { + return BSHELL_ERR_NO_MEMORY; + } + + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; +} + +static enum bshell_status string_end(struct lex_ctx *ctx) +{ + struct lex_token *tok = lex_token_create(TOK_STR_END); + if (!tok) { + return BSHELL_ERR_NO_MEMORY; + } + + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; +} + +static enum bshell_status string_pump_token(struct lex_ctx *ctx) +{ + fx_wchar c = peek_char(ctx); + + if (char_can_begin_symbol(ctx, c)) { + return string_symbol(ctx); + } + + return string_content(ctx); +} + +const struct lex_state_type lex_string_state = { + .s_id = LEX_STATE_STRING, + .s_begin = string_begin, + .s_end = string_end, + .s_pump_token = string_pump_token, +}; diff --git a/bshell/parse/token.c b/bshell/parse/token.c index fed6184..f3934bb 100644 --- a/bshell/parse/token.c +++ b/bshell/parse/token.c @@ -105,6 +105,8 @@ const char *token_keyword_to_string(enum token_keyword keyword) switch (keyword) { ENUM_STR(KW_NONE); ENUM_STR(KW_FUNC); + ENUM_STR(KW_IF); + ENUM_STR(KW_ELSE); default: return ""; } diff --git a/bshell/parse/token.h b/bshell/parse/token.h index bf6009c..dc3ac6b 100644 --- a/bshell/parse/token.h +++ b/bshell/parse/token.h @@ -30,6 +30,8 @@ enum token_keyword { KW_NONE = 0, __KW_INDEX_BASE = 200, KW_FUNC, + KW_IF, + KW_ELSE, __KW_INDEX_LIMIT, };