From 7d95d57f98ee0c7958c7eed5e171d6081be3f888 Mon Sep 17 00:00:00 2001 From: Max Wash Date: Sat, 9 May 2026 19:00:02 +0100 Subject: [PATCH] bshell: first-pass implementation of a syntax lexer/parser --- bshell/parse/lex.c | 1149 +++++++++++++++++++++++++++++++++ bshell/parse/lex.h | 82 +++ bshell/parse/parse.c | 30 + bshell/parse/parse.h | 21 + bshell/parse/syntax.h | 53 ++ bshell/parse/syntax/arith.c | 11 + bshell/parse/syntax/command.c | 385 +++++++++++ bshell/parse/syntax/expr.c | 15 + bshell/parse/syntax/generic.c | 135 ++++ bshell/parse/token.c | 148 +++++ bshell/parse/token.h | 131 ++++ 11 files changed, 2160 insertions(+) create mode 100644 bshell/parse/lex.c create mode 100644 bshell/parse/lex.h create mode 100644 bshell/parse/parse.c create mode 100644 bshell/parse/parse.h create mode 100644 bshell/parse/syntax.h create mode 100644 bshell/parse/syntax/arith.c create mode 100644 bshell/parse/syntax/command.c create mode 100644 bshell/parse/syntax/expr.c create mode 100644 bshell/parse/syntax/generic.c create mode 100644 bshell/parse/token.c create mode 100644 bshell/parse/token.h diff --git a/bshell/parse/lex.c b/bshell/parse/lex.c new file mode 100644 index 0000000..073c486 --- /dev/null +++ b/bshell/parse/lex.c @@ -0,0 +1,1149 @@ +#include "lex.h" + +#include "../debug.h" +#include "../line-source.h" +#include "token.h" + +#define LEX_TOKEN_DEF(i, n) {.id = (i), .name = (n)} +#define LEX_TOKEN_DEF2(i, n, f) {.id = (i), .name = (n), .flags = (f)} + +#define CONVERSION_REQUESTED(flags) \ + ((flags) & (LEX_ENABLE_INT | LEX_ENABLE_KEYWORD)) + +static struct lex_token_def keywords[] = { + LEX_TOKEN_DEF(KW_FUNC, "func"), +}; +static const size_t nr_keywords = sizeof keywords / sizeof keywords[0]; + +static struct lex_token_def symbols[] = { + LEX_TOKEN_DEF(SYM_PLUS, "+"), + LEX_TOKEN_DEF(SYM_HYPHEN, "-"), + LEX_TOKEN_DEF(SYM_FORWARD_SLASH, "/"), + LEX_TOKEN_DEF(SYM_ASTERISK, "*"), + LEX_TOKEN_DEF2(SYM_AMPERSAND, "&", LEX_TOKEN_ENABLE_IN_WORD), + 
LEX_TOKEN_DEF(SYM_PERCENT, "%"), + LEX_TOKEN_DEF(SYM_SQUOTE, "'"), + LEX_TOKEN_DEF2(SYM_DQUOTE, "\"", LEX_TOKEN_ENABLE_IN_STRING), + LEX_TOKEN_DEF(SYM_HASH, "#"), + LEX_TOKEN_DEF2(SYM_DOLLAR, "$", LEX_TOKEN_ENABLE_IN_STRING), + LEX_TOKEN_DEF2(SYM_DOLLAR_LEFT_PAREN, "$(", LEX_TOKEN_ENABLE_IN_STRING), + LEX_TOKEN_DEF2(SYM_DOLLAR_LEFT_BRACE, "${", LEX_TOKEN_ENABLE_IN_STRING), + LEX_TOKEN_DEF(SYM_AT, "@"), + LEX_TOKEN_DEF2(SYM_PIPE, "|", LEX_TOKEN_ENABLE_IN_WORD), + LEX_TOKEN_DEF2(SYM_COMMA, ",", LEX_TOKEN_ENABLE_IN_WORD), + LEX_TOKEN_DEF2(SYM_SEMICOLON, ";", LEX_TOKEN_ENABLE_IN_WORD), + LEX_TOKEN_DEF(SYM_AT_LEFT_BRACE, "@{"), + LEX_TOKEN_DEF2(SYM_LEFT_BRACE, "{", LEX_TOKEN_ENABLE_IN_WORD), + LEX_TOKEN_DEF2(SYM_RIGHT_BRACE, "}", LEX_TOKEN_ENABLE_IN_WORD), + LEX_TOKEN_DEF(SYM_LEFT_BRACKET, "["), + LEX_TOKEN_DEF(SYM_RIGHT_BRACKET, "]"), + LEX_TOKEN_DEF2(SYM_LEFT_PAREN, "(", LEX_TOKEN_ENABLE_IN_WORD), + LEX_TOKEN_DEF2(SYM_RIGHT_PAREN, ")", LEX_TOKEN_ENABLE_IN_WORD), + LEX_TOKEN_DEF(SYM_EQUAL, "="), + LEX_TOKEN_DEF(SYM_PLUS_EQUAL, "+="), + LEX_TOKEN_DEF(SYM_HYPHEN_EQUAL, "-="), + LEX_TOKEN_DEF(SYM_FORWARD_SLASH_EQUAL, "/="), + LEX_TOKEN_DEF(SYM_ASTERISK_EQUAL, "*="), + LEX_TOKEN_DEF(SYM_PERCENT_EQUAL, "%="), +}; +static const size_t nr_symbols = sizeof symbols / sizeof symbols[0]; + +typedef enum bshell_status (*pump_token_impl)(struct lex_ctx *, enum lex_flags); + +static enum bshell_status do_pump_token_normal( + struct lex_ctx *, + enum lex_flags); +static enum bshell_status do_pump_token_string( + struct lex_ctx *ctx, + enum lex_flags); +static const pump_token_impl token_pump_functions[] = { + [LEX_STATE_NORMAL] = do_pump_token_normal, + [LEX_STATE_STRING] = do_pump_token_string, + [LEX_STATE_INTERPOLATION] = do_pump_token_normal, +}; + +static bool char_can_begin_symbol( + struct lex_ctx *ctx, + char c, + enum lex_flags flags); +static bool char_can_begin_symbol_in_context( + struct lex_ctx *ctx, + char c, + enum token_type context, + enum lex_flags flags); + +static 
struct lex_state *push_lex_state( + struct lex_ctx *ctx, + enum lex_state_type state_type) +{ + struct lex_state *state = malloc(sizeof *state); + if (!state) { + return NULL; + } + + memset(state, 0x0, sizeof *state); + + state->s_type = state_type; + fx_queue_push_back(&ctx->lex_state, &state->s_entry); + + return state; +} + +static void pop_lex_state(struct lex_ctx *ctx) +{ + fx_queue_entry *entry = fx_queue_pop_back(&ctx->lex_state); + if (!entry) { + return; + } + + struct lex_state *state = fx_unbox(struct lex_state, entry, s_entry); + free(state); +} + +static struct lex_state *get_lex_state(struct lex_ctx *ctx) +{ + fx_queue_entry *entry = fx_queue_last(&ctx->lex_state); + if (!entry) { + return NULL; + } + + return fx_unbox(struct lex_state, entry, s_entry); +} + +static struct lex_symbol_node *get_symbol_node( + struct lex_symbol_node *node, + char c) +{ + fx_queue_entry *entry = fx_queue_first(&node->s_children); + while (entry) { + struct lex_symbol_node *child + = fx_unbox(struct lex_symbol_node, entry, s_entry); + if (child->s_char == c) { + return child; + } + + entry = fx_queue_next(entry); + } + + return NULL; +} + +static enum bshell_status put_symbol( + struct lex_symbol_node *tree, + struct lex_token_def *sym) +{ + for (size_t i = 0; sym->name[i]; i++) { + char c = sym->name[i]; + struct lex_symbol_node *child = get_symbol_node(tree, c); + if (child) { + tree = child; + continue; + } + + child = malloc(sizeof *child); + if (!child) { + return BSHELL_ERR_NO_MEMORY; + } + + memset(child, 0x0, sizeof *child); + + child->s_def = NULL; + child->s_char = c; + + fx_queue_push_back(&tree->s_children, &child->s_entry); + tree = child; + } + + tree->s_def = sym; + return BSHELL_SUCCESS; +} + +static void destroy_symbol_tree(struct lex_symbol_node *tree) +{ + fx_queue_entry *entry = fx_queue_first(&tree->s_children); + while (entry) { + struct lex_symbol_node *node + = fx_unbox(struct lex_symbol_node, entry, s_entry); + fx_queue_entry *next = 
fx_queue_next(entry); + fx_queue_delete(&tree->s_children, entry); + + destroy_symbol_tree(node); + + entry = next; + } + + free(tree); +} + +static struct lex_symbol_node *build_symbol_tree(void) +{ + struct lex_symbol_node *root = malloc(sizeof *root); + if (!root) { + return NULL; + } + + memset(root, 0x0, sizeof *root); + root->s_def = NULL; + + enum bshell_status status = BSHELL_SUCCESS; + for (size_t i = 0; i < nr_symbols; i++) { + status = put_symbol(root, &symbols[i]); + + if (status != BSHELL_SUCCESS) { + destroy_symbol_tree(root); + return NULL; + } + } + + return root; +} + +enum bshell_status lex_ctx_init( + struct lex_ctx *ctx, + enum lex_flags flags, + struct line_source *src) +{ + memset(ctx, 0x0, sizeof *ctx); + + ctx->lex_flags = flags; + ctx->lex_status = BSHELL_SUCCESS; + ctx->lex_buf = fx_stringstream_create(); + ctx->lex_sym_tree = build_symbol_tree(); + push_lex_state(ctx, LEX_STATE_NORMAL); + ctx->lex_src = src; + ctx->lex_ch = FX_WCHAR_INVALID; + + return BSHELL_SUCCESS; +} + +enum bshell_status lex_ctx_cleanup(struct lex_ctx *ctx) +{ + if (ctx->lex_sym_tree) { + destroy_symbol_tree(ctx->lex_sym_tree); + } + + if (ctx->lex_buf) { + fx_stringstream_unref(ctx->lex_buf); + } + + if (ctx->lex_tmp) { + fx_string_unref(ctx->lex_tmp); + } + + memset(ctx, 0x0, sizeof *ctx); + return BSHELL_SUCCESS; +} + +static enum bshell_status refill_buffer(struct lex_ctx *ctx) +{ + fx_stringstream_reset(ctx->lex_buf); + ctx->lex_ch = FX_WCHAR_INVALID; + + return line_source_readline(ctx->lex_src, ctx->lex_buf); +} + +static fx_wchar __peek_char(struct lex_ctx *ctx, bool noread) +{ + if (ctx->lex_status != BSHELL_SUCCESS) { + return FX_WCHAR_INVALID; + } + + if (ctx->lex_ch != FX_WCHAR_INVALID) { + return ctx->lex_ch; + } + + fx_status status = fx_stream_read_char(ctx->lex_buf, &ctx->lex_ch); + if (!FX_OK(status) && !noread) { + enum bshell_status status2 = refill_buffer(ctx); + if (status2 != BSHELL_SUCCESS) { + ctx->lex_status = status2; + ctx->lex_ch = 
FX_WCHAR_INVALID; + } else { + fx_stream_read_char(ctx->lex_buf, &ctx->lex_ch); + } + } + + return ctx->lex_ch; +} + +static fx_wchar peek_char(struct lex_ctx *ctx) +{ + return __peek_char(ctx, false); +} + +static fx_wchar peek_char_noread(struct lex_ctx *ctx) +{ + return __peek_char(ctx, true); +} + +static void __advance_char(struct lex_ctx *ctx, bool noread) +{ + if (ctx->lex_ch != FX_WCHAR_INVALID) { + ctx->lex_ch = FX_WCHAR_INVALID; + return; + } + + if (ctx->lex_status != BSHELL_SUCCESS) { + return; + } + + fx_status status = fx_stream_read_char(ctx->lex_buf, &ctx->lex_ch); + if (!FX_OK(status) && !noread) { + enum bshell_status status2 = refill_buffer(ctx); + if (status2 != BSHELL_SUCCESS) { + ctx->lex_status = status2; + ctx->lex_ch = FX_WCHAR_INVALID; + } else { + fx_stream_read_char(ctx->lex_buf, &ctx->lex_ch); + } + } +} + +static void advance_char(struct lex_ctx *ctx) +{ + return __advance_char(ctx, false); +} + +static void advance_char_noread(struct lex_ctx *ctx) +{ + return __advance_char(ctx, true); +} + +static bool convert_word_to_keyword( + const struct lex_token *tok, + struct lex_token *out) +{ + if (!lex_token_has_string_value(tok)) { + return false; + } + + for (size_t i = 0; i < nr_keywords; i++) { + const char *kw_str = keywords[i].name; + if (strcmp(kw_str, tok->tok_str) != 0) { + continue; + } + + memcpy(out, tok, sizeof *out); + memset(&out->tok_entry, 0x0, sizeof out->tok_entry); + out->tok_type = TOK_KEYWORD; + out->tok_keyword = keywords[i].id; + return true; + } + + return false; +} + +static int get_int_base_by_prefix(const char **s) +{ +#define CH(x) (tolower(value[x])) + const char *value = *s; + if (CH(0) != '0') { + return 10; + } + + switch (CH(1)) { + case 'x': + *s += 2; + return 16; + case 'b': + *s += 2; + return 2; + default: + *s += 1; + return 8; + } +#undef CH +} + +static size_t get_int_multiplier_by_suffix(const char *suffix) +{ +#define CH(x) (tolower(suffix[x])) + if (CH(1) != 'b' || CH(2) != 0) { + return 0; + } + 
+ switch (CH(0)) { + case 'k': + return 0x400; + case 'm': + return 0x100000; + case 'g': + return 0x40000000; + case 't': + return 0x10000000000; + case 'b': + return 0x4000000000000; + default: + return 0; + } + +#undef CH + return 0; +} + +static bool string_is_valid_number(const char *s, long long *out) +{ + int base = get_int_base_by_prefix(&s); + + char *ep = NULL; + long long value = strtoll(s, &ep, base); + if (*ep == '\0') { + out && (*out = value); + return true; + } + + size_t multiplier = get_int_multiplier_by_suffix(ep); + if (multiplier != 0) { + out && (*out = value * multiplier); + return true; + } + + return false; +} + +static bool convert_word_to_int( + const struct lex_token *tok, + struct lex_token *out) +{ + if (!lex_token_has_string_value(tok)) { + return false; + } + + const char *s = tok->tok_str; + long long value = 0; + bool ok = string_is_valid_number(s, &value); + if (ok) { + memcpy(out, tok, sizeof *out); + memset(&out->tok_entry, 0x0, sizeof out->tok_entry); + out->tok_type = TOK_INT; + out->tok_int = value; + } + + return ok; +} + +static struct lex_token *create_alt_token( + struct lex_ctx *ctx, + struct lex_token *tok, + enum lex_flags flags) +{ + if (ctx->lex_alt) { + return ctx->lex_alt; + } + + bool converted = false; + + struct lex_token alt = {0}; + + if (flags & LEX_ENABLE_KEYWORD) { + converted = convert_word_to_keyword(tok, &alt); + } + + if (!converted && (flags & LEX_ENABLE_INT)) { + converted = convert_word_to_int(tok, &alt); + } + + if (!converted) { + return NULL; + } + + ctx->lex_alt = lex_token_create(alt.tok_type); + if (!ctx->lex_alt) { + ctx->lex_status = BSHELL_ERR_NO_MEMORY; + return NULL; + } + + memcpy(ctx->lex_alt, &alt, sizeof alt); + + return ctx->lex_alt; +} + +static struct lex_token *get_next_token( + struct lex_ctx *ctx, + enum lex_flags flags) +{ + fx_queue_entry *entry = fx_queue_first(&ctx->lex_tokens); + struct lex_token *tok = fx_unbox(struct lex_token, entry, tok_entry); + + if (!tok || 
!CONVERSION_REQUESTED(flags)) { + return tok; + } + + if (ctx->lex_alt) { + return ctx->lex_alt; + } + + ctx->lex_alt = create_alt_token(ctx, tok, flags); + + return ctx->lex_alt ? ctx->lex_alt : tok; +} + +static void enqueue_token(struct lex_ctx *ctx, struct lex_token *tok) +{ + fx_queue_push_back(&ctx->lex_tokens, &tok->tok_entry); +} + +static struct lex_token *dequeue_next_token( + struct lex_ctx *ctx, + enum lex_flags flags) +{ + fx_queue_entry *entry = fx_queue_pop_front(&ctx->lex_tokens); + struct lex_token *tok = fx_unbox(struct lex_token, entry, tok_entry); + + if (!tok || !CONVERSION_REQUESTED(flags)) { + if (ctx->lex_alt) { + lex_token_destroy(ctx->lex_alt); + ctx->lex_alt = NULL; + } + + return tok; + } + + if (ctx->lex_alt) { + lex_token_destroy(tok); + tok = ctx->lex_alt; + ctx->lex_alt = NULL; + return tok; + } + + ctx->lex_alt = create_alt_token(ctx, tok, flags); + + if (ctx->lex_alt) { + lex_token_destroy(tok); + tok = ctx->lex_alt; + ctx->lex_alt = NULL; + } + + return tok; +} + +static fx_string *get_temp_string(struct lex_ctx *ctx) +{ + if (!ctx->lex_tmp) { + ctx->lex_tmp = fx_string_create(); + } + + fx_string_clear(ctx->lex_tmp); + return ctx->lex_tmp; +} + +static enum bshell_status push_symbol( + struct lex_ctx *ctx, + enum token_symbol sym) +{ + struct lex_token *tok = lex_token_create(TOK_SYMBOL); + if (!tok) { + return BSHELL_ERR_NO_MEMORY; + } + + tok->tok_symbol = sym; + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; +} + +static enum bshell_status read_word(struct lex_ctx *ctx, enum lex_flags flags) +{ + fx_string *tmp = get_temp_string(ctx); + bool word_is_number = false; + + bool done = false; + while (!done) { + fx_wchar c = peek_char(ctx); + if (c == FX_WCHAR_INVALID) { + break; + } + + if (fx_wchar_is_space(c)) { + done = true; + break; + } + + if (word_is_number && char_can_begin_symbol(ctx, c, flags)) { + done = true; + break; + } + + if (char_can_begin_symbol_in_context(ctx, c, TOK_WORD, flags)) { + done = true; + break; + 
} + + switch (c) { + case '{': + case '}': + case '(': + case ')': + case ';': + case ',': + case '|': + case '&': + case '$': + done = true; + break; + default: + break; + } + + if (done) { + break; + } + + fx_string_append_wc(tmp, c); + word_is_number + = string_is_valid_number(fx_string_get_cstr(tmp), NULL); + advance_char(ctx); + } + + if (fx_string_get_size(tmp, FX_STRLEN_NORMAL) == 0) { + if (ctx->lex_status == BSHELL_SUCCESS) { + return BSHELL_ERR_BAD_SYNTAX; + } + + return ctx->lex_status; + } + + struct lex_token *tok = lex_token_create_with_string( + TOK_WORD, + fx_string_get_cstr(tmp)); +#if 0 + bool converted = convert_word_to_keyword(tok); + if (!converted) { + converted = convert_word_to_int(tok); + } +#endif + + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; +} + +static enum bshell_status read_var(struct lex_ctx *ctx, enum token_type type) +{ + fx_string *tmp = get_temp_string(ctx); + + bool done = false; + while (!done) { + fx_wchar c = peek_char(ctx); + if (c == FX_WCHAR_INVALID) { + break; + } + + bool valid = fx_wchar_is_alnum(c) || (c == '_') || (c == ':'); + + if (!valid) { + break; + } + + if (done) { + break; + } + + fx_string_append_wc(tmp, c); + advance_char(ctx); + } + + if (fx_string_get_size(tmp, FX_STRLEN_NORMAL) == 0) { + return ctx->lex_status; + } + + struct lex_token *tok + = lex_token_create_with_string(type, fx_string_get_cstr(tmp)); + + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; +} + +static enum bshell_status read_braced_var( + struct lex_ctx *ctx, + enum token_type type) +{ + fx_string *tmp = get_temp_string(ctx); + bool ok = false; + + while (1) { + fx_wchar c = peek_char(ctx); + if (c == FX_WCHAR_INVALID) { + break; + } + + if (c == '}') { + ok = true; + advance_char(ctx); + break; + } + + fx_string_append_wc(tmp, c); + advance_char(ctx); + } + + if (!ok) { + return BSHELL_ERR_BAD_SYNTAX; + } + + if (fx_string_get_size(tmp, FX_STRLEN_NORMAL) == 0) { + return ctx->lex_status; + } + + struct lex_token *tok + = 
lex_token_create_with_string(type, fx_string_get_cstr(tmp)); + + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; +} + +static enum bshell_status read_flag(struct lex_ctx *ctx) +{ + fx_string *tmp = get_temp_string(ctx); + + bool done = false; + while (!done) { + fx_wchar c = peek_char(ctx); + if (c == FX_WCHAR_INVALID) { + break; + } + + if (fx_wchar_is_space(c)) { + break; + } + + switch (c) { + case '{': + case '}': + case '(': + case ')': + case ';': + case ',': + case '|': + case '&': + case '$': + done = true; + break; + default: + break; + } + + if (done) { + break; + } + + fx_string_append_wc(tmp, c); + advance_char(ctx); + } + + struct lex_token *tok = NULL; + if (fx_string_get_size(tmp, FX_STRLEN_NORMAL) == 1) { + tok = lex_token_create(TOK_SYMBOL); + tok->tok_symbol = SYM_HYPHEN; + } else { + tok = lex_token_create_with_string( + TOK_FLAG, + fx_string_get_cstr(tmp)); + } + + if (!tok) { + return BSHELL_ERR_NO_MEMORY; + } + +#if 0 + if (convert_word_to_int(tok)) { + tok->tok_int *= -1; + struct lex_token *prefix = lex_token_create(TOK_SYMBOL); + prefix->tok_symbol = SYM_HYPHEN; + enqueue_token(ctx, prefix); + } +#endif + + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; +} + +static enum bshell_status read_literal_string(struct lex_ctx *ctx) +{ + fx_string *tmp = get_temp_string(ctx); + + bool done = false; + bool fail = true; + while (!done) { + fx_wchar c = peek_char(ctx); + if (c == FX_WCHAR_INVALID) { + break; + } + + if (c == '\'') { + fail = false; + done = true; + advance_char(ctx); + break; + } + + fx_string_append_wc(tmp, c); + advance_char(ctx); + } + + struct lex_token *tok = lex_token_create_with_string( + TOK_STRING, + fx_string_get_cstr(tmp)); + enqueue_token(ctx, tok); + + return BSHELL_SUCCESS; +} + +static enum bshell_status read_line_comment(struct lex_ctx *lex) +{ + while (true) { + fx_wchar c = peek_char(lex); + + if (c == FX_WCHAR_INVALID) { + break; + } + + advance_char(lex); + + if (c == '\n') { + break; + } + } + + return 
BSHELL_SUCCESS; +} + +static enum bshell_status read_dquote_marker(struct lex_ctx *ctx) +{ + enum bshell_status status = BSHELL_SUCCESS; + struct lex_state *state = get_lex_state(ctx); + + struct lex_token *tok = NULL; + + if (state->s_type == LEX_STATE_STRING) { + /* already within an fstring */ + pop_lex_state(ctx); + tok = lex_token_create(TOK_STR_END); + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; + } + + /* start of a new fstring */ + tok = lex_token_create(TOK_STR_START); + enqueue_token(ctx, tok); + + if (!push_lex_state(ctx, LEX_STATE_STRING)) { + return BSHELL_ERR_NO_MEMORY; + } + + return BSHELL_SUCCESS; +} + +static enum bshell_status read_interpolation_marker(struct lex_ctx *ctx) +{ + enum bshell_status status = BSHELL_SUCCESS; + struct lex_state *state = get_lex_state(ctx); + + struct lex_token *tok = NULL; + + if (state->s_type != LEX_STATE_STRING) { + return BSHELL_ERR_INTERNAL_FAILURE; + } + + /* start of a new interpolation */ + if (!push_lex_state(ctx, LEX_STATE_INTERPOLATION)) { + return BSHELL_ERR_NO_MEMORY; + } + + return BSHELL_SUCCESS; +} + +static enum bshell_status read_symbol(struct lex_ctx *ctx, enum lex_flags flags) +{ + struct lex_state *state = get_lex_state(ctx); + enum lex_token_flags required_flags = 0; + if (state->s_type == LEX_STATE_STRING) { + required_flags |= LEX_TOKEN_ENABLE_IN_STRING; + } + + if (!(flags & LEX_ENABLE_SYMBOL)) { + required_flags |= LEX_TOKEN_ENABLE_IN_WORD; + } + + struct lex_symbol_node *node = ctx->lex_sym_tree; + char prev = 0; + + while (true) { + fx_wchar c = peek_char(ctx); + if (c < 0) { + break; + } + + struct lex_symbol_node *next = get_symbol_node(node, c); + if (!next + || (next->s_def->flags & required_flags) + != required_flags) { + prev = c; + break; + } + + node = next; + advance_char(ctx); + prev = c; + } + + if (!node || node->s_def == NULL) { + return BSHELL_ERR_BAD_SYNTAX; + } + + struct lex_token *tok = NULL; + switch (node->s_def->id) { + case SYM_SQUOTE: + return 
read_literal_string(ctx); + case SYM_DQUOTE: + return read_dquote_marker(ctx); + case SYM_DOLLAR_LEFT_PAREN: + push_symbol(ctx, SYM_DOLLAR_LEFT_PAREN); + if (state->s_type == LEX_STATE_STRING) { + push_lex_state(ctx, LEX_STATE_INTERPOLATION); + } + break; + case SYM_DOLLAR_LEFT_BRACE: + return read_braced_var(ctx, TOK_VAR); + case SYM_HASH: + return read_line_comment(ctx); + case SYM_LEFT_PAREN: + push_symbol(ctx, SYM_LEFT_PAREN); + state->s_paren_depth++; + break; + case SYM_RIGHT_PAREN: + push_symbol(ctx, SYM_RIGHT_PAREN); + + if (state->s_type == LEX_STATE_INTERPOLATION + && state->s_paren_depth == 0) { + pop_lex_state(ctx); + } else { + state->s_paren_depth--; + } + + break; + case SYM_DOLLAR: + return read_var(ctx, TOK_VAR); + case SYM_AT: + return read_var(ctx, TOK_VAR_SPLAT); + default: + push_symbol(ctx, node->s_def->id); + break; + } + + return BSHELL_SUCCESS; +} + +static bool char_can_begin_symbol( + struct lex_ctx *ctx, + char c, + enum lex_flags flags) +{ + struct lex_state *state = get_lex_state(ctx); + enum lex_token_flags required_flags = 0; + if (state->s_type == LEX_STATE_STRING) { + required_flags |= LEX_TOKEN_ENABLE_IN_STRING; + } + + if (!(flags & LEX_ENABLE_SYMBOL)) { + required_flags |= LEX_TOKEN_ENABLE_IN_WORD; + } + + for (size_t i = 0; i < nr_symbols; i++) { + if (symbols[i].name[0] != c) { + continue; + } + + if ((symbols[i].flags & required_flags) != required_flags) { + continue; + } + + return true; + } + + return false; +} + +static bool char_can_begin_symbol_in_context( + struct lex_ctx *ctx, + char c, + enum token_type context, + enum lex_flags flags) +{ + enum lex_token_flags required_flags = 0; + switch (context) { + case TOK_WORD: + required_flags = LEX_TOKEN_ENABLE_IN_WORD; + break; + case TOK_STRING: + required_flags = LEX_TOKEN_ENABLE_IN_STRING; + break; + default: + break; + } + + for (size_t i = 0; i < nr_symbols; i++) { + if (symbols[i].name[0] != c) { + continue; + } + + if ((symbols[i].flags & required_flags) != 
required_flags) { + continue; + } + + return true; + } + + return false; +} + +static enum bshell_status read_string_content(struct lex_ctx *ctx) +{ + fx_wchar c = FX_WCHAR_INVALID; + fx_string *str = get_temp_string(ctx); + struct lex_state *state = get_lex_state(ctx); + + if (!str) { + return BSHELL_ERR_NO_MEMORY; + } + + while (true) { + c = peek_char(ctx); + if (c < 0) { + break; + } + + if (char_can_begin_symbol(ctx, c, 0)) { + break; + } + + fx_string_append_wc(str, c); + // set_token_end(lex); + advance_char(ctx); + } + + if (fx_string_get_size(str, FX_STRLEN_NORMAL) == 0) { + return BSHELL_SUCCESS; + } + + struct lex_token *tok = lex_token_create_with_string( + TOK_STRING, + fx_string_get_cstr(str)); + enqueue_token(ctx, tok); + + return BSHELL_SUCCESS; +} + +static enum bshell_status do_pump_token_string( + struct lex_ctx *ctx, + enum lex_flags flags) +{ + fx_wchar c = peek_char(ctx); + enum bshell_status status = BSHELL_SUCCESS; + bool ok = false; + + if (char_can_begin_symbol(ctx, c, flags)) { + status = read_symbol(ctx, flags); + ok = true; + } + + if (status != BSHELL_SUCCESS || !ok) { + status = read_string_content(ctx); + } + + return status; +} + +static enum bshell_status do_pump_token_normal( + struct lex_ctx *ctx, + enum lex_flags flags) +{ + enum bshell_status status = BSHELL_SUCCESS; + + fx_wchar c = peek_char(ctx); + bool newline = false; + + while (fx_wchar_is_space(c)) { + if (c == '\n') { + newline = true; + } + + advance_char_noread(ctx); + c = peek_char_noread(ctx); + } + + if (newline) { + struct lex_token *tok = lex_token_create(TOK_LINEFEED); + enqueue_token(ctx, tok); + return BSHELL_SUCCESS; + } + + if (c == '-') { + return read_flag(ctx); + } + + if (char_can_begin_symbol(ctx, c, flags)) { + return read_symbol(ctx, flags); + } + + return read_word(ctx, flags); +} + +static enum bshell_status pump_tokens(struct lex_ctx *ctx, enum lex_flags flags) +{ + enum bshell_status status = BSHELL_SUCCESS; + while 
(fx_queue_empty(&ctx->lex_tokens) && status == BSHELL_SUCCESS) { + struct lex_state *state = get_lex_state(ctx); + pump_token_impl impl = token_pump_functions[state->s_type]; + + status = impl(ctx, flags); + } + + return status; +} + +struct lex_token *lex_ctx_peek(struct lex_ctx *ctx, enum lex_flags flags) +{ + struct lex_token *tok = get_next_token(ctx, flags); + if (tok) { + return tok; + } + + pump_tokens(ctx, flags); + tok = get_next_token(ctx, flags); + if (tok && (ctx->lex_flags & LEX_PRINT_TOKENS)) { + print_lex_token(tok); + } + + return tok; +} + +struct lex_token *lex_ctx_claim(struct lex_ctx *ctx, enum lex_flags flags) +{ + struct lex_token *tok = dequeue_next_token(ctx, flags); + if (tok) { + return tok; + } + + if (fx_queue_empty(&ctx->lex_tokens)) { + pump_tokens(ctx, flags); + + tok = get_next_token(ctx, flags); + if (tok && (ctx->lex_flags & LEX_PRINT_TOKENS)) { + print_lex_token(tok); + } + } + + return dequeue_next_token(ctx, flags); +} + +void lex_ctx_discard(struct lex_ctx *ctx, enum lex_flags flags) +{ + struct lex_token *tok = dequeue_next_token(ctx, 0); + if (tok) { + lex_token_destroy(tok); + return; + } + + if (fx_queue_empty(&ctx->lex_tokens)) { + pump_tokens(ctx, flags); + } +} diff --git a/bshell/parse/lex.h b/bshell/parse/lex.h new file mode 100644 index 0000000..97bc3cf --- /dev/null +++ b/bshell/parse/lex.h @@ -0,0 +1,82 @@ +#ifndef LEX_H_ +#define LEX_H_ + +#include "../status.h" + +#include +#include +#include + +struct lex_token; +struct line_source; + +enum lex_flags { + LEX_PRINT_TOKENS = 0x01u, + + /* these flags are for lex_ctx_peek and lex_ctx_claim */ + LEX_ENABLE_KEYWORD = 0x0100u, + LEX_ENABLE_INT = 0x0200u, + LEX_ENABLE_SYMBOL = 0x0400u, +}; + +enum lex_token_flags { + LEX_TOKEN_ENABLE_IN_STRING = 0x01u, + LEX_TOKEN_ENABLE_IN_WORD = 0x02u, +}; + +struct lex_token_def { + int id; + const char *name; + uint64_t name_hash; + enum lex_token_flags flags; +}; + +struct lex_symbol_node { + char s_char; + struct lex_token_def 
*s_def; + + fx_queue_entry s_entry; + fx_queue s_children; +}; + +enum lex_state_type { + LEX_STATE_NORMAL = 0, + LEX_STATE_WORD, + LEX_STATE_STRING, + LEX_STATE_INTERPOLATION, +}; + +struct lex_state { + enum lex_state_type s_type; + unsigned int s_paren_depth; + fx_queue_entry s_entry; +}; + +struct lex_ctx { + enum lex_flags lex_flags; + fx_queue lex_tokens; + struct line_source *lex_src; + fx_stringstream *lex_buf; + fx_string *lex_tmp; + fx_wchar lex_ch; + struct lex_token *lex_alt; + fx_queue lex_state; + struct lex_symbol_node *lex_sym_tree; + enum bshell_status lex_status; +}; + +extern enum bshell_status lex_ctx_init( + struct lex_ctx *ctx, + enum lex_flags flags, + struct line_source *src); +extern enum bshell_status lex_ctx_cleanup(struct lex_ctx *ctx); + +extern struct lex_token *lex_ctx_peek( + struct lex_ctx *ctx, + enum lex_flags flags); +extern struct lex_token *lex_ctx_claim( + struct lex_ctx *ctx, + enum lex_flags flags); +extern void lex_ctx_discard(struct lex_ctx *ctx, enum lex_flags flags); + +#endif diff --git a/bshell/parse/parse.c b/bshell/parse/parse.c new file mode 100644 index 0000000..d760f50 --- /dev/null +++ b/bshell/parse/parse.c @@ -0,0 +1,30 @@ +#include "parse.h" + +#include "../ast/ast.h" +#include "lex.h" +#include "syntax.h" +#include "token.h" + +#include +#include + +enum bshell_status parse_ctx_init(struct parse_ctx *ctx, struct lex_ctx *src) +{ + memset(ctx, 0x0, sizeof *ctx); + + ctx->p_src = src; + + return BSHELL_SUCCESS; +} + +void parse_ctx_cleanup(struct parse_ctx *ctx) +{ +} + +struct ast_node *parse_ctx_read_node(struct parse_ctx *ctx) +{ + struct ast_node *result = NULL; + bool ok = parse_expr(ctx, &result); + + return ok ? 
result : NULL; +} diff --git a/bshell/parse/parse.h b/bshell/parse/parse.h new file mode 100644 index 0000000..867fe4c --- /dev/null +++ b/bshell/parse/parse.h @@ -0,0 +1,21 @@ +#ifndef PARSE_H_ +#define PARSE_H_ + +#include "../status.h" + +struct lex_ctx; +struct ast_node; + +struct parse_ctx { + struct lex_ctx *p_src; + enum bshell_status p_status; +}; + +extern enum bshell_status parse_ctx_init( + struct parse_ctx *ctx, + struct lex_ctx *src); +extern void parse_ctx_cleanup(struct parse_ctx *ctx); + +extern struct ast_node *parse_ctx_read_node(struct parse_ctx *ctx); + +#endif diff --git a/bshell/parse/syntax.h b/bshell/parse/syntax.h new file mode 100644 index 0000000..72388a5 --- /dev/null +++ b/bshell/parse/syntax.h @@ -0,0 +1,53 @@ +#ifndef PARSE_SYNTAX_H_ +#define PARSE_SYNTAX_H_ + +#include "../ast/ast.h" +#include "lex.h" +#include "parse.h" +#include "token.h" + +#include + +enum parse_operand_flags { + OPERAND_BASIC = 0x01u, +}; + +extern struct lex_token *peek_token( + struct parse_ctx *ctx, + enum lex_flags flags); +extern enum token_type peek_token_type( + struct parse_ctx *ctx, + enum lex_flags flags); +extern enum token_keyword peek_unknown_keyword(struct parse_ctx *ctx); +extern enum token_symbol peek_unknown_symbol(struct parse_ctx *ctx); +extern bool peek_int(struct parse_ctx *ctx); + +extern struct lex_token *claim_token( + struct parse_ctx *ctx, + enum lex_flags flags); +extern void discard_token(struct parse_ctx *ctx); + +extern bool peek_linefeed(struct parse_ctx *ctx); +extern bool peek_symbol(struct parse_ctx *ctx, enum token_symbol sym); + +extern bool parse_linefeed(struct parse_ctx *ctx); +extern bool parse_symbol(struct parse_ctx *ctx, enum token_symbol sym); +extern bool parse_keyword(struct parse_ctx *ctx, enum token_keyword kw); +extern bool parse_int(struct parse_ctx *ctx, long long *out); +extern bool parse_flag(struct parse_ctx *ctx, struct lex_token **out); + +extern bool peek_arith_expr(struct parse_ctx *ctx); +extern bool 
parse_arith_expr(struct parse_ctx *ctx, struct ast_node **out); +extern bool parse_operand( + struct parse_ctx *ctx, + enum parse_operand_flags flags, + struct ast_node **out); + +extern bool parse_expr(struct parse_ctx *ctx, struct ast_node **out); + +extern bool peek_command(struct parse_ctx *ctx); +extern bool parse_command(struct parse_ctx *ctx, struct ast_node **out); +extern bool parse_cmdcall(struct parse_ctx *ctx, struct ast_node **out); +extern bool parse_redirect(struct parse_ctx *ctx, struct ast_node **out); + +#endif diff --git a/bshell/parse/syntax/arith.c b/bshell/parse/syntax/arith.c new file mode 100644 index 0000000..4758f4c --- /dev/null +++ b/bshell/parse/syntax/arith.c @@ -0,0 +1,11 @@ +#include "../syntax.h" + +bool peek_arith_expr(struct parse_ctx *ctx) +{ + return false; +} + +bool parse_arith_expr(struct parse_ctx *ctx, struct ast_node **out) +{ + return false; +} diff --git a/bshell/parse/syntax/command.c b/bshell/parse/syntax/command.c new file mode 100644 index 0000000..2c8c345 --- /dev/null +++ b/bshell/parse/syntax/command.c @@ -0,0 +1,385 @@ +#include "../syntax.h" + +#include + +static bool parse_cmdcall_arg(struct parse_ctx *ctx, struct ast_node **out) +{ + struct lex_token *tok = peek_token(ctx, 0); + if (!tok) { + return false; + } + + struct ast_node *arg = NULL; + + switch (tok->tok_type) { + case TOK_WORD: { + struct word_ast_node *n + = (struct word_ast_node *)ast_node_create(AST_WORD); + if (!n) { + ctx->p_status = BSHELL_ERR_NO_MEMORY; + return false; + } + + n->n_value = claim_token(ctx, 0); + *out = (struct ast_node *)n; + return true; + } + + case TOK_FLAG: { + struct word_ast_node *n + = (struct word_ast_node *)ast_node_create(AST_WORD); + if (!n) { + ctx->p_status = BSHELL_ERR_NO_MEMORY; + return false; + } + + n->n_value = claim_token(ctx, 0); + *out = (struct ast_node *)n; + return true; + } + + case TOK_VAR: { + struct var_ast_node *n + = (struct var_ast_node *)ast_node_create(AST_VAR); + if (!n) { + ctx->p_status = 
BSHELL_ERR_NO_MEMORY; + return false; + } + + n->n_ident = claim_token(ctx, 0); + *out = (struct ast_node *)n; + return true; + } + + case TOK_VAR_SPLAT: { + struct var_splat_ast_node *n + = (struct var_splat_ast_node *)ast_node_create( + AST_VAR_SPLAT); + if (!n) { + ctx->p_status = BSHELL_ERR_NO_MEMORY; + return false; + } + + n->n_ident = claim_token(ctx, 0); + *out = (struct ast_node *)n; + return true; + } + + case TOK_STRING: { + struct string_ast_node *n + = (struct string_ast_node *)ast_node_create(AST_STRING); + if (!n) { + ctx->p_status = BSHELL_ERR_NO_MEMORY; + return false; + } + + n->n_value = claim_token(ctx, 0); + *out = (struct ast_node *)n; + return true; + } + + default: + return false; + } + + return true; +}
+
+/*
+ * Parse the `&<fd>` target of a descriptor-to-descriptor redirection such as
+ * `2>&1`. The `>`/`>>` operator word has already been consumed by the caller.
+ *
+ * in_fd:  descriptor being redirected.
+ * append: true if the operator was `>>`.
+ * out:    receives the new AST_REDIRECTION node on success.
+ *
+ * Returns false with ctx->p_status untouched and nothing consumed if the next
+ * token is not `&`, so the caller can try another redirection form. Sets
+ * BSHELL_ERR_BAD_SYNTAX if `&` is not followed by an integer, and
+ * BSHELL_ERR_NO_MEMORY if the node cannot be allocated.
+ */
+static bool parse_redirect_to_fd(
+	struct parse_ctx *ctx,
+	unsigned int in_fd,
+	bool append,
+	struct ast_node **out)
+{
+	/* consume the tokens before allocating, so the failure paths have
+	 * nothing to tear down. */
+	if (!parse_symbol(ctx, SYM_AMPERSAND)) {
+		return false;
+	}
+
+	long long out_fd = 0;
+	if (!parse_int(ctx, &out_fd)) {
+		ctx->p_status = BSHELL_ERR_BAD_SYNTAX;
+		return false;
+	}
+
+	struct redirection_ast_node *redirect
+		= (struct redirection_ast_node *)ast_node_create(
+			AST_REDIRECTION);
+	if (!redirect) {
+		ctx->p_status = BSHELL_ERR_NO_MEMORY;
+		return false;
+	}
+
+	redirect->n_in = in_fd;
+	redirect->n_append = append;
+	redirect->n_out_is_fd = true;
+	redirect->n_out_is_expr = false;
+	redirect->n_out = (unsigned int)out_fd;
+	*out = (struct ast_node *)redirect;
+
+	return true;
+}
+
+/*
+ * Build a redirection whose target path is part of the same word as the
+ * operator (e.g. `>out.txt`). `str` points at the first character of the path
+ * inside the string of the token currently at the front of the lexer.
+ *
+ * Returns false without consuming anything if `str` is empty. On success the
+ * token is claimed and stored in the node as n_out_tok; n_out_path aliases
+ * that token's string rather than holding a copy. Sets BSHELL_ERR_NO_MEMORY
+ * if the node cannot be allocated.
+ */
+static bool parse_redirect_to_file_squashed(
+	struct parse_ctx *ctx,
+	unsigned int in_fd,
+	bool append,
+	const char *str,
+	struct ast_node **out)
+{
+	if (*str == '\0') {
+		return false;
+	}
+
+	struct redirection_ast_node *redirect
+		= (struct redirection_ast_node *)ast_node_create(
+			AST_REDIRECTION);
+	if (!redirect) {
+		ctx->p_status = BSHELL_ERR_NO_MEMORY;
+		return false;
+	}
+
+	redirect->n_in = in_fd;
+	redirect->n_append = append;
+	redirect->n_out_is_fd = false;
+	redirect->n_out_is_expr = false;
+	redirect->n_out_path = str;
+
+	/* keep the token alongside the path that points into it.
+	 * NOTE(review): presumably claim_token() hands ownership of the token
+	 * to the caller -- confirm against lex_ctx_claim(). */
+	redirect->n_out_tok = claim_token(ctx, 0);
+
+	*out = (struct ast_node *)redirect;
+	return true;
+}
+
+/*
+ * Parse a redirection whose target follows the operator as a separate
+ * command argument (e.g. `> out.txt`). The operator word has already been
+ * consumed by the caller.
+ *
+ * On success the parsed argument is stored in the node as n_out_path_expr.
+ * Sets BSHELL_ERR_BAD_SYNTAX if no argument can be parsed, and
+ * BSHELL_ERR_NO_MEMORY if the node cannot be allocated (the parsed argument
+ * is destroyed in that case).
+ */
+static bool parse_redirect_to_file_separate(
+	struct parse_ctx *ctx,
+	unsigned int in_fd,
+	bool append,
+	struct ast_node **out)
+{
+	struct ast_node *out_path = NULL;
+	if (!parse_cmdcall_arg(ctx, &out_path)) {
+		ctx->p_status = BSHELL_ERR_BAD_SYNTAX;
+		return false;
+	}
+
+	struct redirection_ast_node *redirect
+		= (struct redirection_ast_node *)ast_node_create(
+			AST_REDIRECTION);
+	if (!redirect) {
+		ctx->p_status = BSHELL_ERR_NO_MEMORY;
+		ast_node_destroy(out_path);
+		return false;
+	}
+
+	redirect->n_in = in_fd;
+	redirect->n_append = append;
+	redirect->n_out_is_fd = false;
+	redirect->n_out_is_expr = true;
+	redirect->n_out_path_expr = out_path;
+
+	*out = (struct ast_node *)redirect;
+	return true;
+}
+
+/*
+ * Parse an output redirection. The operator is read from the text of a word
+ * token of the form `[digit]>[>][path]`:
+ *  - an optional single leading digit selects the descriptor being
+ *    redirected (1 if absent);
+ *  - `>>` instead of `>` requests append mode;
+ *  - any characters left in the word are used as the target path.
+ * If the word ends right after the operator, the word is discarded and the
+ * target is parsed from the following tokens: first as `&<fd>`, then as a
+ * single command argument.
+ *
+ * Returns false without consuming anything if the next token is not a word
+ * of that form. A false return after the operator word has been discarded
+ * means the target could not be parsed; the token is not restored.
+ */
+bool parse_redirect(struct parse_ctx *ctx, struct ast_node **out)
+{
+	struct lex_token *tok = peek_token(ctx, 0);
+	if (!tok || tok->tok_type != TOK_WORD) {
+		return false;
+	}
+
+	unsigned int in_fd = 1;
+	const char *str = tok->tok_str;
+	bool append = false;
+
+	if (fx_wchar_is_number(*str)) {
+		in_fd = *str - '0';
+		str++;
+	}
+
+	if (*str != '>') {
+		return false;
+	}
+
+	str++;
+	if (*str == '>') {
+		append = true;
+		str++;
+	}
+
+	if (*str != '\0') {
+		return parse_redirect_to_file_squashed(
+			ctx,
+			in_fd,
+			append,
+			str,
+			out);
+	}
+
+	/* the word held nothing but the operator. `str` points into the
+	 * discarded token and must not be used past this point. */
+	discard_token(ctx);
+
+	if (parse_redirect_to_fd(ctx, in_fd, append, out)) {
+		return true;
+	}
+
+	return parse_redirect_to_file_separate(ctx, in_fd, append, out);
+}
+
+/*
+ * Report whether the next token can take part in a command call.
+ *
+ * `unrestricted` is the answer given for RESTRICTED tokens (see below): pass
+ * false when asking "does a command start here?", and true when the call
+ * operator was used or when looking at argument positions.
+ */
+static bool peek_cmdcall_item(struct parse_ctx *ctx, bool unrestricted)
+{
+	/* each token type falls into one of three categories:
+	 *  - cmdcall item: the token can be used as part of a command call. the
+	 *    token indicates the start of a command call.
+	 *  - NOT a cmdcall item: the token cannot be used as part of a command
+	 *    call, usually because it is a cmdcall operator like | or &.
+	 *    encountering one of these tokens ends the cmdcall currently being
+	 *    parsed.
+	 *  - RESTRICTED cmdcall item: the token can be used as part of a
+	 *    command, but will not be considered the start of a cmdcall. to run
+	 *    a command with this token as its name, the call operator must be
+	 *    used.
+	 */
+	switch (peek_token_type(ctx, LEX_ENABLE_INT | LEX_ENABLE_KEYWORD)) {
+	case TOK_KEYWORD:
+	case TOK_INT:
+	case TOK_DOUBLE:
+	case TOK_VAR:
+	case TOK_VAR_SPLAT:
+	case TOK_STRING:
+	case TOK_STR_START:
+		return unrestricted;
+	case TOK_SYMBOL:
+		switch (peek_unknown_symbol(ctx)) {
+		case SYM_PLUS:
+		case SYM_HYPHEN:
+			return unrestricted;
+		case SYM_PIPE:
+		case SYM_AMPERSAND:
+		case SYM_SEMICOLON:
+			return false;
+		default:
+			return true;
+		}
+	case TOK_NONE:
+	case TOK_LINEFEED:
+		return false;
+	default:
+		return true;
+	}
+}
+
+/*
+ * Parse a single command call: an optional call operator `&`, the command
+ * name, then any mix of arguments and redirections up to the first token
+ * that cannot be part of a command call.
+ *
+ * On success the new AST_CMDCALL node is stored in *out and true is returned.
+ * On failure the partially built node is destroyed and false is returned;
+ * tokens consumed up to that point are not restored. *out is set to NULL when
+ * an argument after the command name fails to parse (with
+ * BSHELL_ERR_BAD_SYNTAX) and is left untouched on the other failure paths.
+ */
+bool parse_cmdcall(struct parse_ctx *ctx, struct ast_node **out)
+{
+	struct cmdcall_ast_node *node
+		= (struct cmdcall_ast_node *)ast_node_create(AST_CMDCALL);
+	if (!node) {
+		ctx->p_status = BSHELL_ERR_NO_MEMORY;
+		return false;
+	}
+
+	struct ast_node *child = NULL;
+
+	/* a leading `&` allows a RESTRICTED token to act as the command name. */
+	bool unrestricted = parse_symbol(ctx, SYM_AMPERSAND);
+
+	if (!peek_cmdcall_item(ctx, unrestricted) || !peek_token(ctx, 0)
+	    || !parse_cmdcall_arg(ctx, &child)) {
+		goto fail;
+	}
+
+	fx_queue_push_back(&node->n_args, &child->n_entry);
+
+	while (peek_cmdcall_item(ctx, true) && peek_token(ctx, 0)) {
+		/* redirections are tried first: their operator is lexed as a
+		 * word and would otherwise be taken as a plain argument. */
+		if (parse_redirect(ctx, &child)) {
+			fx_queue_push_back(&node->n_redirect, &child->n_entry);
+		} else if (parse_cmdcall_arg(ctx, &child)) {
+			fx_queue_push_back(&node->n_args, &child->n_entry);
+		} else {
+			ctx->p_status = BSHELL_ERR_BAD_SYNTAX;
+			*out = NULL;
+			goto fail;
+		}
+	}
+
+	*out = (struct ast_node *)node;
+	return true;
+
+fail:
+	/* NOTE(review): assumes ast_node_destroy() also releases the children
+	 * already queued on the node -- confirm. */
+	ast_node_destroy((struct ast_node *)node);
+	return false;
+}
+
+/*
+ * Report whether a command can start at the next token: either the call
+ * operator `&`, or a token that is an unrestricted cmdcall item.
+ */
+bool peek_command(struct parse_ctx *ctx)
+{
+	if (peek_symbol(ctx, SYM_AMPERSAND)) {
+		return true;
+	}
+
+	return peek_cmdcall_item(ctx, false);
+}
+
+/*
+ * Parse a command: one command call, optionally followed by further command
+ * calls joined with `|`. A terminating `;` or linefeed is consumed.
+ *
+ * A lone command call is returned as-is in *out; two or more are wrapped in
+ * an AST_PIPELINE node whose n_stages queue holds them in order.
+ *
+ * Returns false if the first command call cannot be parsed, if the pipeline
+ * node cannot be allocated (BSHELL_ERR_NO_MEMORY), or if `|` is not followed
+ * by a command call (BSHELL_ERR_BAD_SYNTAX). Nodes built before the failure
+ * are destroyed.
+ */
+bool parse_command(struct parse_ctx *ctx, struct ast_node **out)
+{
+	struct ast_node *cmdcall = NULL;
+	if (!parse_cmdcall(ctx, &cmdcall)) {
+		return false;
+	}
+
+	struct pipeline_ast_node *pipeline = NULL;
+
+	while (1) {
+		if (parse_symbol(ctx, SYM_SEMICOLON) || parse_linefeed(ctx)) {
+			break;
+		}
+
+		if (!parse_symbol(ctx, SYM_PIPE)) {
+			break;
+		}
+
+		if (!pipeline) {
+			/* first `|`: the call parsed so far becomes stage one. */
+			pipeline = (struct pipeline_ast_node *)ast_node_create(
+				AST_PIPELINE);
+			if (!pipeline) {
+				ctx->p_status = BSHELL_ERR_NO_MEMORY;
+				ast_node_destroy(cmdcall);
+				return false;
+			}
+
+			fx_queue_push_back(
+				&pipeline->n_stages,
+				&cmdcall->n_entry);
+		}
+
+		if (!parse_cmdcall(ctx, &cmdcall)) {
+			ctx->p_status = BSHELL_ERR_BAD_SYNTAX;
+			/* every stage parsed so far is queued on the pipeline.
+			 * NOTE(review): assumes ast_node_destroy() releases
+			 * the queued stages as well -- confirm. */
+			ast_node_destroy((struct ast_node *)pipeline);
+			return false;
+		}
+
+		fx_queue_push_back(&pipeline->n_stages, &cmdcall->n_entry);
+	}
+
+	if (pipeline) {
+		*out = (struct ast_node *)pipeline;
+	} else {
+		*out = cmdcall;
+	}
+
+	return true;
+}
diff --git a/bshell/parse/syntax/expr.c b/bshell/parse/syntax/expr.c
new file mode 100644
index 0000000..7c33b89
--- /dev/null
+++ b/bshell/parse/syntax/expr.c
@@ -0,0 +1,15 @@
+#include "../syntax.h"
+
+/*
+ * Parse an expression. A command is tried first when one can start at the
+ * next token; if no command starts here, or the command fails to parse, an
+ * arithmetic expression is tried when one can start at the next token.
+ *
+ * Returns true if either form was parsed into *out.
+ */
+bool parse_expr(struct parse_ctx *ctx, struct ast_node **out)
+{
+	bool ok = false;
+	if (peek_command(ctx)) {
+		ok = parse_command(ctx, out);
+	}
+
+	if (!ok && peek_arith_expr(ctx)) {
+		ok = parse_arith_expr(ctx, out);
+	}
+
+	return ok;
+}
diff --git a/bshell/parse/syntax/generic.c b/bshell/parse/syntax/generic.c
new file mode 100644
index 0000000..d3eaaa6
--- /dev/null
+++ b/bshell/parse/syntax/generic.c
@@ -0,0 +1,135 @@
+#include "../lex.h"
+#include "../parse.h"
+#include "../syntax.h"
+#include "../token.h"
+
+/* lexer flags used by every helper that does not take explicit flags. */
+#define DEFAULT_LEX_FLAGS \
+	(LEX_ENABLE_INT | LEX_ENABLE_KEYWORD | LEX_ENABLE_SYMBOL)
+
+/* Claim the next token from the lexer, lexing it with `flags`. */
+struct lex_token *claim_token(struct parse_ctx *ctx, enum lex_flags flags)
+{
+	return lex_ctx_claim(ctx->p_src, flags);
+}
+
+/* Discard the next token, lexing it with the default flags. */
+void discard_token(struct parse_ctx *ctx)
+{
+	lex_ctx_discard(ctx->p_src, DEFAULT_LEX_FLAGS);
+}
+
+/* Return the next token without consuming it, lexing it with `flags`. */
+struct lex_token *peek_token(struct parse_ctx *ctx, enum lex_flags flags)
+{
+	return lex_ctx_peek(ctx->p_src, flags);
+}
+
+/* Return the type of the next token, or TOK_NONE if there is no token. */
+enum token_type peek_token_type(struct parse_ctx *ctx, enum lex_flags flags)
+{
+	struct lex_token *tok = peek_token(ctx, flags);
+	return tok ? tok->tok_type : TOK_NONE;
+}
+
+/* Return the symbol at the next token, or SYM_NONE if it is not a symbol. */
+enum token_symbol peek_unknown_symbol(struct parse_ctx *ctx)
+{
+	struct lex_token *tok = peek_token(ctx, DEFAULT_LEX_FLAGS);
+	return (tok && tok->tok_type == TOK_SYMBOL) ? tok->tok_symbol
+						    : SYM_NONE;
+}
+
+/* Return the keyword at the next token, or KW_NONE if it is not a keyword. */
+enum token_keyword peek_unknown_keyword(struct parse_ctx *ctx)
+{
+	struct lex_token *tok = peek_token(ctx, DEFAULT_LEX_FLAGS);
+	return (tok && tok->tok_type == TOK_KEYWORD) ? tok->tok_keyword
+						     : KW_NONE;
+}
+
+/* Report whether the next token is a linefeed. Consumes nothing. */
+bool peek_linefeed(struct parse_ctx *ctx)
+{
+	struct lex_token *tok = peek_token(ctx, DEFAULT_LEX_FLAGS);
+	return tok && tok->tok_type == TOK_LINEFEED;
+}
+
+/* Report whether the next token is the symbol `sym`. Consumes nothing. */
+bool peek_symbol(struct parse_ctx *ctx, enum token_symbol sym)
+{
+	struct lex_token *tok = peek_token(ctx, DEFAULT_LEX_FLAGS);
+	return tok && lex_token_is_symbol(tok, sym);
+}
+
+/* Consume the next token if it is a linefeed; return whether it was. */
+bool parse_linefeed(struct parse_ctx *ctx)
+{
+	if (!peek_linefeed(ctx)) {
+		return false;
+	}
+
+	discard_token(ctx);
+	return true;
+}
+
+/* Consume the next token if it is the symbol `sym`; return whether it was. */
+bool parse_symbol(struct parse_ctx *ctx, enum token_symbol sym)
+{
+	if (!peek_symbol(ctx, sym)) {
+		return false;
+	}
+
+	discard_token(ctx);
+	return true;
+}
+
+/* Consume the next token if it is the keyword `kw`; return whether it was. */
+bool parse_keyword(struct parse_ctx *ctx, enum token_keyword kw)
+{
+	struct lex_token *tok = peek_token(ctx, DEFAULT_LEX_FLAGS);
+	if (!tok || !lex_token_is_keyword(tok, kw)) {
+		return false;
+	}
+
+	discard_token(ctx);
+	return true;
+}
+
+/*
+ * Consume the next token if it is an integer and store its value in *out.
+ * Returns false, leaving *out and the token untouched, otherwise.
+ */
+bool parse_int(struct parse_ctx *ctx, long long *out)
+{
+	struct lex_token *tok = peek_token(ctx, DEFAULT_LEX_FLAGS);
+	if (!tok || tok->tok_type != TOK_INT) {
+		return false;
+	}
+
+	/* read the value before the token is discarded. */
+	*out = tok->tok_int;
+	discard_token(ctx);
+	return true;
+}
diff --git a/bshell/parse/token.c b/bshell/parse/token.c
new file mode 100644
index 0000000..fed6184
--- /dev/null
+++ b/bshell/parse/token.c
@@ -0,0 +1,148 @@
+#include "token.h"
+
+#include
+#include
+#include
+
+/*
+ * Allocate a zero-initialised token of the given type.
+ * Returns NULL if the allocation fails.
+ */
+struct lex_token *lex_token_create(enum token_type type)
+{
+	struct lex_token *out = calloc(1, sizeof *out);
+	if (!out) {
+		return NULL;
+	}
+
+	out->tok_type = type;
+
+	return out;
+}
+
+/*
+ * Allocate a token of the given type whose tok_str is a private copy of `s`.
+ * Returns NULL if either allocation fails.
+ */
+struct lex_token *lex_token_create_with_string(
+	enum token_type type,
+	const char *s)
+{
+	struct lex_token *tok = lex_token_create(type);
+	if (!tok) {
+		return NULL;
+	}
+
+	tok->tok_str = fx_strdup(s);
+	if (!tok->tok_str) {
+		free(tok);
+		return NULL;
+	}
+
+	return tok;
+}
+
+/*
+ * Free a token, including its string for every token type that carries one
+ * (see lex_token_type_has_string_value()). Passing NULL is a no-op.
+ */
+void lex_token_destroy(struct lex_token *tok)
+{
+	if (!tok) {
+		return;
+	}
+
+	if (lex_token_has_string_value(tok)) {
+		free(tok->tok_str);
+	}
+
+	free(tok);
+}
+
+/*
+ * Change the type of `tok` in place and return it.
+ *
+ * If the current type carries a string, the string is freed and tok_str is
+ * reset to NULL. For every other type the value stored in the token is left
+ * as it was; the caller is expected to store the value for the new type.
+ */
+struct lex_token *lex_token_change_type(
+	struct lex_token *tok,
+	enum token_type new_type)
+{
+	if (lex_token_has_string_value(tok)) {
+		free(tok->tok_str);
+		tok->tok_str = NULL;
+	}
+
+	tok->tok_type = new_type;
+	return tok;
+}
+
+/* expands to a switch case that returns the enumerator's own name. */
+#define ENUM_STR(x) \
+	case x: \
+		return #x
+
+/* Return the enumerator name of `type`, or "" if it is not a token type. */
+const char *token_type_to_string(enum token_type type)
+{
+	switch (type) {
+		ENUM_STR(TOK_NONE);
+		ENUM_STR(TOK_KEYWORD);
+		ENUM_STR(TOK_SYMBOL);
+		ENUM_STR(TOK_INT);
+		ENUM_STR(TOK_DOUBLE);
+		ENUM_STR(TOK_WORD);
+		ENUM_STR(TOK_VAR);
+		ENUM_STR(TOK_VAR_SPLAT);
+		ENUM_STR(TOK_FLAG);
+		ENUM_STR(TOK_STRING);
+		ENUM_STR(TOK_STR_START);
+		ENUM_STR(TOK_STR_END);
+		ENUM_STR(TOK_LINEFEED);
+	default:
+		return "";
+	}
+}
+
+/* Return the enumerator name of `keyword`, or "" if it is not a keyword. */
+const char *token_keyword_to_string(enum token_keyword keyword)
+{
+	switch (keyword) {
+		ENUM_STR(KW_NONE);
+		ENUM_STR(KW_FUNC);
+	default:
+		return "";
+	}
+}
+
+/* Return the enumerator name of `sym`, or "" if it is not a symbol. */
+const char *token_symbol_to_string(enum token_symbol sym)
+{
+	switch (sym) {
+		ENUM_STR(SYM_NONE);
+		ENUM_STR(SYM_PLUS);
+		ENUM_STR(SYM_HYPHEN);
+		ENUM_STR(SYM_FORWARD_SLASH);
+		ENUM_STR(SYM_ASTERISK);
+		ENUM_STR(SYM_AMPERSAND);
+		ENUM_STR(SYM_PERCENT);
+		ENUM_STR(SYM_SQUOTE);
+		ENUM_STR(SYM_DQUOTE);
+		ENUM_STR(SYM_HASH);
+		ENUM_STR(SYM_SEMICOLON);
+		ENUM_STR(SYM_COMMA);
+		ENUM_STR(SYM_DOLLAR);
+		ENUM_STR(SYM_DOLLAR_LEFT_PAREN);
+		ENUM_STR(SYM_DOLLAR_LEFT_BRACE);
+		ENUM_STR(SYM_PIPE);
+		ENUM_STR(SYM_AT);
+		ENUM_STR(SYM_AT_LEFT_BRACE);
+		ENUM_STR(SYM_LEFT_BRACE);
+		ENUM_STR(SYM_RIGHT_BRACE);
+		ENUM_STR(SYM_LEFT_BRACKET);
+		ENUM_STR(SYM_RIGHT_BRACKET);
+		ENUM_STR(SYM_LEFT_PAREN);
+		ENUM_STR(SYM_RIGHT_PAREN);
+		ENUM_STR(SYM_EQUAL);
+		ENUM_STR(SYM_PLUS_EQUAL);
+		ENUM_STR(SYM_HYPHEN_EQUAL);
+		ENUM_STR(SYM_FORWARD_SLASH_EQUAL);
+		ENUM_STR(SYM_ASTERISK_EQUAL);
+		ENUM_STR(SYM_PERCENT_EQUAL);
+	default:
+		return "";
+	}
+}
diff --git a/bshell/parse/token.h b/bshell/parse/token.h
new file mode 100644
index 0000000..bf6009c
--- /dev/null
+++ b/bshell/parse/token.h
@@ -0,0 +1,131 @@
+#ifndef BSHELL_PARSE_TOKEN_H_
+#define BSHELL_PARSE_TOKEN_H_
+
+#include
+#include
+
+/* a row/column position in the source text. */
+struct char_cell {
+	unsigned long c_row, c_col;
+};
+
+/* kind of a lexer token. TOK_NONE doubles as "no token available". */
+enum token_type {
+	TOK_NONE = 0,
+	__TOK_INDEX_BASE = 100,
+	TOK_KEYWORD,
+	TOK_SYMBOL,
+	TOK_INT,
+	TOK_DOUBLE,
+	TOK_WORD,
+	TOK_FLAG,
+	TOK_VAR,
+	TOK_VAR_SPLAT,
+	TOK_STRING,
+	TOK_STR_START,
+	TOK_STR_END,
+	TOK_LINEFEED,
+	__TOK_INDEX_LIMIT,
+};
+
+/* value of a TOK_KEYWORD token. */
+enum token_keyword {
+	KW_NONE = 0,
+	__KW_INDEX_BASE = 200,
+	KW_FUNC,
+	__KW_INDEX_LIMIT,
+};
+
+/* value of a TOK_SYMBOL token. */
+enum token_symbol {
+	SYM_NONE = 0,
+	__SYM_INDEX_BASE = 300,
+	SYM_PLUS,
+	SYM_HYPHEN,
+	SYM_FORWARD_SLASH,
+	SYM_ASTERISK,
+	SYM_AMPERSAND,
+	SYM_PERCENT,
+	SYM_SQUOTE,
+	SYM_DQUOTE,
+	SYM_HASH,
+	SYM_SEMICOLON,
+	SYM_COMMA,
+	SYM_DOLLAR,
+	SYM_DOLLAR_LEFT_PAREN,
+	SYM_DOLLAR_LEFT_BRACE,
+	SYM_PIPE,
+	SYM_AT,
+	SYM_AT_LEFT_BRACE,
+	SYM_LEFT_BRACE,
+	SYM_RIGHT_BRACE,
+	SYM_LEFT_BRACKET,
+	SYM_RIGHT_BRACKET,
+	SYM_LEFT_PAREN,
+	SYM_RIGHT_PAREN,
+	SYM_EQUAL,
+	SYM_PLUS_EQUAL,
+	SYM_HYPHEN_EQUAL,
+	SYM_ASTERISK_EQUAL,
+	SYM_FORWARD_SLASH_EQUAL,
+	SYM_PERCENT_EQUAL,
+	__SYM_INDEX_LIMIT,
+};
+
+/*
+ * a single lexer token. which member of the anonymous union is meaningful
+ * depends on tok_type; tok_str is the one used by the types for which
+ * lex_token_type_has_string_value() returns true.
+ */
+struct lex_token {
+	enum token_type tok_type;
+
+	struct char_cell tok_start, tok_end;
+
+	fx_queue_entry tok_entry;
+
+	union {
+		enum token_keyword tok_keyword;
+		enum token_symbol tok_symbol;
+		long long tok_int;
+		double tok_double;
+		char *tok_str;
+	};
+};
+
+extern struct lex_token *lex_token_create(enum token_type type);
+extern struct lex_token *lex_token_create_with_string(
+	enum token_type type,
+	const char *s);
+extern void lex_token_destroy(struct lex_token *tok);
+
+extern struct lex_token *lex_token_change_type(
+	struct lex_token *tok,
+	enum token_type new_type);
+
+/* true if `tok` is a symbol token holding exactly `sym`. */
+static inline bool lex_token_is_symbol(
+	struct lex_token *tok,
+	enum token_symbol sym)
+{
+	return (tok->tok_type == TOK_SYMBOL && tok->tok_symbol == sym);
+}
+/* true if `tok` is a keyword token holding exactly `kw`. */
+static inline bool lex_token_is_keyword(
+	struct lex_token *tok,
+	enum token_keyword kw)
+{
+	return (tok->tok_type == TOK_KEYWORD && tok->tok_keyword == kw);
+}
+/* true if tokens of `type` carry their value in tok_str. */
+static inline bool lex_token_type_has_string_value(enum token_type type)
+{
+	switch (type) {
+	case TOK_WORD:
+	case TOK_STRING:
+	case TOK_FLAG:
+	case TOK_VAR:
+	case TOK_VAR_SPLAT:
+		return true;
+	default:
+		return false;
+	}
+}
+/* true if `tok` carries its value in tok_str. */
+static inline bool lex_token_has_string_value(const struct lex_token *tok)
+{
+	return lex_token_type_has_string_value(tok->tok_type);
+}
+
+extern const char *token_type_to_string(enum token_type type);
+extern const char *token_keyword_to_string(enum token_keyword keyword);
+extern const char *token_symbol_to_string(enum token_symbol sym);
+
+#endif