From 682cbdc216cb61e09c44e68ef15339abb7ce708d Mon Sep 17 00:00:00 2001 From: yenru0 Date: Sat, 15 Nov 2025 08:01:34 +0900 Subject: [PATCH] minor implement lexer --- .gitignore | 4 +- Makefile | 23 +++++ README.md | 10 +-- include/globals.h | 13 ++- include/lex.h | 4 +- include/util.h | 12 +++ src/lex.c | 213 +++++++++++++++++++++++++++++++++++++++++++++- src/main.c | 27 ++++++ src/util.c | 107 +++++++++++++++++++++++ test.cval | 14 +-- 10 files changed, 407 insertions(+), 20 deletions(-) create mode 100644 include/util.h create mode 100644 src/main.c create mode 100644 src/util.c diff --git a/.gitignore b/.gitignore index 600d2d3..e37625c 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ -.vscode \ No newline at end of file +.vscode + +build \ No newline at end of file diff --git a/Makefile b/Makefile index e69de29..ff9953f 100644 --- a/Makefile +++ b/Makefile @@ -0,0 +1,23 @@ +CC := gcc + +CFLAGS := -Wall -Wextra -Werror -g -I./include + +BUILD_DIR := build + +SRC := $(wildcard src/*.c) +OBJ := $(patsubst src/%.c,${BUILD_DIR}/%.o,$(SRC)) + +TARGET := cval.out + +.PHONY: all clean + +all: $(TARGET) + +$(TARGET): $(OBJ) + $(CC) $(CFLAGS) -o $(BUILD_DIR)/$(TARGET) $(OBJ) + +$(BUILD_DIR)/%.o: src/%.c + $(CC) $(CFLAGS) -c $< -o $@ + +clean: + rm -f $(OBJ) $(BUILD_DIR)/$(TARGET) \ No newline at end of file diff --git a/README.md b/README.md index 587f82c..5491ef2 100644 --- a/README.md +++ b/README.md @@ -8,18 +8,16 @@ * RCURLY `}` * LPAREN `(` * RPAREN `)` -* ID `[any]` +* ID `[any character without whitespace]+` * SEMI `;` * COMMA `,` * ARROW `->` * STAR `*` * ANDREF `&` * DOLLAR `$` -* COMMENT `#` -* NUM `[0-9]*(.[0-9]+)?` -* RETURN `return` -* IF `if` -* ELSE `else` +* COMMENT `//` +* NUM `[0-9]*` +* VAL * STRING `"{any}"` ## Syntax Spec diff --git a/include/globals.h b/include/globals.h index c8f2fa1..17a95d3 100644 --- a/include/globals.h +++ b/include/globals.h @@ -1,5 +1,6 @@ #pragma once #include +#include typedef enum { LBRACK, @@ -21,15 +22,23 @@ typedef enum { NUM, STRING_LITERAL, + VAL, RETURN, IF, ELSE, - + + EOF_TOKEN, ERROR } TokenType; +typedef struct { + size_t len; + char* string; +} TokenString; + typedef struct { TokenType type; - char *data; uint32_t line; + TokenString data; } Token; + diff --git a/include/lex.h b/include/lex.h index db36da2..bde5b41 100644 --- a/include/lex.h +++ b/include/lex.h @@ -22,7 +22,7 @@ typedef struct Lexer { size_t bytes_in_buffer; } Lexer; -Lexer *new_lexer(); +Lexer *lexer_new(); void lexer_set_source(Lexer *lexer, FILE *source); @@ -36,4 +36,4 @@ char lexer_peek(Lexer *lexer); Token lexer_next_token(Lexer *lexer); -void free_lexer(Lexer *lexer); +void lexer_free(Lexer *lexer); diff --git a/include/util.h b/include/util.h new file mode 100644 index 0000000..31dd233 --- /dev/null +++ b/include/util.h @@ -0,0 +1,12 @@ +#pragma once +#include "globals.h" + +int is_whitespace(char c); + +int is_alpha(char c); + +int is_digit(char c); + +int is_alpha_digit(char c); + +void print_token(Token tok); \ No newline at end of file diff --git a/src/lex.c b/src/lex.c index bf291da..383a4a0 100644 --- a/src/lex.c +++ b/src/lex.c @@ -1,6 +1,11 @@ #include "lex.h" -Lexer *new_lexer() { +#include "util.h" +#include +#include +#include + +Lexer *lexer_new() { Lexer *lexer = malloc(sizeof(Lexer)); lexer->linepos = 0; @@ -13,6 +18,8 @@ Lexer *new_lexer() { lexer->buffer = calloc(LEX_BUF_SIZE, sizeof(char)); lexer->curr = lexer->buffer; lexer->buffer[0] = '\0'; + + return lexer; } void lexer_set_source(Lexer *lexer, FILE *source) { @@ -20,6 +27,9 @@ void lexer_set_source(Lexer *lexer, FILE *source) { lexer->linepos = 0; lexer->source = source; + + lexer_fill_buffer(lexer); + } void lexer_fill_buffer(Lexer *lexer) { @@ -56,6 +66,13 @@ void lexer_consume(Lexer *lexer) { if (read == LEX_BUF_SIZE - 2) { lexer_fill_buffer(lexer); } else { + if (lexer_curr(lexer) == '\n') { + lexer->lineno += 1; + lexer->linepos = 0; + } else { + lexer->linepos++; + } + lexer->curr++; } } @@ -69,10 +86,200 @@ char lexer_peek(Lexer *lexer) { } Token lexer_next_token(Lexer *lexer) { - // TODO: implement + char c = lexer_curr(lexer); + if (c == '\0' && lexer->flag_EOF) { + Token eof_tok; + eof_tok.type = EOF_TOKEN; + eof_tok.line = lexer->lineno; + return eof_tok; + } + Token tok; + char *tmp = NULL; + while (is_whitespace(c)) { + lexer_consume(lexer); + c = lexer_curr(lexer); + } + switch (c) { + case '[': + tok.type = LBRACK; + tok.line = lexer->lineno; + lexer_consume(lexer); + break; + case ']': + tok.type = RBRACK; + tok.line = lexer->lineno; + lexer_consume(lexer); + break; + case '{': + tok.type = LCURLY; + tok.line = lexer->lineno; + lexer_consume(lexer); + break; + case '}': + tok.type = RCURLY; + tok.line = lexer->lineno; + lexer_consume(lexer); + break; + case '(': + tok.type = LPAREN; + tok.line = lexer->lineno; + lexer_consume(lexer); + break; + case ')': + tok.type = RPAREN; + tok.line = lexer->lineno; + lexer_consume(lexer); + break; + case ';': + tok.type = SEMI; + tok.line = lexer->lineno; + lexer_consume(lexer); + break; + case ',': + tok.type = COMMA; + tok.line = lexer->lineno; + lexer_consume(lexer); + break; + case '*': + tok.type = STAR; + tok.line = lexer->lineno; + lexer_consume(lexer); + break; + case '&': + tok.type = ANDREF; + tok.line = lexer->lineno; + lexer_consume(lexer); + break; + case '/': + if (lexer_peek(lexer) == '/') { + while (lexer_curr(lexer) != '\n' && lexer_curr(lexer) != '\0') { + lexer_consume(lexer); + } + if (lexer_curr(lexer) == '\n') { + lexer_consume(lexer); + } + return lexer_next_token(lexer); + } else { + tok.type = ERROR; + tok.line = lexer->lineno; + } + break; + case '$': + tok.type = DOLLAR; + tok.line = lexer->lineno; + lexer_consume(lexer); + break; + case '-': + if (lexer_peek(lexer) == '>') { + tok.type = ARROW; + tok.line = lexer->lineno; + lexer_consume(lexer); + lexer_consume(lexer); + } else { + tok.type = ERROR; + tok.line = lexer->lineno; + } + break; + case '\"': + tmp = calloc(4096, sizeof(char)); + size_t i = 0; + char to_be_buff; + lexer_consume(lexer); + to_be_buff = lexer_curr(lexer); + while (to_be_buff != '\"') { + tmp[i] = to_be_buff; + i++; + lexer_consume(lexer); + to_be_buff = lexer_curr(lexer); + if (to_be_buff == '\n') { + tok.type = ERROR; + tok.line = lexer->lineno; + lexer_consume(lexer); + goto final; + } + } + tok.type = STRING_LITERAL; + tok.line = lexer->lineno; + tok.data.len = i; + tok.data.string = calloc(i + 1, sizeof(char)); + strncpy(tok.data.string, tmp, i + 1); + tok.data.string[i] = '\0'; + + lexer_consume(lexer); + break; + case '\0': + tok.type = ERROR; + tok.line = lexer->lineno; + break; + default: + if (is_digit(c)) { + tmp = calloc(4096, sizeof(char)); + size_t i = 0; + char tbb; + tmp[i] = c; + i++; + lexer_consume(lexer); + tbb = lexer_curr(lexer); + while (is_digit(tbb)) { + tmp[i] = tbb; + i++; + lexer_consume(lexer); + tbb = lexer_curr(lexer); + } + tok.type = NUM; + tok.line = lexer->lineno; + tok.data.len = i; + tok.data.string = calloc(i + 1, sizeof(char)); + tok.data.string[i] = '\0'; + strncpy(tok.data.string, tmp, i + 1); + } else if (is_alpha(c)) { + tmp = calloc(4096, sizeof(char)); + size_t i = 0; + char tbb; + tmp[i] = c; + i++; + lexer_consume(lexer); + tbb = lexer_curr(lexer); + while (is_alpha_digit(tbb)) { + tmp[i] = tbb; + i++; + lexer_consume(lexer); + tbb = lexer_curr(lexer); + } + tmp[i] = '\0'; + + // Check for keywords + tok.line = lexer->lineno; + if (strcmp(tmp, "val") == 0) { + tok.type = VAL;// VAL token type not in enum, treating as ID + } else if (strcmp(tmp, "return") == 0) { + tok.type = RETURN; + } else if (strcmp(tmp, "if") == 0) { + tok.type = IF; + } else if (strcmp(tmp, "else") == 0) { + tok.type = ELSE; + } else { + tok.type = ID; + tok.data.len = i; + tok.data.string = calloc(i + 1, sizeof(char)); + strncpy(tok.data.string, tmp, i + 1); + tok.data.string[i] = '\0'; + } + + } else { + tok.type = ERROR; + tok.line = lexer->lineno; + } + break; + } +final: + if (tmp != NULL) { + free(tmp); + } + return tok; } -void free_lexer(Lexer *lexer) { +void lexer_free(Lexer *lexer) { free(lexer->buffer); free(lexer); } \ No newline at end of file diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..ca9c220 --- /dev/null +++ b/src/main.c @@ -0,0 +1,27 @@ +#include "globals.h" +#include "lex.h" +#include "util.h" + +int main() { + Lexer *lexer = lexer_new(); + + FILE *f = fopen("test.cval", "r"); + + if (f == NULL) { + perror("Failed to open file"); + return 1; + } + + lexer_set_source(lexer, f); + + Token tok; + + do { + tok = lexer_next_token(lexer); + print_token(tok); + } while(tok.type != EOF_TOKEN && tok.type != ERROR); + + fclose(f); + + return 0; +} \ No newline at end of file diff --git a/src/util.c b/src/util.c new file mode 100644 index 0000000..5c6f902 --- /dev/null +++ b/src/util.c @@ -0,0 +1,107 @@ +#include "util.h" + +#include +#include + +int is_whitespace(char c) { + return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\v' || c == '\f'; +} + +int is_alpha(char c) { + char tmp_lower = c - 'a'; + char tmp_upper = c - 'A'; + + return (tmp_lower <= ('z' - 'a') && 0 <= tmp_lower) || (tmp_upper <= ('Z' - 'A') && 0 <= tmp_upper); +} + +int is_digit(char c) { + char tmp = c - '0'; + return 0 <= tmp && tmp <= 9; +} + +int is_alpha_digit(char c) { + return is_digit(c) || is_alpha(c) || c == '_'; +} + +void print_token(Token tok) { + char token_name[20]; + + switch (tok.type) { + case LBRACK: + strcpy(token_name, "LBRACK"); + break; + case RBRACK: + strcpy(token_name, "RBRACK"); + break; + case LCURLY: + strcpy(token_name, "LCURLY"); + break; + case RCURLY: + strcpy(token_name, "RCURLY"); + break; + case LPAREN: + strcpy(token_name, "LPAREN"); + break; + case RPAREN: + strcpy(token_name, "RPAREN"); + break; + case SEMI: + strcpy(token_name, "SEMI"); + break; + case COMMA: + strcpy(token_name, "COMMA"); + break; + case ARROW: + strcpy(token_name, "ARROW"); + break; + case STAR: + strcpy(token_name, "STAR"); + break; + case ANDREF: + strcpy(token_name, "ANDREF"); + break; + case DOLLAR: + strcpy(token_name, "DOLLAR"); + break; + case COMMENT: + strcpy(token_name, "COMMENT"); + break; + case ID: + strcpy(token_name, "ID"); + break; + case NUM: + strcpy(token_name, "NUM"); + break; + case STRING_LITERAL: + strcpy(token_name, "STRING_LITERAL"); + break; + case VAL: + strcpy(token_name, "VAL"); + break; + case RETURN: + strcpy(token_name, "RETURN"); + break; + case IF: + strcpy(token_name, "IF"); + break; + case ELSE: + strcpy(token_name, "ELSE"); + break; + case EOF_TOKEN: + strcpy(token_name, "EOF_TOKEN"); + break; + case ERROR: + strcpy(token_name, "ERROR"); + break; + default: + strcpy(token_name, "UNKNOWN"); + break; + } + + printf("Token Name: %s ", token_name); + printf("Line: %u ", tok.line); + if (tok.type == ID || tok.type == NUM || tok.type == STRING_LITERAL) { + printf("Data: %s", tok.data.string); + } + printf("\n"); +} \ No newline at end of file diff --git a/test.cval b/test.cval index 3efec16..a316d79 100644 --- a/test.cval +++ b/test.cval @@ -1,19 +1,21 @@ -val int s 3; # global +val int s 3; // global val [int->int] fib (int n) { - return if == n 0 { + return if eq n 0 { 1 } else { - n * fib + mul n fib }; }; val [->int] main { val str line input; - val int n to_int line,; - if < n 0 { + val int n to_int line; + if less n 0 { print "illegal number" } else { - print to_str fib n,; + print to_str { + fib n + }; } }; \ No newline at end of file