minor implement lexer

This commit is contained in:
2025-11-15 08:01:34 +09:00
parent 171006117e
commit 682cbdc216
10 changed files with 407 additions and 20 deletions

4
.gitignore vendored
View File

@@ -1 +1,3 @@
.vscode
.vscode
build

View File

@@ -0,0 +1,23 @@
# Build configuration for the cval lexer.
CC := gcc
CFLAGS := -Wall -Wextra -Werror -g -I./include
BUILD_DIR := build
SRC := $(wildcard src/*.c)
OBJ := $(patsubst src/%.c,$(BUILD_DIR)/%.o,$(SRC))
TARGET := cval.out
# Real path of the linked binary; rules must name the file they produce,
# otherwise make relinks on every invocation.
BIN := $(BUILD_DIR)/$(TARGET)

.PHONY: all clean $(TARGET)

all: $(BIN)

# `make cval.out` keeps working as an alias for the real binary.
$(TARGET): $(BIN)

$(BIN): $(OBJ)
	$(CC) $(CFLAGS) -o $@ $(OBJ)

# Order-only prerequisite: the directory must exist, but its timestamp
# must not force objects to be rebuilt.
$(BUILD_DIR)/%.o: src/%.c | $(BUILD_DIR)
	$(CC) $(CFLAGS) -c $< -o $@

$(BUILD_DIR):
	mkdir -p $@

clean:
	rm -f $(OBJ) $(BIN)

View File

@@ -8,18 +8,16 @@
* RCURLY `}`
* LPAREN `(`
* RPAREN `)`
* ID `[any]`
* ID `[any character without whitespace]+`
* SEMI `;`
* COMMA `,`
* ARROW `->`
* STAR `*`
* ANDREF `&`
* DOLLAR `$`
* COMMENT `#`
* NUM `[0-9]*(.[0-9]+)?`
* RETURN `return`
* IF `if`
* ELSE `else`
* COMMENT `//`
* NUM `[0-9]+`
* VAL
* STRING `"{any}"`
## Syntax Spec

View File

@@ -1,5 +1,6 @@
#pragma once
#include <stdint.h>
#include <stddef.h>
typedef enum {
LBRACK,
@@ -21,15 +22,23 @@ typedef enum {
NUM,
STRING_LITERAL,
VAL,
RETURN,
IF,
ELSE,
EOF_TOKEN,
ERROR
} TokenType;
/*
 * Text payload carried by a token: a character pointer together with
 * the number of characters it holds.
 */
typedef struct {
    size_t len;   /* character count of `string` */
    char *string; /* payload text */
} TokenString;
typedef struct {
TokenType type;
char *data;
uint32_t line;
TokenString data;
} Token;

View File

@@ -22,7 +22,7 @@ typedef struct Lexer {
size_t bytes_in_buffer;
} Lexer;
Lexer *new_lexer();
Lexer *lexer_new();
void lexer_set_source(Lexer *lexer, FILE *source);
@@ -36,4 +36,4 @@ char lexer_peek(Lexer *lexer);
Token lexer_next_token(Lexer *lexer);
void free_lexer(Lexer *lexer);
void lexer_free(Lexer *lexer);

12
include/util.h Normal file
View File

@@ -0,0 +1,12 @@
#pragma once
#include "globals.h"
int is_whitespace(char c);
int is_alpha(char c);
int is_digit(char c);
int is_alpha_digit(char c);
void print_token(Token tok);

213
src/lex.c
View File

@@ -1,6 +1,11 @@
#include "lex.h"
Lexer *new_lexer() {
#include "util.h"
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
Lexer *lexer_new() {
Lexer *lexer = malloc(sizeof(Lexer));
lexer->linepos = 0;
@@ -13,6 +18,8 @@ Lexer *new_lexer() {
lexer->buffer = calloc(LEX_BUF_SIZE, sizeof(char));
lexer->curr = lexer->buffer;
lexer->buffer[0] = '\0';
return lexer;
}
void lexer_set_source(Lexer *lexer, FILE *source) {
@@ -20,6 +27,9 @@ void lexer_set_source(Lexer *lexer, FILE *source) {
lexer->linepos = 0;
lexer->source = source;
lexer_fill_buffer(lexer);
}
void lexer_fill_buffer(Lexer *lexer) {
@@ -56,6 +66,13 @@ void lexer_consume(Lexer *lexer) {
if (read == LEX_BUF_SIZE - 2) {
lexer_fill_buffer(lexer);
} else {
if (lexer_curr(lexer) == '\n') {
lexer->lineno += 1;
lexer->linepos = 0;
} else {
lexer->linepos++;
}
lexer->curr++;
}
}
@@ -69,10 +86,200 @@ char lexer_peek(Lexer *lexer) {
}
Token lexer_next_token(Lexer *lexer) {
// TODO: implement
char c = lexer_curr(lexer);
if (c == '\0' && lexer->flag_EOF) {
Token eof_tok;
eof_tok.type = EOF_TOKEN;
eof_tok.line = lexer->lineno;
return eof_tok;
}
Token tok;
char *tmp = NULL;
while (is_whitespace(c)) {
lexer_consume(lexer);
c = lexer_curr(lexer);
}
switch (c) {
case '[':
tok.type = LBRACK;
tok.line = lexer->lineno;
lexer_consume(lexer);
break;
case ']':
tok.type = RBRACK;
tok.line = lexer->lineno;
lexer_consume(lexer);
break;
case '{':
tok.type = LCURLY;
tok.line = lexer->lineno;
lexer_consume(lexer);
break;
case '}':
tok.type = RCURLY;
tok.line = lexer->lineno;
lexer_consume(lexer);
break;
case '(':
tok.type = LPAREN;
tok.line = lexer->lineno;
lexer_consume(lexer);
break;
case ')':
tok.type = RPAREN;
tok.line = lexer->lineno;
lexer_consume(lexer);
break;
case ';':
tok.type = SEMI;
tok.line = lexer->lineno;
lexer_consume(lexer);
break;
case ',':
tok.type = COMMA;
tok.line = lexer->lineno;
lexer_consume(lexer);
break;
case '*':
tok.type = STAR;
tok.line = lexer->lineno;
lexer_consume(lexer);
break;
case '&':
tok.type = ANDREF;
tok.line = lexer->lineno;
lexer_consume(lexer);
break;
case '/':
if (lexer_peek(lexer) == '/') {
while (lexer_curr(lexer) != '\n' && lexer_curr(lexer) != '\0') {
lexer_consume(lexer);
}
if (lexer_curr(lexer) == '\n') {
lexer_consume(lexer);
}
return lexer_next_token(lexer);
} else {
tok.type = ERROR;
tok.line = lexer->lineno;
}
break;
case '$':
tok.type = DOLLAR;
tok.line = lexer->lineno;
lexer_consume(lexer);
break;
case '-':
if (lexer_peek(lexer) == '>') {
tok.type = ARROW;
tok.line = lexer->lineno;
lexer_consume(lexer);
lexer_consume(lexer);
} else {
tok.type = ERROR;
tok.line = lexer->lineno;
}
break;
case '\"':
tmp = calloc(4096, sizeof(char));
size_t i = 0;
char to_be_buff;
lexer_consume(lexer);
to_be_buff = lexer_curr(lexer);
while (to_be_buff != '\"') {
tmp[i] = to_be_buff;
i++;
lexer_consume(lexer);
to_be_buff = lexer_curr(lexer);
if (to_be_buff == '\n') {
tok.type = ERROR;
tok.line = lexer->lineno;
lexer_consume(lexer);
goto final;
}
}
tok.type = STRING_LITERAL;
tok.line = lexer->lineno;
tok.data.len = i;
tok.data.string = calloc(i + 1, sizeof(char));
strncpy(tok.data.string, tmp, i + 1);
tok.data.string[i] = '\0';
lexer_consume(lexer);
break;
case '\0':
tok.type = ERROR;
tok.line = lexer->lineno;
break;
default:
if (is_digit(c)) {
tmp = calloc(4096, sizeof(char));
size_t i = 0;
char tbb;
tmp[i] = c;
i++;
lexer_consume(lexer);
tbb = lexer_curr(lexer);
while (is_digit(tbb)) {
tmp[i] = tbb;
i++;
lexer_consume(lexer);
tbb = lexer_curr(lexer);
}
tok.type = NUM;
tok.line = lexer->lineno;
tok.data.len = i;
tok.data.string = calloc(i + 1, sizeof(char));
tok.data.string[i] = '\0';
strncpy(tok.data.string, tmp, i + 1);
} else if (is_alpha(c)) {
tmp = calloc(4096, sizeof(char));
size_t i = 0;
char tbb;
tmp[i] = c;
i++;
lexer_consume(lexer);
tbb = lexer_curr(lexer);
while (is_alpha_digit(tbb)) {
tmp[i] = tbb;
i++;
lexer_consume(lexer);
tbb = lexer_curr(lexer);
}
tmp[i] = '\0';
// Check for keywords
tok.line = lexer->lineno;
if (strcmp(tmp, "val") == 0) {
tok.type = VAL;// VAL token type not in enum, treating as ID
} else if (strcmp(tmp, "return") == 0) {
tok.type = RETURN;
} else if (strcmp(tmp, "if") == 0) {
tok.type = IF;
} else if (strcmp(tmp, "else") == 0) {
tok.type = ELSE;
} else {
tok.type = ID;
tok.data.len = i;
tok.data.string = calloc(i + 1, sizeof(char));
strncpy(tok.data.string, tmp, i + 1);
tok.data.string[i] = '\0';
}
} else {
tok.type = ERROR;
tok.line = lexer->lineno;
}
break;
}
final:
if (tmp != NULL) {
free(tmp);
}
return tok;
}
void free_lexer(Lexer *lexer) {
/*
 * Releases a lexer and its character buffer. Passing NULL is a no-op,
 * mirroring free(). The source FILE is not closed here; that remains
 * the caller's job.
 */
void lexer_free(Lexer *lexer) {
    if (lexer == NULL) {
        return;
    }
    free(lexer->buffer);
    free(lexer);
}

27
src/main.c Normal file
View File

@@ -0,0 +1,27 @@
#include "globals.h"
#include "lex.h"
#include "util.h"
int main() {
Lexer *lexer = lexer_new();
FILE *f = fopen("test.cval", "r");
if (f == NULL) {
perror("Failed to open file");
return 1;
}
lexer_set_source(lexer, f);
Token tok;
do {
tok = lexer_next_token(lexer);
print_token(tok);
} while(tok.type != EOF_TOKEN && tok.type != ERROR);
fclose(f);
return 0;
}

107
src/util.c Normal file
View File

@@ -0,0 +1,107 @@
#include "util.h"
#include <stdio.h>
#include <string.h>
/*
 * Returns 1 when `c` is a space, horizontal tab, newline, carriage
 * return, vertical tab or form feed; 0 otherwise.
 */
int is_whitespace(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\v':
    case '\f':
        return 1;
    default:
        return 0;
    }
}
/*
 * Returns 1 when `c` is a letter in 'a'..'z' or 'A'..'Z', 0 otherwise.
 *
 * Compares against the range bounds directly. Subtracting and storing
 * the difference back into a `char` would rely on implementation-defined
 * narrowing, and where `char` is unsigned the `0 <= diff` half is always
 * true, which -Wextra reports and -Werror turns into a build failure.
 * Assumes each letter case is contiguous in the execution character set
 * (true for ASCII).
 */
int is_alpha(char c) {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
/*
 * Returns 1 when `c` is a decimal digit '0'..'9', 0 otherwise.
 *
 * Uses a direct range comparison rather than narrowing `c - '0'` back
 * into a `char`, which is implementation-defined when out of range and
 * produces an always-true `0 <= diff` test where `char` is unsigned.
 */
int is_digit(char c) {
    return c >= '0' && c <= '9';
}
/*
 * Returns 1 when `c` is an underscore, a letter (per is_alpha) or a
 * decimal digit (per is_digit); 0 otherwise.
 */
int is_alpha_digit(char c) {
    if (c == '_') {
        return 1;
    }
    if (is_alpha(c)) {
        return 1;
    }
    return is_digit(c) ? 1 : 0;
}
/*
 * Prints a one-line description of `tok` to stdout: the token type's
 * name ("UNKNOWN" for a value outside the table), its line number and,
 * for ID, NUM and STRING_LITERAL tokens, the payload string.
 */
void print_token(Token tok) {
    /* Display name for each token type, indexed by the enum value. */
    static const char *const labels[] = {
        [LBRACK] = "LBRACK",
        [RBRACK] = "RBRACK",
        [LCURLY] = "LCURLY",
        [RCURLY] = "RCURLY",
        [LPAREN] = "LPAREN",
        [RPAREN] = "RPAREN",
        [SEMI] = "SEMI",
        [COMMA] = "COMMA",
        [ARROW] = "ARROW",
        [STAR] = "STAR",
        [ANDREF] = "ANDREF",
        [DOLLAR] = "DOLLAR",
        [COMMENT] = "COMMENT",
        [ID] = "ID",
        [NUM] = "NUM",
        [STRING_LITERAL] = "STRING_LITERAL",
        [VAL] = "VAL",
        [RETURN] = "RETURN",
        [IF] = "IF",
        [ELSE] = "ELSE",
        [EOF_TOKEN] = "EOF_TOKEN",
        [ERROR] = "ERROR",
    };
    const size_t label_count = sizeof labels / sizeof labels[0];

    const char *label = "UNKNOWN";
    size_t index = (size_t)tok.type;
    if (index < label_count && labels[index] != NULL) {
        label = labels[index];
    }

    printf("Token Name: %s ", label);
    printf("Line: %u ", tok.line);

    /* Only these token kinds carry a payload string. */
    switch (tok.type) {
    case ID:
    case NUM:
    case STRING_LITERAL:
        printf("Data: %s", tok.data.string);
        break;
    default:
        break;
    }
    printf("\n");
}

View File

@@ -1,19 +1,21 @@
val int s 3; # global
val int s 3; // global
val [int->int] fib (int n) {
return if == n 0 {
return if eq n 0 {
1
} else {
n * fib
mul n fib
};
};
val [->int] main {
val str line input;
val int n to_int line,;
if < n 0 {
val int n to_int line;
if less n 0 {
print "illegal number"
} else {
print to_str fib n,;
print to_str {
fib n
};
}
};