|
|
@@ -0,0 +1,141 @@ |
|
|
|
#include <stdlib.h> |
|
|
|
#include <stdio.h> |
|
|
|
#include <regex.h> |
|
|
|
|
|
|
|
#include "lexer.h" |
|
|
|
|
|
|
|
/**
 * Compile `str` as a POSIX extended regular expression.
 *
 * Compilation failure is treated as a programming error: the pattern, the
 * numeric error code and the library's description of it are written to
 * stderr and the process exits with status 1, so callers never see an
 * invalid regex_t.
 *
 * @param str  NUL-terminated pattern in POSIX ERE syntax.
 * @return     The compiled expression, by value. Release it with regfree().
 */
static regex_t regex(const char* str)
{
    regex_t reg;

    int err = regcomp(&reg, str, REG_EXTENDED);
    if (err)
    {
        /* regerror() turns the numeric code into a human-readable reason. */
        char reason[128];
        regerror(err, &reg, reason, sizeof reason);

        fprintf(stderr, "Invalid regex: '%s'\n", str);
        fprintf(stderr, "Error code: %i (%s)\n", err, reason);
        exit(1);
    }

    return reg;
}
|
|
|
|
|
|
|
/**
 * Run a compiled regex against `str` and report whether it matched.
 *
 * regexec() searches the whole string, so the match may start anywhere
 * unless the pattern itself is anchored; inspect pmatch->rm_so to find out
 * where it began.
 *
 * @param reg     Compiled expression (see regex()).
 * @param str     NUL-terminated subject string.
 * @param pmatch  Receives the byte offsets of the overall match, relative to
 *                `str`. Only meaningful when the function returns 1.
 * @return        1 if the expression matched, 0 otherwise.
 */
static int regmatch(regex_t reg, const char* str, regmatch_t* pmatch)
{
    /*
     * Only a return of 0 means "matched". Any other value (REG_NOMATCH or an
     * implementation error such as REG_ESPACE) leaves *pmatch unspecified and
     * must not be reported as a match.
     */
    return regexec(&reg, str, 1, pmatch, 0) == 0;
}
|
|
|
|
|
|
|
static int tokens_append( |
|
|
|
lexer_tokens* tokens, |
|
|
|
lexer_token_class tokenClass, |
|
|
|
char* str, |
|
|
|
size_t len) |
|
|
|
{ |
|
|
|
tokens->length += 1; |
|
|
|
if (tokens->length > tokens->allocd) |
|
|
|
{ |
|
|
|
tokens->allocd *= 2; |
|
|
|
tokens->pairs = realloc(tokens->pairs, tokens->allocd); |
|
|
|
if (!tokens->pairs) |
|
|
|
return 1; |
|
|
|
} |
|
|
|
|
|
|
|
lexer_token pair = tokens->pairs[tokens->length - 1]; |
|
|
|
pair.tokenClass = tokenClass; |
|
|
|
pair.str = str; |
|
|
|
pair.len = len; |
|
|
|
|
|
|
|
return 0; |
|
|
|
} |
|
|
|
|
|
|
|
lexer_tokens* lexer_analyze(char* str) |
|
|
|
{ |
|
|
|
regex_t whitespace = regex("\\s+"); |
|
|
|
regex_t identifier = regex("[a-zA-Z][a-zA-Z0-9]*"); |
|
|
|
regex_t keyword = regex("if|ret|func|str|arr|err|null"); |
|
|
|
regex_t operator = regex("=|=="); |
|
|
|
regex_t separator = regex("\\,"); |
|
|
|
|
|
|
|
regex_t integer = regex("[0-9]+"); |
|
|
|
regex_t string = regex("\\\"[^\\\"]*\\\""); |
|
|
|
|
|
|
|
regex_t function_start = regex("\\{"); |
|
|
|
regex_t function_end = regex("\\}"); |
|
|
|
regex_t expression_start = regex("\\("); |
|
|
|
regex_t expression_end = regex("\\)"); |
|
|
|
|
|
|
|
lexer_tokens* tokens = malloc(sizeof(lexer_tokens)); |
|
|
|
if (!tokens) |
|
|
|
return NULL; |
|
|
|
|
|
|
|
tokens->pairs = NULL; |
|
|
|
tokens->length = 0; |
|
|
|
tokens->allocd = 0; |
|
|
|
|
|
|
|
regmatch_t pmatch; |
|
|
|
size_t offset = 0; |
|
|
|
|
|
|
|
#define APPEND(tclass) tokens_append(tokens, tclass, str + offset + pmatch.rm_so, pmatch.rm_eo - pmatch.rm_so) |
|
|
|
|
|
|
|
while (1) |
|
|
|
{ |
|
|
|
if (regmatch(whitespace, str + offset, &pmatch)) |
|
|
|
{ |
|
|
|
if (APPEND(LEXER_TOKEN_WHITESPACE)) |
|
|
|
return NULL; |
|
|
|
} |
|
|
|
else if (regmatch(identifier, str + offset, &pmatch)) |
|
|
|
{ |
|
|
|
if (APPEND(LEXER_TOKEN_IDENTIFIER)) |
|
|
|
return NULL; |
|
|
|
} |
|
|
|
else if (regmatch(keyword, str + offset, &pmatch)) |
|
|
|
{ |
|
|
|
if (APPEND(LEXER_TOKEN_KEYWORD)) |
|
|
|
return NULL; |
|
|
|
} else if (regmatch(operator, str + offset, &pmatch)) |
|
|
|
{ |
|
|
|
if (APPEND(LEXER_TOKEN_OPERATOR)) |
|
|
|
return NULL; |
|
|
|
} |
|
|
|
else if (regmatch(separator, str + offset, &pmatch)) |
|
|
|
{ |
|
|
|
if (APPEND(LEXER_TOKEN_SEPARATOR)) |
|
|
|
return NULL; |
|
|
|
} |
|
|
|
else if (regmatch(integer, str + offset, &pmatch)) |
|
|
|
{ |
|
|
|
if (APPEND(LEXER_TOKEN_INTEGER)) |
|
|
|
return NULL; |
|
|
|
} |
|
|
|
else if (regmatch(string, str + offset, &pmatch)) |
|
|
|
{ |
|
|
|
if (APPEND(LEXER_TOKEN_STRING)) |
|
|
|
return NULL; |
|
|
|
} |
|
|
|
else if (regmatch(function_start, str + offset, &pmatch)) |
|
|
|
{ |
|
|
|
if (APPEND(LEXER_TOKEN_FUNCTION_START)) |
|
|
|
return NULL; |
|
|
|
} |
|
|
|
else if (regmatch(function_end, str + offset, &pmatch)) |
|
|
|
{ |
|
|
|
if (APPEND(LEXER_TOKEN_FUNCTION_END)) |
|
|
|
return NULL; |
|
|
|
} |
|
|
|
else if (regmatch(expression_start, str + offset, &pmatch)) |
|
|
|
{ |
|
|
|
if (APPEND(LEXER_TOKEN_EXPRESSION_START)) |
|
|
|
return NULL; |
|
|
|
} |
|
|
|
else if (regmatch(expression_end, str + offset, &pmatch)) |
|
|
|
{ |
|
|
|
if (APPEND(LEXER_TOKEN_EXPRESSION_END)) |
|
|
|
return NULL; |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
return tokens; |
|
|
|
} |
|
|
|
|
|
|
|
offset += pmatch.rm_eo; |
|
|
|
} |
|
|
|
} |