|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141 |
- #include <stdlib.h>
- #include <stdio.h>
- #include <regex.h>
-
- #include "lexer.h"
-
/*
 * Compile `str` as a POSIX extended regular expression (REG_EXTENDED).
 *
 * Returns the compiled regex by value; release it with regfree() when it is
 * no longer needed.
 *
 * An invalid pattern is treated as a fatal error: the pattern, the numeric
 * regcomp() code and its regerror() description are written to stderr and
 * the process exits with status 1.
 */
static regex_t regex(const char* str)
{
    regex_t reg;
    int err = regcomp(&reg, str, REG_EXTENDED);
    if (err)
    {
        /* Translate the numeric code into something a human can act on. */
        char msg[256];
        regerror(err, &reg, msg, sizeof msg);

        fprintf(stderr, "Invalid regex: '%s'\n", str);
        fprintf(stderr, "Error code: %i (%s)\n", err, msg);
        exit(1);
    }

    return reg;
}
-
/*
 * Run the compiled regex `reg` against the NUL-terminated string `str`.
 *
 * On a match the offsets of the whole match are stored in `*pmatch`
 * (relative to `str`) and 1 is returned. Returns 0 when there is no match
 * or when regexec() reports an error; `*pmatch` must not be read in that
 * case.
 */
static int regmatch(regex_t reg, const char* str, regmatch_t* pmatch)
{
    /*
     * Only a return of 0 means "matched". Comparing against REG_NOMATCH
     * would treat failures such as REG_ESPACE as a successful match.
     */
    return regexec(&reg, str, 1, pmatch, 0) == 0;
}
-
- static int tokens_append(
- lexer_tokens* tokens,
- lexer_token_class tokenClass,
- char* str,
- size_t len)
- {
- tokens->length += 1;
- if (tokens->length > tokens->allocd)
- {
- tokens->allocd *= 2;
- tokens->pairs = realloc(tokens->pairs, tokens->allocd);
- if (!tokens->pairs)
- return 1;
- }
-
- lexer_token pair = tokens->pairs[tokens->length - 1];
- pair.tokenClass = tokenClass;
- pair.str = str;
- pair.len = len;
-
- return 0;
- }
-
- lexer_tokens* lexer_analyze(char* str)
- {
- regex_t whitespace = regex("\\s+");
- regex_t identifier = regex("[a-zA-Z][a-zA-Z0-9]*");
- regex_t keyword = regex("if|ret|func|str|arr|err|null");
- regex_t operator = regex("=|==");
- regex_t separator = regex("\\,");
-
- regex_t integer = regex("[0-9]+");
- regex_t string = regex("\\\"[^\\\"]*\\\"");
-
- regex_t function_start = regex("\\{");
- regex_t function_end = regex("\\}");
- regex_t expression_start = regex("\\(");
- regex_t expression_end = regex("\\)");
-
- lexer_tokens* tokens = malloc(sizeof(lexer_tokens));
- if (!tokens)
- return NULL;
-
- tokens->pairs = NULL;
- tokens->length = 0;
- tokens->allocd = 0;
-
- regmatch_t pmatch;
- size_t offset = 0;
-
- #define APPEND(tclass) tokens_append(tokens, tclass, str + offset + pmatch.rm_so, pmatch.rm_eo - pmatch.rm_so)
-
- while (1)
- {
- if (regmatch(whitespace, str + offset, &pmatch))
- {
- if (APPEND(LEXER_TOKEN_WHITESPACE))
- return NULL;
- }
- else if (regmatch(identifier, str + offset, &pmatch))
- {
- if (APPEND(LEXER_TOKEN_IDENTIFIER))
- return NULL;
- }
- else if (regmatch(keyword, str + offset, &pmatch))
- {
- if (APPEND(LEXER_TOKEN_KEYWORD))
- return NULL;
- } else if (regmatch(operator, str + offset, &pmatch))
- {
- if (APPEND(LEXER_TOKEN_OPERATOR))
- return NULL;
- }
- else if (regmatch(separator, str + offset, &pmatch))
- {
- if (APPEND(LEXER_TOKEN_SEPARATOR))
- return NULL;
- }
- else if (regmatch(integer, str + offset, &pmatch))
- {
- if (APPEND(LEXER_TOKEN_INTEGER))
- return NULL;
- }
- else if (regmatch(string, str + offset, &pmatch))
- {
- if (APPEND(LEXER_TOKEN_STRING))
- return NULL;
- }
- else if (regmatch(function_start, str + offset, &pmatch))
- {
- if (APPEND(LEXER_TOKEN_FUNCTION_START))
- return NULL;
- }
- else if (regmatch(function_end, str + offset, &pmatch))
- {
- if (APPEND(LEXER_TOKEN_FUNCTION_END))
- return NULL;
- }
- else if (regmatch(expression_start, str + offset, &pmatch))
- {
- if (APPEND(LEXER_TOKEN_EXPRESSION_START))
- return NULL;
- }
- else if (regmatch(expression_end, str + offset, &pmatch))
- {
- if (APPEND(LEXER_TOKEN_EXPRESSION_END))
- return NULL;
- }
- else
- {
- return tokens;
- }
-
- offset += pmatch.rm_eo;
- }
- }
|