123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
- #include <stdlib.h>
- #include <stdio.h>
- #include <string.h>
- #include <regex.h>
-
- #include "lexer.h"
-
- struct token_reg
- {
- regex_t reg;
- lexer_token_class tokenClass;
- int matched;
- regmatch_t pmatch;
- };
-
- static struct token_reg regex(const char* str, lexer_token_class tokenClass)
- {
- regex_t reg;
- int err = regcomp(®, str, REG_EXTENDED);
- if (err)
- {
- fprintf(stderr, "Invalid regex: '%s'\n", str);
- fprintf(stderr, "Error code: %i\n", err);
- exit(1);
- }
-
- struct token_reg res =
- {
- reg,
- tokenClass,
- 0
- };
-
- return res;
- }
-
// Run the compiled expression `reg` against `str`.
//
// Returns 1 when regexec reports a match, with the offsets of the whole
// match stored in `*pmatch`. Returns 0 when there is no match or when
// regexec fails with any other non-zero code, so an execution error is
// never mistaken for a match.
//
// `reg` is taken by value so existing call sites keep working.
static int regmatch(regex_t reg, const char* str, regmatch_t* pmatch)
{
    return regexec(&reg, str, 1, pmatch, 0) == 0;
}
-
- static int tokens_append(
- lexer_tokens* tokens,
- lexer_token_class tokenClass,
- char* str,
- size_t len)
- {
- puts("hi");
- tokens->length += 1;
- if (tokens->length > tokens->allocd)
- {
- if (tokens->allocd)
- tokens->allocd *= 2;
- else
- tokens->allocd = 1;
-
- tokens->pairs = realloc(tokens->pairs, tokens->allocd);
- if (!tokens->pairs)
- return 1;
- puts("realloced");
- }
-
- puts("appending");
- lexer_token* pair = &(tokens->pairs[tokens->length - 1]);
- pair->tokenClass = tokenClass;
- pair->str = str;
- pair->len = len;
-
- printf("class: %i, pair length: %i, tokens length: %i, allocd: %i, str: %s\n", pair->tokenClass, (int)(pair->len), (int)(tokens->length), (int)(tokens->length), pair->str);
-
- puts("bye\n");
- return 0;
- }
-
- lexer_tokens* lexer_analyze(char* str)
- {
- printf("str: %s\n", str);
-
- size_t len = strlen(str);
-
- struct token_reg tregs[] =
- {
- regex("\\s+", LEXER_TOKEN_WHITESPACE),
- regex("[a-zA-Z][a-zA-Z0-9]*", LEXER_TOKEN_IDENTIFIER),
- regex("if|ret|func|str|arr|err|null", LEXER_TOKEN_KEYWORD),
- regex("=|==", LEXER_TOKEN_OPERATOR),
- regex("\\,", LEXER_TOKEN_SEPARATOR),
-
- regex("[0-9]+", LEXER_TOKEN_INTEGER),
- regex("\\\"[^\\\"]*\\\"", LEXER_TOKEN_STRING),
-
- regex("\\{", LEXER_TOKEN_FUNCTION_START),
- regex("\\}", LEXER_TOKEN_FUNCTION_END),
- regex("\\(", LEXER_TOKEN_EXPRESSION_START),
- regex("\\)", LEXER_TOKEN_EXPRESSION_END)
- };
-
- lexer_tokens* tokens = malloc(sizeof(lexer_tokens));
- if (!tokens)
- return NULL;
-
- tokens->pairs = NULL;
- tokens->length = 0;
- tokens->allocd = 0;
-
- regmatch_t pmatch;
- size_t offset = 0;
-
- #define APPEND(treg) tokens_append( \
- tokens, \
- treg.tokenClass, \
- str + offset + treg.pmatch.rm_so, \
- treg.pmatch.rm_eo - treg.pmatch.rm_so \
- )
-
- #define LENGTH(treg) (treg.pmatch.rm_eo - treg.pmatch.rm_so)
-
- while (1)
- {
- size_t i;
-
- //Reset .matched property for all tregs
- for (i = 0; i < LEXER_TOKEN_NONE; ++i)
- {
- tregs[i].matched = 0;
- }
-
- //See which tregs match
- for (i = 0; i < LEXER_TOKEN_NONE; ++i)
- {
- struct token_reg treg = tregs[i];
- if (regmatch(treg.reg, str + offset, &pmatch))
- {
- treg.matched = 1;
- treg.pmatch = pmatch;
- }
- }
-
- //Find longest match
- struct
- {
- size_t length;
- size_t index;
- } longest;
- longest.length = -1;
- longest.index = -1;
- for (i = 0; i < LEXER_TOKEN_NONE; ++i)
- {
- struct token_reg treg = tregs[i];
- if (!treg.matched)
- continue;
-
- if (LENGTH(treg) >= longest.length)
- {
- longest.length = LENGTH(treg);
- longest.index = i;
- }
- }
-
- //Append longest match
- APPEND(tregs[longest.index]);
-
- offset += pmatch.rm_eo;
- if (offset >= len)
- break;
- }
-
- #undef APPEND
- #undef LENGTH
-
- return tokens;
- }
|