Browse Source

something lexy

master
mort 8 years ago
parent
commit
2addc34b86
3 changed files with 92 additions and 75 deletions
  1. 60
    74
      src/lexer.c
  2. 3
    1
      src/lexer.h
  3. 29
    0
      src/main.c

+ 60
- 74
src/lexer.c View File

@@ -1,10 +1,17 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <regex.h>

#include "lexer.h"

static regex_t regex(const char* str)
/*
 * Pairs a compiled regular expression with the token class to emit when
 * that expression matches. Instances are built by regex() below.
 */
struct token_reg
{
/* Compiled pattern (filled in by regcomp() inside regex()). */
regex_t reg;
/* Token class associated with this pattern. */
lexer_token_class tokenClass;
};

static struct token_reg regex(const char* str, lexer_token_class tokenClass)
{
regex_t reg;
int err = regcomp(&reg, str, REG_EXTENDED);
@@ -15,7 +22,13 @@ static regex_t regex(const char* str)
exit(1);
}

return reg;
struct token_reg res =
{
reg,
tokenClass
};

return res;
}

static int regmatch(regex_t reg, char* str, regmatch_t* pmatch)
@@ -29,38 +42,55 @@ static int tokens_append(
char* str,
size_t len)
{
puts("hi");
tokens->length += 1;
if (tokens->length > tokens->allocd)
{
tokens->allocd *= 2;
if (tokens->allocd)
tokens->allocd *= 2;
else
tokens->allocd = 1;

tokens->pairs = realloc(tokens->pairs, tokens->allocd);
if (!tokens->pairs)
return 1;
puts("realloced");
}

lexer_token pair = tokens->pairs[tokens->length - 1];
pair.tokenClass = tokenClass;
pair.str = str;
pair.len = len;
puts("appending");
lexer_token* pair = &(tokens->pairs[tokens->length - 1]);
pair->tokenClass = tokenClass;
pair->str = str;
pair->len = len;

printf("class: %i, pair length: %i, tokens length: %i, allocd: %i, str: %s\n", pair->tokenClass, (int)(pair->len), (int)(tokens->length), (int)(tokens->length), pair->str);

puts("bye\n");
return 0;
}

lexer_tokens* lexer_analyze(char* str)
{
regex_t whitespace = regex("\\s+");
regex_t identifier = regex("[a-zA-Z][a-zA-Z0-9]*");
regex_t keyword = regex("if|ret|func|str|arr|err|null");
regex_t operator = regex("=|==");
regex_t separator = regex("\\,");
printf("str: %s\n", str);

regex_t integer = regex("[0-9]+");
regex_t string = regex("\\\"[^\\\"]*\\\"");
size_t len = strlen(str);

regex_t function_start = regex("\\{");
regex_t function_end = regex("\\}");
regex_t expression_start = regex("\\(");
regex_t expression_end = regex("\\)");
struct token_reg tregs[] =
{
regex("\\s+", LEXER_TOKEN_WHITESPACE),
regex("[a-zA-Z][a-zA-Z0-9]*", LEXER_TOKEN_IDENTIFIER),
regex("if|ret|func|str|arr|err|null", LEXER_TOKEN_KEYWORD),
regex("=|==", LEXER_TOKEN_OPERATOR),
regex("\\,", LEXER_TOKEN_SEPARATOR),

regex("[0-9]+", LEXER_TOKEN_INTEGER),
regex("\\\"[^\\\"]*\\\"", LEXER_TOKEN_STRING),

regex("\\{", LEXER_TOKEN_FUNCTION_START),
regex("\\}", LEXER_TOKEN_FUNCTION_END),
regex("\\(", LEXER_TOKEN_EXPRESSION_START),
regex("\\)", LEXER_TOKEN_EXPRESSION_END)
};

lexer_tokens* tokens = malloc(sizeof(lexer_tokens));
if (!tokens)
@@ -77,65 +107,21 @@ lexer_tokens* lexer_analyze(char* str)

while (1)
{
if (regmatch(whitespace, str + offset, &pmatch))
{
if (APPEND(LEXER_TOKEN_WHITESPACE))
return NULL;
}
else if (regmatch(identifier, str + offset, &pmatch))
{
if (APPEND(LEXER_TOKEN_IDENTIFIER))
return NULL;
}
else if (regmatch(keyword, str + offset, &pmatch))
{
if (APPEND(LEXER_TOKEN_KEYWORD))
return NULL;
} else if (regmatch(operator, str + offset, &pmatch))
{
if (APPEND(LEXER_TOKEN_OPERATOR))
return NULL;
}
else if (regmatch(separator, str + offset, &pmatch))
{
if (APPEND(LEXER_TOKEN_SEPARATOR))
return NULL;
}
else if (regmatch(integer, str + offset, &pmatch))
{
if (APPEND(LEXER_TOKEN_INTEGER))
return NULL;
}
else if (regmatch(string, str + offset, &pmatch))
{
if (APPEND(LEXER_TOKEN_STRING))
return NULL;
}
else if (regmatch(function_start, str + offset, &pmatch))
{
if (APPEND(LEXER_TOKEN_FUNCTION_START))
return NULL;
}
else if (regmatch(function_end, str + offset, &pmatch))
size_t i;
for (i = 0; i < LEXER_TOKEN_NONE; ++i)
{
if (APPEND(LEXER_TOKEN_FUNCTION_END))
return NULL;
}
else if (regmatch(expression_start, str + offset, &pmatch))
{
if (APPEND(LEXER_TOKEN_EXPRESSION_START))
return NULL;
}
else if (regmatch(expression_end, str + offset, &pmatch))
{
if (APPEND(LEXER_TOKEN_EXPRESSION_END))
return NULL;
}
else
{
return tokens;
struct token_reg treg = tregs[i];
if (regmatch(treg.reg, str + offset, &pmatch))
{
APPEND(treg.tokenClass);
break;
}
}

offset += pmatch.rm_eo;
if (offset >= len)
break;
}

return tokens;
}

+ 3
- 1
src/lexer.h View File

@@ -15,7 +15,9 @@ typedef enum lexer_token_class
LEXER_TOKEN_FUNCTION_START,
LEXER_TOKEN_FUNCTION_END,
LEXER_TOKEN_EXPRESSION_START,
LEXER_TOKEN_EXPRESSION_END
LEXER_TOKEN_EXPRESSION_END,

LEXER_TOKEN_NONE
} lexer_token_class;

typedef struct lexer_token

+ 29
- 0
src/main.c View File

@@ -1,7 +1,36 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "vm.h"
#include "lexer.h"

/*
 * Human-readable names for token classes; main() indexes this table with a
 * token's tokenClass value.
 * NOTE(review): the order presumably mirrors enum lexer_token_class in
 * lexer.h -- confirm when adding or reordering enum members. There is no
 * entry for LEXER_TOKEN_NONE.
 */
static const char* strs[] =
{
"whitespace",
"identifier",
"keyword",
"operator",
"separator",
"integer",
"string",
"function_start",
"function_end",
"expression_start",
"expression_end"
};

int main(int argc, char** argv)
{
lexer_tokens* tokens = lexer_analyze("int foo = \"hi there\"");
size_t i;
for (i = 0; i < tokens->length; ++i)
{
lexer_token pair = tokens->pairs[i];
char* str = malloc(pair.len);
memcpy(str, pair.str, pair.len);
str[pair.len - 1] = '\0';

printf("class: %s, str: %s\n", strs[pair.tokenClass], str);
}
}

Loading…
Cancel
Save