You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lexer.c 3.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. #include <stdlib.h>
  2. #include <stdio.h>
  3. #include <string.h>
  4. #include <regex.h>
  5. #include "lexer.h"
  6. struct token_reg
  7. {
  8. regex_t reg;
  9. lexer_token_class tokenClass;
  10. int matched;
  11. regmatch_t pmatch;
  12. };
  13. static struct token_reg regex(const char* str, lexer_token_class tokenClass)
  14. {
  15. regex_t reg;
  16. int err = regcomp(&reg, str, REG_EXTENDED);
  17. if (err)
  18. {
  19. fprintf(stderr, "Invalid regex: '%s'\n", str);
  20. fprintf(stderr, "Error code: %i\n", err);
  21. exit(1);
  22. }
  23. struct token_reg res =
  24. {
  25. reg,
  26. tokenClass,
  27. 0
  28. };
  29. return res;
  30. }
  /*
   * Run a compiled pattern against `str`.
   *
   * reg    - compiled pattern.
   * str    - NUL-terminated text to search.
   * pmatch - receives the offsets of the whole match on success.
   *
   * Returns 1 if the pattern matched, 0 otherwise. Any regexec() failure
   * (not only REG_NOMATCH) counts as "no match", because *pmatch is only
   * meaningful when regexec() returns 0.
   */
  static int regmatch(regex_t reg, char* str, regmatch_t* pmatch)
  {
      return regexec(&reg, str, 1, pmatch, 0) == 0;
  }
  35. static int tokens_append(
  36. lexer_tokens* tokens,
  37. lexer_token_class tokenClass,
  38. char* str,
  39. size_t len)
  40. {
  41. puts("hi");
  42. tokens->length += 1;
  43. if (tokens->length > tokens->allocd)
  44. {
  45. if (tokens->allocd)
  46. tokens->allocd *= 2;
  47. else
  48. tokens->allocd = 1;
  49. tokens->pairs = realloc(tokens->pairs, tokens->allocd);
  50. if (!tokens->pairs)
  51. return 1;
  52. puts("realloced");
  53. }
  54. puts("appending");
  55. lexer_token* pair = &(tokens->pairs[tokens->length - 1]);
  56. pair->tokenClass = tokenClass;
  57. pair->str = str;
  58. pair->len = len;
  59. printf("class: %i, pair length: %i, tokens length: %i, allocd: %i, str: %s\n", pair->tokenClass, (int)(pair->len), (int)(tokens->length), (int)(tokens->length), pair->str);
  60. puts("bye\n");
  61. return 0;
  62. }
  63. lexer_tokens* lexer_analyze(char* str)
  64. {
  65. printf("str: %s\n", str);
  66. size_t len = strlen(str);
  67. struct token_reg tregs[] =
  68. {
  69. regex("\\s+", LEXER_TOKEN_WHITESPACE),
  70. regex("[a-zA-Z][a-zA-Z0-9]*", LEXER_TOKEN_IDENTIFIER),
  71. regex("if|ret|func|str|arr|err|null", LEXER_TOKEN_KEYWORD),
  72. regex("=|==", LEXER_TOKEN_OPERATOR),
  73. regex("\\,", LEXER_TOKEN_SEPARATOR),
  74. regex("[0-9]+", LEXER_TOKEN_INTEGER),
  75. regex("\\\"[^\\\"]*\\\"", LEXER_TOKEN_STRING),
  76. regex("\\{", LEXER_TOKEN_FUNCTION_START),
  77. regex("\\}", LEXER_TOKEN_FUNCTION_END),
  78. regex("\\(", LEXER_TOKEN_EXPRESSION_START),
  79. regex("\\)", LEXER_TOKEN_EXPRESSION_END)
  80. };
  81. lexer_tokens* tokens = malloc(sizeof(lexer_tokens));
  82. if (!tokens)
  83. return NULL;
  84. tokens->pairs = NULL;
  85. tokens->length = 0;
  86. tokens->allocd = 0;
  87. regmatch_t pmatch;
  88. size_t offset = 0;
  89. #define APPEND(treg) tokens_append( \
  90. tokens, \
  91. treg.tokenClass, \
  92. str + offset + treg.pmatch.rm_so, \
  93. treg.pmatch.rm_eo - treg.pmatch.rm_so \
  94. )
  95. #define LENGTH(treg) (treg.pmatch.rm_eo - treg.pmatch.rm_so)
  96. while (1)
  97. {
  98. size_t i;
  99. //Reset .matched property for all tregs
  100. for (i = 0; i < LEXER_TOKEN_NONE; ++i)
  101. {
  102. tregs[i].matched = 0;
  103. }
  104. //See which tregs match
  105. for (i = 0; i < LEXER_TOKEN_NONE; ++i)
  106. {
  107. struct token_reg treg = tregs[i];
  108. if (regmatch(treg.reg, str + offset, &pmatch))
  109. {
  110. treg.matched = 1;
  111. treg.pmatch = pmatch;
  112. }
  113. }
  114. //Find longest match
  115. struct
  116. {
  117. size_t length;
  118. size_t index;
  119. } longest;
  120. longest.length = -1;
  121. longest.index = -1;
  122. for (i = 0; i < LEXER_TOKEN_NONE; ++i)
  123. {
  124. struct token_reg treg = tregs[i];
  125. if (!treg.matched)
  126. continue;
  127. if (LENGTH(treg) >= longest.length)
  128. {
  129. longest.length = LENGTH(treg);
  130. longest.index = i;
  131. }
  132. }
  133. //Append longest match
  134. APPEND(tregs[longest.index]);
  135. offset += pmatch.rm_eo;
  136. if (offset >= len)
  137. break;
  138. }
  139. #undef APPEND
  140. #undef LENGTH
  141. return tokens;
  142. }