
lexer.c

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <regex.h>

#include "lexer.h"

/* A compiled regex paired with the token class it recognizes, plus the
 * result of the most recent match attempt. */
struct token_reg
{
    regex_t reg;
    lexer_token_class tokenClass;
    int matched;
    regmatch_t pmatch;
};
/* Compile a pattern into a token_reg; abort on an invalid pattern. */
static struct token_reg regex(const char* str, lexer_token_class tokenClass)
{
    regex_t reg;
    int err = regcomp(&reg, str, REG_EXTENDED);
    if (err)
    {
        fprintf(stderr, "Invalid regex: '%s'\n", str);
        fprintf(stderr, "Error code: %i\n", err);
        exit(1);
    }
    struct token_reg res =
    {
        reg,
        tokenClass,
        0
    };
    return res;
}
/* Return nonzero if the regex matches somewhere in str; the match bounds
 * are written to pmatch. */
static int regmatch(regex_t reg, char* str, regmatch_t* pmatch)
{
    return (regexec(&reg, str, 1, pmatch, 0) != REG_NOMATCH);
}
static void tokens_append(
    lexer_tokens* tokens,
    lexer_token_class tokenClass,
    char* str,
    size_t len)
{
    puts("hi"); /* debug trace */
    tokens->length += 1;
    /* Grow the pairs array geometrically when it runs out of space. */
    if (tokens->length > tokens->allocd)
    {
        printf("old allocd: %i\n", (int)(tokens->allocd));
        if (tokens->allocd)
            tokens->allocd *= 2;
        else
            tokens->allocd = 1;
        printf("new allocd: %i\n", (int)(tokens->allocd));
        /* Allocate in units of struct lexer_token, not bytes. */
        tokens->pairs = realloc(tokens->pairs,
            tokens->allocd * sizeof(*tokens->pairs));
        if (!tokens->pairs)
        {
            fprintf(stderr, "Allocation error.\n");
            exit(1);
        }
        puts("realloced");
    }
    struct lexer_token pair =
    {
        tokenClass,
        str,
        len
    };
    printf("appending to %i\n", (int)(tokens->length - 1));
    tokens->pairs[tokens->length - 1] = pair;
    puts("appended");
    puts("bye\n");
}
lexer_tokens* lexer_analyze(char* str)
{
    printf("str: %s\n", str);
    size_t len = strlen(str);
    struct token_reg tregs[] =
    {
        regex(LEXER_REGEX_WHITESPACE, LEXER_TOKEN_WHITESPACE),
        regex(LEXER_REGEX_IDENTIFIER, LEXER_TOKEN_IDENTIFIER),
        regex(LEXER_REGEX_KEYWORD, LEXER_TOKEN_KEYWORD),
        regex(LEXER_REGEX_OPERATOR, LEXER_TOKEN_OPERATOR),
        regex(LEXER_REGEX_INTEGER, LEXER_TOKEN_INTEGER),
        regex(LEXER_REGEX_STRING, LEXER_TOKEN_STRING),
        regex(LEXER_REGEX_COMMA, LEXER_TOKEN_COMMA),
        regex(LEXER_REGEX_OPENBRACE, LEXER_TOKEN_OPENBRACE),
        regex(LEXER_REGEX_CLOSEBRACE, LEXER_TOKEN_CLOSEBRACE),
        regex(LEXER_REGEX_OPENPAREN, LEXER_TOKEN_OPENPAREN),
        regex(LEXER_REGEX_CLOSEPAREN, LEXER_TOKEN_CLOSEPAREN)
    };
    lexer_tokens* tokens = malloc(sizeof(lexer_tokens));
    if (!tokens)
        return NULL;
    tokens->pairs = NULL;
    tokens->length = 0;
    tokens->allocd = 0;

    regmatch_t pmatch;
    size_t offset = 0;

#define APPEND(treg) tokens_append( \
    tokens, \
    treg.tokenClass, \
    str + offset + treg.pmatch.rm_so, \
    treg.pmatch.rm_eo - treg.pmatch.rm_so \
)
#define LENGTH(treg) (treg.pmatch.rm_eo - treg.pmatch.rm_so)

    while (1)
    {
        size_t i;
        //Reset .matched property for all tregs
        for (i = 0; i < LEXER_TOKEN_NONE; ++i)
        {
            tregs[i].matched = 0;
        }
        //See which tregs match; store the result back into the array,
        //not into a local copy, or the match is lost
        for (i = 0; i < LEXER_TOKEN_NONE; ++i)
        {
            if (regmatch(tregs[i].reg, str + offset, &pmatch))
            {
                tregs[i].matched = 1;
                tregs[i].pmatch = pmatch;
            }
        }
        //Find longest match
        struct
        {
            size_t length;
            size_t index;
        } longest;
        longest.length = 0;
        longest.index = LEXER_TOKEN_NONE;
        for (i = 0; i < LEXER_TOKEN_NONE; ++i)
        {
            struct token_reg treg = tregs[i];
            if (!treg.matched)
                continue;
            if ((size_t)LENGTH(treg) >= longest.length)
            {
                longest.length = LENGTH(treg);
                longest.index = i;
            }
        }
        //Nothing matched: stop rather than index out of bounds
        if (longest.index == LEXER_TOKEN_NONE)
            break;
        //Append longest match and advance past it
        APPEND(tregs[longest.index]);
        offset += tregs[longest.index].pmatch.rm_eo;
        if (offset >= len)
            break;
    }
#undef APPEND
#undef LENGTH
    return tokens;
}
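
lexer.h is not part of this file, so the exact layout of lexer_tokens is not shown here. Based purely on how lexer.c uses it above (a pairs array, a length counter, and per-token tokenClass/str/len fields), the following is a minimal sketch of how a caller might walk the token list. All field names and the sample input are assumptions, not confirmed API; note that each token's str points into the original input and is not NUL-terminated, so it must be printed with an explicit length.

/* Hypothetical usage sketch -- field names (pairs, length, tokenClass,
 * str, len) are inferred from lexer.c above, not from lexer.h. */
#include <stdio.h>
#include <stdlib.h>
#include "lexer.h"

int main(void)
{
    char input[] = "foo(bar, 42)"; /* assumed sample input */
    lexer_tokens* tokens = lexer_analyze(input);
    if (!tokens)
        return 1;
    size_t i;
    for (i = 0; i < tokens->length; ++i)
    {
        struct lexer_token t = tokens->pairs[i];
        /* t.str is a pointer into input, not NUL-terminated:
         * print exactly t.len characters. */
        printf("class %d: '%.*s'\n", (int)t.tokenClass, (int)t.len, t.str);
    }
    free(tokens->pairs);
    free(tokens);
    return 0;
}

Because lexer_analyze keeps the longest match at each offset (ties going to the regex listed later in tregs), patterns such as keywords can win over the more general identifier pattern only when they match at least as many characters.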