You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

lex.c 5.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  #include "parse/lex.h"

  #include <stdlib.h>
  #include <string.h>
  3. const char *l2_token_kind_name(enum l2_token_kind kind) {
  4. switch (kind) {
  5. case L2_TOK_OPEN_PAREN:
  6. return "open-paren";
  7. case L2_TOK_CLOSE_PAREN:
  8. return "close-paren";
  9. case L2_TOK_OPEN_BRACE:
  10. return "open-brace";
  11. case L2_TOK_CLOSE_BRACE:
  12. return "close-brace";
  13. case L2_TOK_OPEN_BRACKET:
  14. return "open-bracket";
  15. case L2_TOK_CLOSE_BRACKET:
  16. return "close-bracket";
  17. case L2_TOK_COMMA:
  18. return "comma";
  19. case L2_TOK_PERIOD:
  20. return "period";
  21. case L2_TOK_COLON_EQ:
  22. return "period";
  23. case L2_TOK_EOF:
  24. return "end-of-file";
  25. case L2_TOK_NUMBER:
  26. return "number";
  27. case L2_TOK_STRING:
  28. return "string";
  29. case L2_TOK_IDENT:
  30. return "ident";
  31. case L2_TOK_ERROR:
  32. return "error";
  33. }
  34. }
  35. void l2_token_free(struct l2_token *tok) {
  36. if (tok->kind == L2_TOK_STRING) {
  37. free(tok->v.str);
  38. }
  39. }
  40. struct l2_token l2_token_move(struct l2_token *tok) {
  41. struct l2_token dup = *tok;
  42. if (tok->kind == L2_TOK_STRING) {
  43. tok->v.str = NULL;
  44. }
  45. return dup;
  46. }
  47. void l2_lexer_init(struct l2_lexer *lexer, struct l2_io_reader *r) {
  48. lexer->currtok.kind = L2_TOK_EOF,
  49. lexer->tokidx = 0;
  50. lexer->line = 1;
  51. lexer->ch = 1;
  52. l2_bufio_reader_init(&lexer->reader, r);
  53. }
  54. static int peek_ch(struct l2_lexer *lexer) {
  55. int ch = l2_bufio_peek(&lexer->reader, 1);
  56. return ch;
  57. }
  58. static int read_ch(struct l2_lexer *lexer) {
  59. int ch = l2_bufio_get(&lexer->reader);
  60. lexer->ch += 1;
  61. if (ch == '\n') {
  62. lexer->ch = 1;
  63. lexer->line += 1;
  64. }
  65. return ch;
  66. }
  /*
   * Report whether `ch` is one of the four characters the lexer skips between
   * tokens: space, carriage return, line feed or horizontal tab.
   * Returns 1 for those, 0 for everything else.
   */
  static int is_whitespace(int ch) {
    switch (ch) {
    case ' ':
    case '\r':
    case '\n':
    case '\t':
      return 1;
    default:
      return 0;
    }
  }
  70. static void skip_whitespace(struct l2_lexer *lexer) {
  71. while (is_whitespace(l2_bufio_peek(&lexer->reader, 1))) read_ch(lexer);
  72. }
  73. static void read_string(struct l2_lexer *lexer, struct l2_token *tok) {
  74. tok->kind = L2_TOK_STRING;
  75. tok->v.str = malloc(16);
  76. if (tok->v.str == NULL) {
  77. tok->kind = L2_TOK_ERROR;
  78. tok->v.str = "Allocaton failure";
  79. return;
  80. }
  81. size_t size = 16;
  82. size_t idx = 0;
  83. while (1) {
  84. int ch = read_ch(lexer);
  85. if (ch == '"') {
  86. return;
  87. } else if (ch == EOF) {
  88. tok->kind = L2_TOK_EOF;
  89. free(tok->v.str);
  90. tok->v.str = "Unexpected EOF";
  91. return;
  92. } else if (ch == '\\') {
  93. int ch2 = read_ch(lexer);
  94. switch (ch2) {
  95. case 'n':
  96. ch = '\n';
  97. break;
  98. case 'r':
  99. ch = '\r';
  100. break;
  101. case 't':
  102. ch = '\t';
  103. break;
  104. case EOF:
  105. tok->kind = L2_TOK_EOF;
  106. free(tok->v.str);
  107. tok->v.str = "Unexpected EOF";
  108. return;
  109. default:
  110. ch = ch2;
  111. break;
  112. }
  113. }
  114. tok->v.str[idx++] = (char)ch;
  115. if (idx >= size) {
  116. size *= 2;
  117. char *newbuf = realloc(tok->v.str, size);
  118. if (newbuf == NULL) {
  119. free(tok->v.str);
  120. tok->kind = L2_TOK_ERROR;
  121. tok->v.str = "Allocation failure";
  122. return;
  123. }
  124. tok->v.str = newbuf;
  125. }
  126. }
  127. }
  128. static void read_ident(struct l2_lexer *lexer, struct l2_token *tok) {
  129. tok->kind = L2_TOK_IDENT;
  130. tok->v.str = malloc(16);
  131. if (tok->v.str == NULL) {
  132. tok->kind = L2_TOK_ERROR;
  133. tok->v.str = "Allocaton failure";
  134. return;
  135. }
  136. size_t size = 16;
  137. size_t idx = 0;
  138. while (1) {
  139. int ch = peek_ch(lexer);
  140. if (is_whitespace(ch)) {
  141. return;
  142. }
  143. switch (ch) {
  144. case '(':
  145. case ')':
  146. case '{':
  147. case '}':
  148. case '[':
  149. case ']':
  150. case ',':
  151. case '.':
  152. case ':':
  153. case EOF:
  154. return;
  155. }
  156. tok->v.str[idx++] = (char)read_ch(lexer);
  157. if (idx >= size) {
  158. size *= 2;
  159. char *newbuf = realloc(tok->v.str, size);
  160. if (newbuf == NULL) {
  161. free(tok->v.str);
  162. tok->kind = L2_TOK_ERROR;
  163. tok->v.str = "Allocation failure";
  164. return;
  165. }
  166. tok->v.str = newbuf;
  167. }
  168. }
  169. }
  170. static void read_tok(struct l2_lexer *lexer, struct l2_token *tok) {
  171. skip_whitespace(lexer);
  172. tok->line = lexer->line;
  173. tok->ch = lexer->ch;
  174. int ch = read_ch(lexer);
  175. switch (ch) {
  176. case '(':
  177. tok->kind = L2_TOK_OPEN_PAREN;
  178. break;
  179. case ')':
  180. tok->kind = L2_TOK_CLOSE_PAREN;
  181. break;
  182. case '{':
  183. tok->kind = L2_TOK_OPEN_BRACE;
  184. break;
  185. case '}':
  186. tok->kind = L2_TOK_CLOSE_BRACE;
  187. break;
  188. case '[':
  189. tok->kind = L2_TOK_OPEN_BRACKET;
  190. break;
  191. case ']':
  192. tok->kind = L2_TOK_CLOSE_BRACKET;
  193. break;
  194. case ',':
  195. tok->kind = L2_TOK_COMMA;
  196. break;
  197. case '.':
  198. tok->kind = L2_TOK_PERIOD;
  199. break;
  200. case ':':
  201. {
  202. ch = read_ch(lexer);
  203. switch (ch) {
  204. case '=':
  205. tok->kind = L2_TOK_COLON_EQ;
  206. break;
  207. default:
  208. tok->kind = L2_TOK_ERROR;
  209. tok->v.str = "Unexpected character";
  210. break;
  211. }
  212. }
  213. break;
  214. case EOF:
  215. tok->kind = L2_TOK_EOF;
  216. break;
  217. case '"':
  218. read_string(lexer, tok);
  219. break;
  220. default:
  221. read_ident(lexer, tok);
  222. break;
  223. }
  224. }
  225. struct l2_token *l2_lexer_peek(struct l2_lexer *lexer, int count) {
  226. int offset = count - 1;
  227. while (offset >= lexer->tokidx) {
  228. read_tok(lexer, &lexer->toks[lexer->tokidx++]);
  229. }
  230. return &lexer->toks[offset];
  231. }
  232. struct l2_token *l2_lexer_get(struct l2_lexer *lexer) {
  233. l2_token_free(&lexer->currtok);
  234. if (lexer->tokidx == 0) {
  235. read_tok(lexer, &lexer->currtok);
  236. } else {
  237. memmove(lexer->toks, lexer->toks + 1, lexer->tokidx - 1);
  238. lexer->tokidx -= 1;
  239. }
  240. return &lexer->currtok;
  241. }