You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

lex.c 5.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. #include "parse/lex.h"
  2. #include <stdlib.h>
  /*
   * Parse a string made up solely of ASCII decimal digits into a double.
   *
   * str: NUL-terminated candidate text.
   * num: receives the parsed value on success; left at 0 on failure.
   *
   * Returns 0 on success, or -1 if the string is empty or contains any
   * character other than '0'..'9'.
   */
  static int parse_number(const char *str, double *num) {
      *num = 0;

      /* An empty string is not a number. */
      if (*str == '\0') {
          return -1;
      }

      /* Accumulate most-significant digit first in floating point, so long
       * digit runs cannot overflow an integer place-value multiplier. */
      double value = 0;
      for (const char *p = str; *p != '\0'; ++p) {
          if (*p < '0' || *p > '9') {
              return -1;
          }
          value = value * 10 + (*p - '0');
      }

      *num = value;
      return 0;
  }
  18. const char *l2_token_kind_name(enum l2_token_kind kind) {
  19. switch (kind) {
  20. case L2_TOK_OPEN_PAREN:
  21. return "open-paren";
  22. case L2_TOK_CLOSE_PAREN:
  23. return "close-paren";
  24. case L2_TOK_OPEN_BRACE:
  25. return "open-brace";
  26. case L2_TOK_CLOSE_BRACE:
  27. return "close-brace";
  28. case L2_TOK_OPEN_BRACKET:
  29. return "open-bracket";
  30. case L2_TOK_CLOSE_BRACKET:
  31. return "close-bracket";
  32. case L2_TOK_COMMA:
  33. return "comma";
  34. case L2_TOK_PERIOD:
  35. return "period";
  36. case L2_TOK_COLON_EQ:
  37. return "period";
  38. case L2_TOK_EOF:
  39. return "end-of-file";
  40. case L2_TOK_NUMBER:
  41. return "number";
  42. case L2_TOK_STRING:
  43. return "string";
  44. case L2_TOK_IDENT:
  45. return "ident";
  46. case L2_TOK_ERROR:
  47. return "error";
  48. }
  49. }
  50. void l2_token_free(struct l2_token *tok) {
  51. if (tok->kind == L2_TOK_STRING) {
  52. free(tok->v.str);
  53. }
  54. }
  55. char *l2_token_extract_str(struct l2_token *tok) {
  56. char *str = tok->v.str;
  57. tok->v.str = NULL;
  58. return str;
  59. }
  60. void l2_lexer_init(struct l2_lexer *lexer, struct l2_io_reader *r) {
  61. lexer->currtok.kind = L2_TOK_EOF,
  62. lexer->tokidx = 0;
  63. lexer->line = 1;
  64. lexer->ch = 1;
  65. l2_bufio_reader_init(&lexer->reader, r);
  66. }
  67. static int peek_ch(struct l2_lexer *lexer) {
  68. int ch = l2_bufio_peek(&lexer->reader, 1);
  69. return ch;
  70. }
  71. static int read_ch(struct l2_lexer *lexer) {
  72. int ch = l2_bufio_get(&lexer->reader);
  73. lexer->ch += 1;
  74. if (ch == '\n') {
  75. lexer->ch = 1;
  76. lexer->line += 1;
  77. }
  78. return ch;
  79. }
  /*
   * Return 1 if ch is a space, carriage return, newline or tab; 0 otherwise.
   */
  static int is_whitespace(int ch) {
      switch (ch) {
      case ' ':
      case '\r':
      case '\n':
      case '\t':
          return 1;
      default:
          return 0;
      }
  }
  83. static void skip_whitespace(struct l2_lexer *lexer) {
  84. while (is_whitespace(l2_bufio_peek(&lexer->reader, 1))) read_ch(lexer);
  85. }
  86. static void read_string(struct l2_lexer *lexer, struct l2_token *tok) {
  87. tok->kind = L2_TOK_STRING;
  88. tok->v.str = malloc(16);
  89. if (tok->v.str == NULL) {
  90. tok->kind = L2_TOK_ERROR;
  91. tok->v.str = "Allocaton failure";
  92. return;
  93. }
  94. size_t size = 16;
  95. size_t idx = 0;
  96. while (1) {
  97. int ch = read_ch(lexer);
  98. if (ch == '"') {
  99. return;
  100. } else if (ch == EOF) {
  101. tok->kind = L2_TOK_EOF;
  102. free(tok->v.str);
  103. tok->v.str = "Unexpected EOF";
  104. return;
  105. } else if (ch == '\\') {
  106. int ch2 = read_ch(lexer);
  107. switch (ch2) {
  108. case 'n':
  109. ch = '\n';
  110. break;
  111. case 'r':
  112. ch = '\r';
  113. break;
  114. case 't':
  115. ch = '\t';
  116. break;
  117. case EOF:
  118. tok->kind = L2_TOK_EOF;
  119. free(tok->v.str);
  120. tok->v.str = "Unexpected EOF";
  121. return;
  122. default:
  123. ch = ch2;
  124. break;
  125. }
  126. }
  127. tok->v.str[idx++] = (char)ch;
  128. if (idx >= size) {
  129. size *= 2;
  130. char *newbuf = realloc(tok->v.str, size);
  131. if (newbuf == NULL) {
  132. free(tok->v.str);
  133. tok->kind = L2_TOK_ERROR;
  134. tok->v.str = "Allocation failure";
  135. return;
  136. }
  137. tok->v.str = newbuf;
  138. }
  139. }
  140. }
  141. static void read_ident(struct l2_lexer *lexer, struct l2_token *tok) {
  142. tok->kind = L2_TOK_IDENT;
  143. tok->v.str = malloc(16);
  144. if (tok->v.str == NULL) {
  145. tok->kind = L2_TOK_ERROR;
  146. tok->v.str = "Allocaton failure";
  147. return;
  148. }
  149. size_t size = 16;
  150. size_t idx = 0;
  151. while (1) {
  152. int ch = peek_ch(lexer);
  153. if (is_whitespace(ch)) {
  154. tok->v.str[idx] = '\0';
  155. return;
  156. }
  157. switch (ch) {
  158. case '(':
  159. case ')':
  160. case '{':
  161. case '}':
  162. case '[':
  163. case ']':
  164. case ',':
  165. case '.':
  166. case ':':
  167. case EOF:
  168. tok->v.str[idx] = '\0';
  169. return;
  170. }
  171. tok->v.str[idx++] = (char)read_ch(lexer);
  172. if (idx + 1 >= size) {
  173. size *= 2;
  174. char *newbuf = realloc(tok->v.str, size);
  175. if (newbuf == NULL) {
  176. free(tok->v.str);
  177. tok->kind = L2_TOK_ERROR;
  178. tok->v.str = "Allocation failure";
  179. return;
  180. }
  181. tok->v.str = newbuf;
  182. }
  183. }
  184. }
  185. static void read_tok(struct l2_lexer *lexer, struct l2_token *tok) {
  186. skip_whitespace(lexer);
  187. tok->line = lexer->line;
  188. tok->ch = lexer->ch;
  189. int ch = peek_ch(lexer);
  190. switch (ch) {
  191. case '(':
  192. read_ch(lexer);
  193. tok->kind = L2_TOK_OPEN_PAREN;
  194. break;
  195. case ')':
  196. read_ch(lexer);
  197. tok->kind = L2_TOK_CLOSE_PAREN;
  198. break;
  199. case '{':
  200. read_ch(lexer);
  201. tok->kind = L2_TOK_OPEN_BRACE;
  202. break;
  203. case '}':
  204. read_ch(lexer);
  205. tok->kind = L2_TOK_CLOSE_BRACE;
  206. break;
  207. case '[':
  208. read_ch(lexer);
  209. tok->kind = L2_TOK_OPEN_BRACKET;
  210. break;
  211. case ']':
  212. read_ch(lexer);
  213. tok->kind = L2_TOK_CLOSE_BRACKET;
  214. break;
  215. case ',':
  216. read_ch(lexer);
  217. tok->kind = L2_TOK_COMMA;
  218. break;
  219. case '.':
  220. read_ch(lexer);
  221. tok->kind = L2_TOK_PERIOD;
  222. break;
  223. case ':':
  224. read_ch(lexer);
  225. {
  226. ch = read_ch(lexer);
  227. switch (ch) {
  228. case '=':
  229. tok->kind = L2_TOK_COLON_EQ;
  230. break;
  231. default:
  232. tok->kind = L2_TOK_ERROR;
  233. tok->v.str = "Unexpected character";
  234. break;
  235. }
  236. }
  237. break;
  238. case EOF:
  239. tok->kind = L2_TOK_EOF;
  240. break;
  241. case '"':
  242. read_ch(lexer);
  243. read_string(lexer, tok);
  244. break;
  245. default:
  246. read_ident(lexer, tok);
  247. if (tok->kind != L2_TOK_IDENT) {
  248. break;
  249. }
  250. double num;
  251. if (parse_number(tok->v.str, &num) >= 0) {
  252. free(tok->v.str);
  253. tok->kind = L2_TOK_NUMBER;
  254. tok->v.num = num;
  255. }
  256. break;
  257. }
  258. }
  259. struct l2_token *l2_lexer_peek(struct l2_lexer *lexer, int count) {
  260. int offset = count - 1;
  261. while (offset >= lexer->tokidx) {
  262. read_tok(lexer, &lexer->toks[lexer->tokidx++]);
  263. }
  264. return &lexer->toks[offset];
  265. }
  266. struct l2_token *l2_lexer_get(struct l2_lexer *lexer) {
  267. l2_token_free(&lexer->currtok);
  268. if (lexer->tokidx == 0) {
  269. read_tok(lexer, &lexer->currtok);
  270. } else {
  271. memmove(lexer->toks, lexer->toks + 1, lexer->tokidx - 1);
  272. lexer->tokidx -= 1;
  273. }
  274. return &lexer->currtok;
  275. }