#include <stdio.h>
#include <stdlib.h> /* NOTE(review): second header name was unreadable in the original; confirm */

typedef unsigned char byte;
typedef unsigned long unicode;

// The minimum value of the first byte of a sequence to indicate n bytes
#define C4bytes 0xF0
#define C3bytes 0xE0
#define C2bytes 0xC0

// The minimum code point value for its UTF-8 encoding to be n bytes
#define U4bytes 0x10000
#define U3bytes 0x0800
#define U2bytes 0x0080

/*
 * Write a single byte to f.
 * The result of fwrite is not reported to the caller (the function is void).
 */
void writebyte(FILE *f, byte b)
{
    fwrite(&b, 1, 1, f);
}

/*
 * Encode the code point u as UTF-8 and write it to f (1 to 4 bytes).
 *
 * Values below U2bytes are written as one byte. Values of 0x200000 and above
 * do not fit in four bytes; their high bits are masked off by the lead-byte
 * payload mask, exactly as the payload bits of every other byte are.
 */
void writeutf8char(FILE *f, unicode u)
{
    if (u < U2bytes) {
        writebyte(f, (byte)u);
        return;
    }

    int left;       /* shift of the payload bits carried by the next byte */
    unsigned lead;  /* marker bits OR-ed into the next byte */
    unsigned mask;  /* payload bits the next byte can carry */

    if (u >= U4bytes) {
        left = 18;
        lead = C4bytes;
        mask = 0x07;
    } else if (u >= U3bytes) {
        left = 12;
        lead = C3bytes;
        mask = 0x0F;
    } else {
        left = 6;
        lead = C2bytes;
        mask = 0x1F;
    }

    for (;;) {
        writebyte(f, (byte)(lead | ((u >> left) & mask)));
        if (left == 0) {
            return;
        }
        /* Every byte after the first is a continuation byte: 10xxxxxx. */
        lead = 0x80;
        mask = 0x3F;
        left -= 6;
    }
}

/*
 * Read a single byte from f.
 * Returns the byte value (0..255), or -1 if no byte could be read.
 */
int readbyte(FILE *f)
{
    byte c;

    if (fread(&c, 1, 1, f) != 1) {
        return -1;
    }
    return (int)c;
}

/*
 * Read one UTF-8 encoded code point from f.
 *
 * Returns the decoded value, or -1 if the stream ends before the first byte
 * or in the middle of a multi-byte sequence.
 *
 * A first byte below C2bytes is returned unchanged; this includes stray
 * continuation bytes (0x80..0xBF). Continuation bytes are not checked for the
 * 10xxxxxx form; only their low six bits are used.
 */
long readutf8char(FILE *f)
{
    int ch = readbyte(f);
    if (ch < 0) {
        return -1;
    }
    if (ch < C2bytes) {
        return ch;
    }

    int left;       /* shift for the payload bits of the current byte */
    unsigned mask;  /* payload bits carried by the current byte */

    if (ch >= C4bytes) {
        left = 18;
        mask = 0x07;
    } else if (ch >= C3bytes) {
        left = 12;
        mask = 0x0F;
    } else {
        left = 6;
        mask = 0x1F;
    }

    unicode u = 0;
    for (;;) {
        u |= ((unicode)ch & mask) << left;
        if (left == 0) {
            return (long)u;
        }
        left -= 6;
        mask = 0x3F;
        ch = readbyte(f);
        if (ch < 0) {
            /* Truncated sequence: do not fold the EOF marker into the result. */
            return -1;
        }
    }
}