123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112 |
- #include <stdio.h>
- #include <stdlib.h>
-
// One 8-bit unit of UTF-8 encoded data, as read from or written to a stream.
typedef unsigned char byte;
// A decoded character value (code point).
typedef unsigned long unicode;

// Smallest value of the first (lead) byte of a sequence that marks the
// sequence as being n bytes long.
#define C4bytes 0xF0
#define C3bytes 0xE0
#define C2bytes 0xC0

// Smallest code point whose UTF-8 encoding needs n bytes.
#define U4bytes 0x10000
#define U3bytes 0x0800
#define U2bytes 0x0080
-
- void writebyte(FILE *f, byte b)
- {
- fwrite(&b, 1, 1, f);
- }
-
- void writeutf8char(FILE *f, unicode u)
- {
- if (u < U2bytes)
- {
- writebyte(f, (byte)u);
- return;
- }
-
- int left;
- int a;
- int b;
- if (u >= U4bytes)
- {
- left = 18;
- a = 0b11110000;
- b = 0b00000111;
- }
- else if (u >= U3bytes)
- {
- left = 12;
- a = 0b11100000;
- b = 0b00001111;
- }
- else
- {
- left = 6;
- a = 0b11000000;
- b = 0b00011111;
- }
-
- while (1)
- {
- writebyte(f, a | ((u >> left) & b));
- if (left == 0)
- return;
- a = 0b10000000;
- b = 0b00111111;
- left -= 6;
- }
- }
-
// Read one byte from the stream `f`.
//
// Returns the byte as a value in 0..255, or -1 when no byte could be read
// (end of file or a read error; feof()/ferror() on `f` tell them apart).
int readbyte(FILE *f)
{
    int c = fgetc(f);

    // EOF is only guaranteed to be negative, so map it explicitly to the -1
    // that callers compare against.
    if (c == EOF)
    {
        return -1;
    }
    return c;
}
-
- long readutf8char(FILE *f)
- {
- unicode u = 0;
- unicode ch = readbyte(f);
-
- if (ch == -1)
- return -1;
-
- if (ch < C2bytes)
- return ch;
-
- int left;
- int mask;
- if (ch >= C4bytes)
- {
- left = 18;
- mask = 0b00000111;
- }
- else if (ch >= C3bytes)
- {
- left = 12;
- mask = 0b00001111;
- }
- else
- {
- left = 6;
- mask = 0b00011111;
- }
-
- while (1)
- {
- u |= (ch & mask) << left;
-
- if (left == 0)
- return u;
-
- left -= 6;
- mask = 0b00111111;
- ch = readbyte(f);
- }
- }
|