diff b8ae7708fb3ef3acbb30ccf3181897f8157c18de uncommitted --- /dev/null +++ b//lib/ucd/mkfile @@ -1,0 +1,70 @@ + $target >[2]/dev/null || hget $URL^'auxiliary/'^$target > $target +%.pdf: + hget $URL^$target > $target + +txt:V: $TXT + +pdf:V: $PDF + +test:V: $TEST + +all:V: $TXT $PDF $TEST --- a//sys/include/libc.h +++ b//sys/include/libc.h @@ -77,6 +77,14 @@ extern long runestrlen(Rune*); extern Rune* runestrstr(Rune*, Rune*); +extern int runenorm(Rune*, Rune*, int, int); +extern int utfnorm(char*,char*,int,int); +extern char* fullutfnorm(char*,int); +extern Rune* fullrunenorm(Rune*,int); + +extern Rune* runewbreak(Rune*); +extern Rune* runegbreak(Rune*); + extern Rune tolowerrune(Rune); extern Rune totitlerune(Rune); extern Rune toupperrune(Rune); @@ -404,7 +412,7 @@ extern int enc16chr(int); extern int encodefmt(Fmt*); -extern void exits(char*); +extern _Noreturn void exits(char*); extern double frexp(double, int*); extern uintptr getcallerpc(void*); extern char* getenv(char*); @@ -431,7 +439,7 @@ extern ulong strtoul(char*, char**, int); extern vlong strtoll(char*, char**, int); extern uvlong strtoull(char*, char**, int); -extern void sysfatal(char*, ...); +extern _Noreturn void sysfatal(char*, ...); #pragma varargck argpos sysfatal 1 extern void syslog(int, char*, char*, ...); #pragma varargck argpos syslog 3 @@ -677,7 +685,7 @@ ulong len; } IOchunk; -extern void _exits(char*); +extern _Noreturn void _exits(char*); extern void abort(void); extern int access(char*, int); --- a//sys/src/cmd/tcs/hdr.h +++ b//sys/src/cmd/tcs/hdr.h @@ -23,6 +23,8 @@ void utf_in(int, long *, struct convert *); void utf_out(Rune *, int, long *); +void utfnfc_out(Rune *, int, long *); +void utfnfd_out(Rune *, int, long *); void isoutf_in(int, long *, struct convert *); void isoutf_out(Rune *, int, long *); --- a//sys/src/cmd/tcs/tcs.c +++ b//sys/src/cmd/tcs/tcs.c @@ -613,6 +613,10 @@ { "utf-16be", "alias for unicode-be (MIME)", Func, 0, (Fnptr)unicode_out_be }, { "utf-16le", "alias for 
unicode-le (MIME)", From|Func, 0, (Fnptr)unicode_in_le }, { "utf-16le", "alias for unicode-le (MIME)", Func, 0, (Fnptr)unicode_out_le }, + { "nfc", "UTF Normalization Form C", From|Func, 0, (Fnptr)utf_in }, + { "nfc", "UTF Normalization Form C", Func, 0, (Fnptr)utfnfc_out }, + { "nfd", "UTF Normalization Form D", From|Func, 0, (Fnptr)utf_in }, + { "nfd", "UTF Normalization Form D", Func, 0, (Fnptr)utfnfd_out }, { "viet1", "Vietnamese VSCII-1 (1993)", Table, (void *)tabviet1 }, { "viet2", "Vietnamese VSCII-2 (1993)", Table, (void *)tabviet2 }, { "vscii", "Vietnamese VISCII 1.1 (1992)", Table, (void *)tabviscii }, --- a//sys/src/cmd/tcs/utf.c +++ b//sys/src/cmd/tcs/utf.c @@ -19,38 +19,27 @@ void utf_in(int fd, long *, struct convert *out) { - char buf[N]; - int i, j, c, n, tot; - unsigned long l; + char buf[N + 1]; + Rune r; + char *p; + int n, tot, j; tot = 0; + j = 0; while((n = read(fd, buf+tot, N-tot)) >= 0){ tot += n; - for(i=j=0; i<=tot-UTFmax || (i obuf) write(1, obuf, p-obuf); +} + +void +utfnfc_out(Rune *base, int n, long *) +{ + Rune buf[N + 1]; + int w; + + w = runenorm(buf, base, n + 1, 1); + utf_out(buf, w, nil); +} + +void +utfnfd_out(Rune *base, int n, long *) +{ + Rune buf[N + 1]; + int w; + + w = runenorm(buf, base, n + 1, 0); + utf_out(buf, w, nil); } void --- a//sys/src/libc/port/mkfile +++ b//sys/src/libc/port/mkfile @@ -62,6 +62,9 @@ rand.c\ readn.c\ rune.c\ + runebreak.c\ + runeistype.c\ + runenorm.c\ runestrcat.c\ runestrchr.c\ runestrcmp.c\ @@ -74,7 +77,7 @@ runestrrchr.c\ runestrlen.c\ runestrstr.c\ - runetype.c\ + runetotype.c\ sin.c\ sinh.c\ sqrt.c\ @@ -127,3 +130,16 @@ +#include +#include + +enum{ + NRUNES = 1<<21 +}; + +typedef struct Param Param; +typedef struct Lvl Lvl; +struct Lvl{ + int bits; + int max; + int mask; +}; +struct Param{ + Lvl idx1; + Lvl idx2; + Lvl data; + + int round1max; +}; + +static void +derive(Lvl *l) +{ + l->max = 1 << l->bits; + l->mask = l->max - 1; +} + +static void +param(Param *p, int idx1, int idx2) +{ + + 
assert(idx1 + idx2 < 21); + p->idx1.bits = idx1; + p->idx2.bits = idx2; + p->data.bits = 21 - idx1 - idx2; + derive(&p->idx1); + derive(&p->idx2); + derive(&p->data); + + p->round1max = NRUNES/p->data.max; +} + +static int +lkup(Param *p, int *idx1, int *idx2, int *data, int x) +{ + int y, z; + + y = (((x)>>(p->data.bits+p->idx2.bits))&p->idx1.mask); + z = (((x)>>p->data.bits)&p->idx2.mask); + return data[idx2[idx1[y] + z] + (x&p->data.mask)]; +} + +static int +mkarrvar(int fd, char *name, int *d, int len) +{ + int i, sz; + int max, min; + char *t; + + max = min = 0; + for(i = 0; i < len; i++){ + if(d[i] > max) + max = d[i]; + if(d[i] < min) + min = d[i]; + } + if(min == 0){ + if(max < (uchar)~0) + t = "uchar", sz = 1; + else if(max < 0xFFFF) + t = "ushort", sz = 2; + else + t = "uint", sz = 4; + } else { + if(max < 1<<7) + t = "char", sz = 1; + else if(max < 1<<15) + t = "short", sz = 2; + else + t = "int", sz = 4; + } + if(fd < 0) + return sz * len; + + fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len); + for(i = 0; i < len; i++){ + fprint(fd, "%d,", d[i]); + if((i+1) % 16 == 0) + fprint(fd, "\n\t"); + } + fprint(fd, "\n};\n"); + + return sz * len; +} + +static int +mkexceptarr(int fd, char *name, int *d, int n, int all) +{ + int i; + fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 
3 : 2); + for(i = 0; i < n*3; i += 3){ + if(all && d[i] != 0) + fprint(fd, "{0x%X, 0x%X, 0x%X},", d[i], d[i+1], d[i+2]); + else if(!all) + fprint(fd, "{0x%X, 0x%X},", d[i+1], d[i+2]); + if((i+3) % (8*3) == 0) + fprint(fd, "\n\t"); + } + fprint(fd, "\n};\n"); + return n * sizeof(Rune) * 2; +} + +static int +compact(int *data, int *idx, int nidx, int *src, int chunksize) +{ + int i, n, ndata, best; + int *dot, *lp, *rp; + + dot = src; + ndata = 0; + idx[0] = 0; + for(i = 1; i <= nidx; i++){ + rp = dot + chunksize; + lp = rp - 1; + + for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){ + if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0) + best = n+1; + } + memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]); + ndata += (chunksize - best); + idx[i] = idx[i - 1] + (chunksize - best); + dot = rp; + } + return ndata; +} + + +static int +mklkup(int fd, char *label, int *map, Param *p) +{ + static int data[NRUNES]; + static int idx2[NRUNES]; + static int idx2dest[NRUNES]; + static int idx1[NRUNES]; + int i, nidx2, ndata; + int size; + + ndata = compact(data, idx2, p->round1max, map, p->data.max); + nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max); + + if(fd >= 0){ + for(i = 0; i < NRUNES; i++) + if(map[i] != lkup(p, idx1, idx2dest, data, i)) + sysfatal("mismatch in %s at %d %d %d\n", label, i, map[i], lkup(p, idx1, idx2dest, data, i)); + } + + size = mkarrvar(fd, smprint("_%sdata", label), data, ndata); + size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2); + size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max); + if(fd >= 0){ + fprint(fd, "\n"); + fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask); + fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask); + fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask); + fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] 
)\n\n", + label, label, label, label, label, label, label); + } + return size; +} + +static void +mklkupmatrix(char *label, int *map, Param *p) +{ + int bestsize, size, bestx, besty; + int x, y; + + bestsize = bestx = besty = -1; + for(x = 4; x <= 12; x++) + for(y=4; y <= (19 - x); y++){ + param(p, x, y); + size = mklkup(-1, label, map, p); + if(bestsize == -1 || size < bestsize){ + bestx = x; + besty = y; + bestsize = size; + } + } + + assert(bestsize != -1); + fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize); + param(p, bestx, besty); +} + +static int myismerged[NRUNES]; +static int mytoupper[NRUNES]; +static int mytolower[NRUNES]; +static int mytotitle[NRUNES]; +static int mybreak[NRUNES]; + +enum{ DSTART = 0xEEEE }; +static int mydecomp[NRUNES]; +static int mydespecial[256*3]; +static int nspecial; +static int myccc[NRUNES]; + +typedef struct KV KV; +struct KV{ + uint key; + uint val; + ushort next; +}; + +static KV myrecomp[2000]; +static int nrecomp; + +static int recompext[256*3]; +static int nrecompext; + +static uint +hash(uint x) +{ + x ^= x >> 16; + x *= 0x21f0aaad; + x ^= x >> 15; + x *= 0xd35a2d97; + x ^= x >> 15; + return x; +} + +static void +mkrecomp(int fd) +{ + int i; + KV *p; + static KV vals[512]; + static KV coll[1000]; + int over; + int maxchain; + + for(i = 0; i < nelem(vals); i++) + vals[i] = (KV){0, 0, 0}; + for(i = 0; i < nelem(coll); i++) + coll[i] = (KV){0, 0, 0}; + over = 1; + for(i = 0; i < nrecomp; i++){ + p = vals + (hash(myrecomp[i].key) % nelem(vals)); + maxchain = 0; + while(p->key != 0){ + maxchain++; + if(p->next == 0){ + p->next = over; + p = coll + over - 1; + over++; + } else + p = coll + p->next - 1; + } + p->key = myrecomp[i].key; + p->val = myrecomp[i].val; + } + fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2)); + fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t"); + for(p = vals, i = 0;; i++){ + assert(p->val < 0xFFFF); + assert(p->next < 0xFFFF); + 
fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16)); + if((i+1) % 8 == 0) + fprint(fd, "\n\t"); + + if(p == vals+nelem(vals)-1) + p = coll; + else if(p == coll + over - 2) + break; + else + p++; + } + fprint(fd, "\n};\n"); + fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals)); + /* + fprint(fd, + " x ^= x >> 16;\n" + " x *= 0x21f0aaad;\n" + " x ^= x >> 15;\n" + " x *= 0xd35a2d97;\n" + " x ^= x >> 15;\n" + " p = _recompdata + (x%%%d)*2;\n" + "}\n", nelem(vals)); + */ +} + +static void +mktables(void) +{ + Param p; + int tofd, isfd, normfd, breakfd; + int size; + + tofd = create("runetotypedata", OWRITE, 0664); + if(tofd < 0) + sysfatal("could not create runetotypedata: %r"); + param(&p, 10, 7); + size = mklkup(tofd, "upper", mytoupper, &p); + fprint(2, "%s: %d\n", "upper", size); + + size = mklkup(tofd, "lower", mytolower, &p); + fprint(2, "%s: %d\n", "lower", size); + + size = mklkup(tofd, "title", mytotitle, &p); + fprint(2, "%s: %d\n", "title", size); + close(tofd); + + isfd = create("runeistypedata", OWRITE, 0664); + if(isfd < 0) + sysfatal("could not create runeistypedata: %r"); + param(&p, 11, 6); + size = mklkup(isfd, "merged", myismerged, &p); + fprint(2, "%s: %d\n", "merged", size); + fprint(isfd, "static\nenum {\n"); + fprint(isfd, "\tL%s = %s,\n", "space", "1<<0"); + fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1"); + fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2"); + fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3"); + fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4"); + fprint(isfd, "\tL%s = %s,\n", "title", "1<<5"); + fprint(isfd, "};\n"); + close(isfd); + + normfd = create("runenormdata", OWRITE, 0664); + if(normfd < 0) + sysfatal("could not create runenormdata: %r"); + param(&p, 10, 7); + size = mklkup(normfd, "decomp", mydecomp, &p); + fprint(2, "%s: %d\n", "decomp", size); + + param(&p, 9, 7); + size = mklkup(normfd, "ccc", myccc, &p); + fprint(2, "%s: %d\n", "ccc", size); + + mkexceptarr(normfd, "_decompexceptions", mydespecial, 
nspecial, 0); + mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1); + mkrecomp(normfd); + close(normfd); + + param(&p, 10, 6); + breakfd = create("runebreakdata", OWRITE, 0644); + if(breakfd < 0) + sysfatal("could not create runebreakdata: %r"); + size = mklkup(breakfd, "break", mybreak, &p); + fprint(2, "%s: %d\n", "break", size); +} + +enum { + FIELD_CODE, + FIELD_NAME, + FIELD_CATEGORY, + FIELD_COMBINING, + FIELD_BIDIR, + FIELD_DECOMP, + FIELD_DECIMAL_DIG, + FIELD_DIG, + FIELD_NUMERIC_VAL, + FIELD_MIRRORED, + FIELD_UNICODE_1_NAME, + FIELD_COMMENT, + FIELD_UPPER, + FIELD_LOWER, + FIELD_TITLE, + NFIELDS, +}; + +static int +getunicodeline(Biobuf *in, char **fields) +{ + char *p; + + if((p = Brdline(in, '\n')) == nil) + return 0; + + p[Blinelen(in)-1] = '\0'; + + if (getfields(p, fields, NFIELDS + 1, 0, ";") != NFIELDS) + sysfatal("bad number of fields"); + + return 1; +} + +static int +estrtoul(char *s, int base) +{ + char *epr; + Rune code; + + code = strtoul(s, &epr, base); + if(s == epr) + sysfatal("bad code point hex string"); + return code; +} + +enum { + OTHER, + Hebrew_Letter, Newline, Extend, Format, + Katakana, ALetter, MidLetter, MidNum, + MidNumLet, Numeric, ExtendNumLet, WSegSpace, + PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40, + L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0, + EMOJIEX = 0xB0, +}; + +static void +markbreak(void) +{ + Biobuf *b; + char *p, *dot; + int i, s, e; + uchar v; + + b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD); + if(b == nil) + sysfatal("could not load word breaks: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "..")) != nil){ + *dot = 0; + dot += 2; + s = estrtoul(p, 16); + e = estrtoul(dot, 16); + } else { + s = e = estrtoul(p, 16); + dot = p; + } + v = 0; + if(strstr(dot, "ExtendNumLet") != nil) + v = ExtendNumLet; + else if(strstr(dot, "Hebrew_Letter") != nil) + v = 
Hebrew_Letter; + else if(strstr(dot, "Newline") != nil) + v = Newline; + else if(strstr(dot, "Extend") != nil) + v = Extend; + else if(strstr(dot, "Format") != nil) + v = Format; + else if(strstr(dot, "Katakana") != nil) + v = Katakana; + else if(strstr(dot, "ALetter") != nil) + v = ALetter; + else if(strstr(dot, "MidLetter") != nil) + v = MidLetter; + else if(strstr(dot, "MidNum") != nil) /* "MidNum" is a prefix of "MidNumLet": both properties land here */ + v = strstr(dot, "MidNumLet") != nil ? MidNumLet : MidNum; /* keep MidNumLet distinct; runewbreak tests for it */ + else if(strstr(dot, "Numeric") != nil) + v = Numeric; + else if(strstr(dot, "WSegSpace") != nil) + v = WSegSpace; + for(i = s; i <= e; i++) + mybreak[i] = v; + } + Bterm(b); + b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD); + if(b == nil) + sysfatal("could not load Grapheme breaks: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "..")) != nil){ + *dot = 0; + dot += 2; + s = estrtoul(p, 16); + e = estrtoul(dot, 16); + } else { + s = e = estrtoul(p, 16); + dot = p; + } + v = 0; + if(strstr(dot, "; Prepend #") != nil) + v = PREPEND; + else if(strstr(dot, "; Control #") != nil) + v = CONTROL; + else if(strstr(dot, "; Extend #") != nil) + v = EXTEND; + else if(strstr(dot, "; Regional_Indicator #") != nil) + v = REGION; + else if(strstr(dot, "; SpacingMark #") != nil) + v = SPACEMK; + else if(strstr(dot, "; L #") != nil) + v = L; + else if(strstr(dot, "; V #") != nil) + v = V; + else if(strstr(dot, "; T #") != nil) + v = T; + else if(strstr(dot, "; LV #") != nil) + v = LV; + else if(strstr(dot, "; LVT #") != nil) + v = LVT; + for(i = s; i <= e; i++) + mybreak[i] |= v; + } + Bterm(b); + + b = Bopen("/lib/ucd/emoji-data.txt", OREAD); + if(b == nil) + sysfatal("could not load emoji-data: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "..")) != nil){ + *dot = 0; + dot += 2; + s = estrtoul(p, 16); + e = estrtoul(dot, 16); + } else { + s = e = estrtoul(p, 16); + dot = p; + } + v = 0; +
if(strstr(dot, "; Extended_Pictographic") != nil) + v = EMOJIEX; + for(i = s; i <= e; i++) + mybreak[i] |= v; + } + Bterm(b); +} + +static void +markexclusions(void) +{ + Biobuf *b; + char *p; + int i; + uint x; + + b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD); + if(b == nil) + sysfatal("could not load composition exclusions: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + x = estrtoul(p, 16); + for(i = 0; i < nrecomp; i++){ + if(myrecomp[i].val == x){ + myrecomp[i].val = 0; + break; + } + } + if(i == nrecomp){ + for(i = 0; i < nrecompext; i++){ + if(recompext[i*3] == x){ + recompext[i*3] = 0; + break; + } + } + } + } + Bterm(b); +} + +void +main(int, char) +{ + static char myisspace[NRUNES]; + static char myisalpha[NRUNES]; + static char myisdigit[NRUNES]; + static char myisupper[NRUNES]; + static char myislower[NRUNES]; + static char myistitle[NRUNES]; + Biobuf *in; + char *fields[NFIELDS + 1], *fields2[NFIELDS + 1]; + char *p, *d; + int i, code, last; + int decomp[2], *ip; + + in = Bopen("/lib/ucd/UnicodeData.txt", OREAD); + if(in == nil) + sysfatal("can't open UnicodeData.txt: %r"); + + for(i = 0; i < NRUNES; i++){ + mytoupper[i] = -1; + mytolower[i] = -1; + mytotitle[i] = -1; + mydecomp[i] = 0; + myccc[i] = 0; + mybreak[i] = 0; + } + + myisspace['\t'] = 1; + myisspace['\n'] = 1; + myisspace['\r'] = 1; + myisspace['\f'] = 1; + myisspace['\v'] = 1; + myisspace[0x85] = 1; /* control char, "next line" */ + myisspace[0xfeff] = 1; /* zero-width non-break space */ + + last = -1; + nspecial = nrecomp = nrecompext = 0; + while(getunicodeline(in, fields)){ + code = estrtoul(fields[FIELD_CODE], 16); + if (code >= NRUNES) + sysfatal("code-point value too big: %x", code); + if(code <= last) + sysfatal("bad code sequence: %x then %x", last, code); + last = code; + + p = fields[FIELD_CATEGORY]; + if(strstr(fields[FIELD_NAME], ", First>") != nil){ + if(!getunicodeline(in, fields2)) + 
sysfatal("range start at eof"); + if (strstr(fields2[FIELD_NAME], ", Last>") == nil) + sysfatal("range start not followed by range end"); + last = estrtoul(fields2[FIELD_CODE], 16); + if(last <= code) + sysfatal("range out of sequence: %x then %x", code, last); + if(strcmp(p, fields2[FIELD_CATEGORY]) != 0) + sysfatal("range with mismatched category"); + } + + d = fields[FIELD_DECOMP]; + if(strlen(d) > 0 && strstr(d, "<") == nil){ + decomp[0] = estrtoul(d, 16); + d = strstr(d, " "); + if(d == nil){ + /* singleton recompositions are verboden */ + decomp[1] = 0; + if(decomp[0] > 0xFFFF){ + //fprint(2, "case1 %X %X\n", code, decomp[0]); + ip = mydespecial + nspecial*3; + ip[0] = code; + ip[1] = decomp[0]; + ip[2] = 0; + mydecomp[code] = (DSTART+nspecial)<<16; + nspecial++; + } else + mydecomp[code] = decomp[0]<<16; + } else { + d++; + decomp[1] = estrtoul(d, 16); + if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){ + //fprint(2, "case2 %X %X %X\n", code, decomp[0], decomp[1]); + ip = mydespecial + nspecial*3; + ip[0] = code; + ip[1] = decomp[0]; + ip[2] = decomp[1]; + mydecomp[code] = (DSTART+nspecial)<<16; + nspecial++; + ip = recompext + nrecompext*3; + ip[0] = code; + ip[1] = decomp[0]; + ip[2] = decomp[1]; + nrecompext++; + } else { + mydecomp[code] = decomp[0]<<16 | decomp[1]; + myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0}; + } + } + } + + for (; code <= last; code++){ + if(p[0] == 'L') + myisalpha[code] = 1; + if(p[0] == 'Z') + myisspace[code] = 1; + + if(strcmp(p, "Lu") == 0) + myisupper[code] = 1; + if(strcmp(p, "Ll") == 0) + myislower[code] = 1; + + if(strcmp(p, "Lt") == 0) + myistitle[code] = 1; + + if(strcmp(p, "Nd") == 0) + myisdigit[code] = 1; + + if(fields[FIELD_UPPER][0] != '\0') + mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16); + + if(fields[FIELD_LOWER][0] != '\0') + mytolower[code] = estrtoul(fields[FIELD_LOWER], 16); + + if(fields[FIELD_TITLE][0] != '\0') + mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16); + + myccc[code] = 
estrtoul(fields[FIELD_COMBINING], 10); + } + } + + Bterm(in); + + markexclusions(); + + /* + * according to standard, if totitle(x) is not defined in ucd + * but toupper(x) is, then totitle is defined to be toupper(x) + */ + for(i = 0; i < NRUNES; i++){ + if(mytotitle[i] == -1 + && mytoupper[i] != -1 + && !myistitle[i]) + mytotitle[i] = mytoupper[i]; + } + + /* + * A couple corrections: + * is*(to*(x)) should be true. + * restore undefined transformations. + * store offset instead of value, makes them sparse. + */ + for(i = 0; i < NRUNES; i++){ + if(mytoupper[i] != -1) + myisupper[mytoupper[i]] = 1; + else + mytoupper[i] = i; + + if(mytolower[i] != -1) + myislower[mytolower[i]] = 1; + else + mytolower[i] = i; + + if(mytotitle[i] != -1) + myistitle[mytotitle[i]] = 1; + else + mytotitle[i] = i; + + mytoupper[i] = mytoupper[i] - i; + mytolower[i] = mytolower[i] - i; + mytotitle[i] = mytotitle[i] - i; + } + + uchar b; + for(i = 0; i < NRUNES; i++){ + b = 0; + if(myisspace[i]) + b |= 1<<0; + if(myisalpha[i]) + b |= 1<<1; + if(myisdigit[i]) + b |= 1<<2; + if(myisupper[i]) + b |= 1<<3; + if(myislower[i]) + b |= 1<<4; + if(myistitle[i]) + b |= 1<<5; + + myismerged[i] = b; + } + + markbreak(); + mktables(); + exits(nil); +} --- /dev/null +++ b//sys/src/libc/port/runebreak.c @@ -1,0 +1,149 @@ +#include +#include + +#include "/sys/src/libc/port/runebreakdata" + +enum { + OTHER, + Hebrew_Letter, Newline, Extend, Format, + Katakana, ALetter, MidLetter, MidNum, + MidNumLet, Numeric, ExtendNumLet, WSegSpace, + PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40, + L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0, + EMOJIEX = 0xB0, + + ZWJ = 0x200DU, + LINETAB = 0xB, +}; + +#define IS(x, y) ((x&0xf) == y) +#define ISG(x, y) ((x&0xf0) == y) + +Rune* +runegbreak(Rune *s) +{ + Rune l, r; + uchar lt, rt; + Rune *p; + + p = s; + if((l = *p++) == 0) + return s; + if((r = *p) == 0) + return s; + lt = breaklkup(l); + rt = breaklkup(r); + if(l == '\r' && r == 
'\n') + goto Done; + if(ISG(lt, CONTROL) || l == '\r' || l == '\n') + return p; + if(ISG(rt, CONTROL) || r == '\r' || r == '\n') + return p; + if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT))) + goto Done; + if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T))) + goto Done; + if((ISG(lt, LVT) || ISG(lt, T)) && ISG(rt, T)) /* (LVT | T) followed by T stays together */ + goto Done; + if(ISG(rt, SPACEMK) || ISG(lt, PREPEND)) + goto Done; + if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){ + while(ISG(rt, EXTEND)){ + p++; + if((r = *p) == 0) + return s; + rt = breaklkup(r); + } + if(r != ZWJ) + return p; + p++; + if((r = *p) == 0) + return s; + rt = breaklkup(r); + if(ISG(rt, EMOJIEX)) + goto Done; + return p; + } + if(ISG(rt, EXTEND) || r == ZWJ) + goto Done; + if(ISG(lt, REGION) && ISG(rt, REGION)) + goto Done; + + return p; + +Done: + if(p[1] == 0) + return s; + return p + 1; +} + +#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter)) +#define MNLQ(x, c) (IS(x, MidNumLet) || (c) == '\'') /* x is the break class, c the rune itself: the apostrophe is matched on the rune */ + +Rune* +runewbreak(Rune *s) +{ + Rune l, r; + uchar lt, rt; + Rune *p; + + p = s; + if((l = *p++) == 0) + return s; + if((r = *p) == 0) + return s; + lt = breaklkup(l); + rt = breaklkup(r); + if(l == '\r' && r == '\n') + goto Done; + if(l == '\r' || l == '\n' || l == LINETAB) + return p; + if(r == '\r' || r == '\n' || r == LINETAB) /* break before a newline: look at the right-hand rune */ + return p; + if(IS(lt, WSegSpace) && IS(rt, WSegSpace)) + goto Done; + if(IS(rt, Format) || IS(rt, Extend)) + goto Done; + if(AH(lt)){ + if(AH(rt)) + goto Done; + if((IS(rt, MidLetter) || MNLQ(rt, r)) && p[1] != 0 && AH(breaklkup(p[1]))) + goto Done; + if(IS(lt, Hebrew_Letter) && r == '\'') + goto Done; + if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter)) + goto Done; + if(IS(rt, Numeric)) + goto Done; + } + if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric))) + goto Done; + if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt, r)) && p[1] != 0 && IS(breaklkup(p[1]), Numeric)) + goto Done; + if(IS(lt, Katakana) && IS(rt,
Katakana)) + goto Done; + if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet)) + if(IS(rt, ExtendNumLet)) + goto Done; + if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana))) + goto Done; + if(ISG(lt, REGION)){ + if(ISG(rt, REGION)) + goto Done; + if(r != ZWJ) + return p; + p++; + if((r = *p) == 0) + return s; + rt = breaklkup(r); + if(ISG(rt, REGION)) + goto Done; + } + + return p; + +Done: + if(p[1] == 0) + return s; + return p + 1; +} --- /dev/null +++ b//sys/src/libc/port/runeistype.c @@ -1,0 +1,40 @@ +#include +#include + +#include "/sys/src/libc/port/runeistypedata" + +int +isspacerune(Rune c) +{ + return (mergedlkup(c) & Lspace) == Lspace; +} + +int +isalpharune(Rune c) +{ + return (mergedlkup(c) & Lalpha) == Lalpha; +} + +int +isdigitrune(Rune c) +{ + return (mergedlkup(c) & Ldigit) == Ldigit; +} + +int +isupperrune(Rune c) +{ + return (mergedlkup(c) & Lupper) == Lupper; +} + +int +islowerrune(Rune c) +{ + return (mergedlkup(c) & Llower) == Llower; +} + +int +istitlerune(Rune c) +{ + return (mergedlkup(c) & Ltitle) == Ltitle; +} --- /dev/null +++ b//sys/src/libc/port/runenorm.c @@ -1,0 +1,328 @@ +#include +#include + +#include "/sys/src/libc/port/runenormdata" + +//Unicode Standard: Section 3.12 Conjoining Jamo Behavior +enum { + SBase = 0xAC00, + LBase = 0x1100, + VBase = 0x1161, + TBase = 0x11A7, + + LCount = 19, + VCount = 21, + TCount = 28, + NCount = VCount * TCount, + SCount = LCount * NCount, + + LLast = LBase + LCount - 1, + SLast = SBase + SCount - 1, + VLast = VBase + VCount - 1, + TLast = TBase + TCount - 1, +}; + +void +decomposerune(Rune c, Rune dst[2]) +{ + uint x; + + if(c >= SBase && c <= SLast){ + c -= SBase; + x = c % TCount; + if(x){ + dst[0] = SBase + ((c / TCount) * TCount); + dst[1] = TBase + x; + return; + } + dst[0] = LBase + (c / NCount); + dst[1] = VBase + ((c % NCount) / TCount); + return; + } + x = decomplkup(c); + if((x & (ushort)~0) != 0){ + dst[0] = x>>16; + dst[1] = x & (ushort)~0; + 
return; + } + x >>= 16; + if(x >= 0xEEEE && x <0xF8FF){ + memmove(dst, _decompexceptions[x - 0xEEEE], sizeof(Rune)*2); + return; + } + dst[0] = x; + dst[1] = 0; +} + +Rune +composerune(Rune r[2]) +{ + uint x, y, *p, next; + + if(r[0] >= LBase && r[0] <= LLast){ + if(r[1] < VBase || r[1] > VLast) + return 0; + x = (r[0] - LBase) * NCount + (r[1] - VBase) * TCount; + return SBase + x; + } + if(r[0] >= SBase && r[0] <= SLast && (r[0] - SBase) % TCount == 0){ + if(r[1] > TBase && r[1] <= TLast) + return r[0] + (r[1] - TBase); + return 0; + } + if(r[0] > (ushort)~0 || r[1] > (ushort)~0){ + for(x = 0; x < nelem(_recompexceptions); x++) + if(r[0] == _recompexceptions[x][1] && r[1] == _recompexceptions[x][2]) + return _recompexceptions[x][0]; + return 0; + } + y = x = r[0]<<16 | r[1]; + x ^= x >> 16; + x *= 0x21f0aaad; + x ^= x >> 15; + x *= 0xd35a2d97; + x ^= x >> 15; + p = _recompdata + (x%512)*2; + while(p[0] != y){ + next = p[1]>>16; + if(!next) + return 0; + p = _recompcoll + (next-1)*2; + } + return p[1] & 0xFFFF; +} + +int +runeccc(Rune c) +{ + return ccclkup(c); +} + +void +runecccsort(Rune *a, int len) +{ + Rune r; + int i; + int fail; + + do { + fail = 0; + for(i = 0; i < len - 1; i++){ + if(runeccc(a[i+1]) > 0 && runeccc(a[i]) > runeccc(a[i+1])){ /* swap only when ccc(a[i]) > ccc(a[i+1]) > 0: never move a mark past a starter */ + r = a[i]; + a[i] = a[i+1]; + a[i + 1] = r; + fail = 1; + } + } + } while(fail); +} + +char* +fullutfnorm(char *s, int n) +{ + Rune r, peek; + char *p, *p2; + + p = s; + if(fullrune(p, n) == 0) + return s; + + p += chartorune(&r, p); + n -= (p - s); + + if((r >= LBase && r <= LLast) || (r >= SBase && r <= SLast)){ + do { + if(fullrune(p, n) == 0) + return s; + p2 = p + chartorune(&peek, p); + n -= (p2 - p); + p = p2; + } while(n > 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast))); /* n > 0 guards both jamo ranges */ + if(n <= 0) + return s; + return p; + } + + do { + if(fullrune(p, n) == 0) + return s; + p2 = p + chartorune(&peek, p); + n -= (p2 - p); + p = p2; + if(runeccc(peek) == 0) + return p; + } while(n > 0); + + return s; +} + +Rune*
+fullrunenorm(Rune *r, int n) +{ + Rune *e, *p; + + p = r; + e = p + n; + + if(p < e && ((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast))){ /* do not read r[0] when n <= 0 */ + p++; + while(p < e && ((*p >= VBase && *p <= VLast) || (*p > TBase && *p <= TLast))) /* bound check covers both jamo ranges, so *p is never read at or past e */ + p++; + + if(p >= e) + return r; + return p; + } + + for(; p < e && p + 1 < e; p++) + if(runeccc(p[1]) == 0) + return p + 1; + + return r; +} + +int +_runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose) +{ + Rune c, r[2], _stack[32]; + Rune *p, *stack, *sp, *tp; + char *strp, *strstop; + Rune *rp, *rrp; + Rune *stop; + Rune peek; + int w, w2, size; + int mode; + + if(src){ + mode = 1; + p = src; + stop = dst + (max - 1); + strp = ""; + strstop = nil; + } else { + mode = 0; + p = L""; + stop = nil; + strp = ssrc; + strstop = sdst + (max - 1); + } + + stack = _stack + nelem(_stack)/2; + size = 0; + w = w2 = 0; + while(*strp || *p){ + if(mode) + c = *p; + else + w = chartorune(&c, strp); + + sp = stack - 1; + tp = stack; + decomposerune(c, r); + while(r[0] != 0){ + c = r[0]; + if(r[1] != 0){ + *sp-- = r[1]; + if(sp == _stack) + break; + } + decomposerune(c, r); + } + + *sp = c; + if(mode) + peek = p[1]; + else + w2 = chartorune(&peek, strp+w); + + if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <= SLast)){ + while(peek != 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast))){ /* explicit grouping; peek == 0 ends the jamo run */ + *tp++ = peek; + if(mode){ + p++; + peek = p[1]; + } else { + strp += w; + w = w2; + w2 = chartorune(&peek, strp+w); + } + if(tp == _stack + nelem(_stack)) + break; + } + } + while(peek != 0 && runeccc(peek) != 0){ + decomposerune(peek, r); + if(r[1] != 0){ + if(tp+1 >= _stack + nelem(_stack)) + break; + *tp++ = r[0]; + *tp++ = r[1]; + } else if(r[0] != 0) + *tp++ = r[0]; + else + *tp++ = peek; + + if(mode){ + p++; + peek = p[1]; + } else { + strp += w; + w = w2; + w2 = chartorune(&peek, strp+w); + } + if(tp == _stack + nelem(_stack)) + break; + } + runecccsort(sp, tp - sp); + + if(compose && runeccc(*sp) == 0){ +
for(rp = sp + 1; rp < tp; rp++){ + r[0] = *sp; + r[1] = *rp; + c = composerune(r); + if(c != 0){ + *sp = c; + for(rrp = rp; rrp > sp; rrp--) + *rrp = rrp[-1]; + sp++; + } else while(rp + 1 < tp && runeccc(*rp) == runeccc(*(rp+1))) + rp++; + } + } + + for(; sp < tp; sp++){ + if(mode){ + if(dst < stop) + *dst++ = *sp; + size++; + } else { + w2 = runelen(*sp); + if(sdst+w2 < strstop) + sdst += runetochar(sdst, sp); + size += w2; + } + } + if(mode) + p++; + else + strp += w; + } + if(mode) + *dst = 0; + else + *sdst = 0; + return size; +} + +int +runenorm(Rune *dst, Rune *src, int max, int compose) +{ + return _runenorm(dst, src, nil, nil, max, compose); +} + +int +utfnorm(char *dst, char *src, int max, int compose) +{ + return _runenorm(nil, nil, dst, src, max, compose); +} --- /dev/null +++ b//sys/src/libc/port/runetotype.c @@ -1,0 +1,22 @@ +#include +#include + +#include "/sys/src/libc/port/runetotypedata" + +Rune +toupperrune(Rune c) +{ + return c + upperlkup(c); +} + +Rune +tolowerrune(Rune c) +{ + return c + lowerlkup(c); +} + +Rune +totitlerune(Rune c) +{ + return c + titlelkup(c); +} --- a//sys/src/libc/port/runetype.c +++ /dev/null @@ -1,1181 +1,0 @@ -#include -#include - -/* - * alpha ranges - - * only covers ranges not in lower||upper - */ -static -Rune _alpha2[] = -{ - 0x00d8, 0x00f6, /* Ø - ö */ - 0x00f8, 0x01f5, /* ø - ǵ */ - 0x0250, 0x02a8, /* ɐ - ʨ */ - 0x038e, 0x03a1, /* Ύ - Ρ */ - 0x03a3, 0x03ce, /* Σ - ώ */ - 0x03d0, 0x03d6, /* ϐ - ϖ */ - 0x03e2, 0x03f3, /* Ϣ - ϳ */ - 0x0490, 0x04c4, /* Ґ - ӄ */ - 0x0561, 0x0587, /* ա - և */ - 0x05d0, 0x05ea, /* א - ת */ - 0x05f0, 0x05f2, /* װ - ײ */ - 0x0621, 0x063a, /* ء - غ */ - 0x0640, 0x064a, /* ـ - ي */ - 0x0671, 0x06b7, /* ٱ - ڷ */ - 0x06ba, 0x06be, /* ں - ھ */ - 0x06c0, 0x06ce, /* ۀ - ێ */ - 0x06d0, 0x06d3, /* ې - ۓ */ - 0x0905, 0x0939, /* अ - ह */ - 0x0958, 0x0961, /* क़ - ॡ */ - 0x0985, 0x098c, /* অ - ঌ */ - 0x098f, 0x0990, /* এ - ঐ */ - 0x0993, 0x09a8, /* ও - ন */ - 0x09aa, 0x09b0, /* প - র */ - 0x09b6, 
0x09b9, /* শ - হ */ - 0x09dc, 0x09dd, /* ড় - ঢ় */ - 0x09df, 0x09e1, /* য় - ৡ */ - 0x09f0, 0x09f1, /* ৰ - ৱ */ - 0x0a05, 0x0a0a, /* ਅ - ਊ */ - 0x0a0f, 0x0a10, /* ਏ - ਐ */ - 0x0a13, 0x0a28, /* ਓ - ਨ */ - 0x0a2a, 0x0a30, /* ਪ - ਰ */ - 0x0a32, 0x0a33, /* ਲ - ਲ਼ */ - 0x0a35, 0x0a36, /* ਵ - ਸ਼ */ - 0x0a38, 0x0a39, /* ਸ - ਹ */ - 0x0a59, 0x0a5c, /* ਖ਼ - ੜ */ - 0x0a85, 0x0a8b, /* અ - ઋ */ - 0x0a8f, 0x0a91, /* એ - ઑ */ - 0x0a93, 0x0aa8, /* ઓ - ન */ - 0x0aaa, 0x0ab0, /* પ - ર */ - 0x0ab2, 0x0ab3, /* લ - ળ */ - 0x0ab5, 0x0ab9, /* વ - હ */ - 0x0b05, 0x0b0c, /* ଅ - ଌ */ - 0x0b0f, 0x0b10, /* ଏ - ଐ */ - 0x0b13, 0x0b28, /* ଓ - ନ */ - 0x0b2a, 0x0b30, /* ପ - ର */ - 0x0b32, 0x0b33, /* ଲ - ଳ */ - 0x0b36, 0x0b39, /* ଶ - ହ */ - 0x0b5c, 0x0b5d, /* ଡ଼ - ଢ଼ */ - 0x0b5f, 0x0b61, /* ୟ - ୡ */ - 0x0b85, 0x0b8a, /* அ - ஊ */ - 0x0b8e, 0x0b90, /* எ - ஐ */ - 0x0b92, 0x0b95, /* ஒ - க */ - 0x0b99, 0x0b9a, /* ங - ச */ - 0x0b9e, 0x0b9f, /* ஞ - ட */ - 0x0ba3, 0x0ba4, /* ண - த */ - 0x0ba8, 0x0baa, /* ந - ப */ - 0x0bae, 0x0bb5, /* ம - வ */ - 0x0bb7, 0x0bb9, /* ஷ - ஹ */ - 0x0c05, 0x0c0c, /* అ - ఌ */ - 0x0c0e, 0x0c10, /* ఎ - ఐ */ - 0x0c12, 0x0c28, /* ఒ - న */ - 0x0c2a, 0x0c33, /* ప - ళ */ - 0x0c35, 0x0c39, /* వ - హ */ - 0x0c60, 0x0c61, /* ౠ - ౡ */ - 0x0c85, 0x0c8c, /* ಅ - ಌ */ - 0x0c8e, 0x0c90, /* ಎ - ಐ */ - 0x0c92, 0x0ca8, /* ಒ - ನ */ - 0x0caa, 0x0cb3, /* ಪ - ಳ */ - 0x0cb5, 0x0cb9, /* ವ - ಹ */ - 0x0ce0, 0x0ce1, /* ೠ - ೡ */ - 0x0d05, 0x0d0c, /* അ - ഌ */ - 0x0d0e, 0x0d10, /* എ - ഐ */ - 0x0d12, 0x0d28, /* ഒ - ന */ - 0x0d2a, 0x0d39, /* പ - ഹ */ - 0x0d60, 0x0d61, /* ൠ - ൡ */ - 0x0e01, 0x0e30, /* ก - ะ */ - 0x0e32, 0x0e33, /* า - ำ */ - 0x0e40, 0x0e46, /* เ - ๆ */ - 0x0e5a, 0x0e5b, /* ๚ - ๛ */ - 0x0e81, 0x0e82, /* ກ - ຂ */ - 0x0e87, 0x0e88, /* ງ - ຈ */ - 0x0e94, 0x0e97, /* ດ - ທ */ - 0x0e99, 0x0e9f, /* ນ - ຟ */ - 0x0ea1, 0x0ea3, /* ມ - ຣ */ - 0x0eaa, 0x0eab, /* ສ - ຫ */ - 0x0ead, 0x0eae, /* ອ - ຮ */ - 0x0eb2, 0x0eb3, /* າ - ຳ */ - 0x0ec0, 0x0ec4, /* ເ - ໄ */ - 0x0edc, 0x0edd, /* ໜ - ໝ */ - 0x0f18, 0x0f19, /* 
༘ - ༙ */ - 0x0f40, 0x0f47, /* ཀ - ཇ */ - 0x0f49, 0x0f69, /* ཉ - ཀྵ */ - 0x10d0, 0x10f6, /* ა - ჶ */ - 0x1100, 0x1159, /* ᄀ - ᅙ */ - 0x115f, 0x11a2, /* ᅟ - ᆢ */ - 0x11a8, 0x11f9, /* ᆨ - ᇹ */ - 0x1e00, 0x1e9b, /* Ḁ - ẛ */ - 0x1f50, 0x1f57, /* ὐ - ὗ */ - 0x1f80, 0x1fb4, /* ᾀ - ᾴ */ - 0x1fb6, 0x1fbc, /* ᾶ - ᾼ */ - 0x1fc2, 0x1fc4, /* ῂ - ῄ */ - 0x1fc6, 0x1fcc, /* ῆ - ῌ */ - 0x1fd0, 0x1fd3, /* ῐ - ΐ */ - 0x1fd6, 0x1fdb, /* ῖ - Ί */ - 0x1fe0, 0x1fec, /* ῠ - Ῥ */ - 0x1ff2, 0x1ff4, /* ῲ - ῴ */ - 0x1ff6, 0x1ffc, /* ῶ - ῼ */ - 0x210a, 0x2113, /* ℊ - ℓ */ - 0x2115, 0x211d, /* ℕ - ℝ */ - 0x2120, 0x2122, /* ℠ - ™ */ - 0x212a, 0x2131, /* K - ℱ */ - 0x2133, 0x2138, /* ℳ - ℸ */ - 0x3041, 0x3094, /* ぁ - ゔ */ - 0x30a1, 0x30fa, /* ァ - ヺ */ - 0x3105, 0x312c, /* ㄅ - ㄬ */ - 0x3131, 0x318e, /* ㄱ - ㆎ */ - 0x3192, 0x319f, /* ㆒ - ㆟ */ - 0x3260, 0x327b, /* ㉠ - ㉻ */ - 0x328a, 0x32b0, /* ㊊ - ㊰ */ - 0x32d0, 0x32fe, /* ㋐ - ㋾ */ - 0x3300, 0x3357, /* ㌀ - ㍗ */ - 0x3371, 0x3376, /* ㍱ - ㍶ */ - 0x337b, 0x3394, /* ㍻ - ㎔ */ - 0x3399, 0x339e, /* ㎙ - ㎞ */ - 0x33a9, 0x33ad, /* ㎩ - ㎭ */ - 0x33b0, 0x33c1, /* ㎰ - ㏁ */ - 0x33c3, 0x33c5, /* ㏃ - ㏅ */ - 0x33c7, 0x33d7, /* ㏇ - ㏗ */ - 0x33d9, 0x33dd, /* ㏙ - ㏝ */ - 0x4e00, 0x9fff, /* 一 - 鿿 */ - 0xac00, 0xd7a3, /* 가 - 힣 */ - 0xf900, 0xfb06, /* 豈 - st */ - 0xfb13, 0xfb17, /* ﬓ - ﬗ */ - 0xfb1f, 0xfb28, /* ײַ - ﬨ */ - 0xfb2a, 0xfb36, /* שׁ - זּ */ - 0xfb38, 0xfb3c, /* טּ - לּ */ - 0xfb40, 0xfb41, /* נּ - סּ */ - 0xfb43, 0xfb44, /* ףּ - פּ */ - 0xfb46, 0xfbb1, /* צּ - ﮱ */ - 0xfbd3, 0xfd3d, /* ﯓ - ﴽ */ - 0xfd50, 0xfd8f, /* ﵐ - ﶏ */ - 0xfd92, 0xfdc7, /* ﶒ - ﷇ */ - 0xfdf0, 0xfdf9, /* ﷰ - ﷹ */ - 0xfe70, 0xfe72, /* ﹰ - ﹲ */ - 0xfe76, 0xfefc, /* ﹶ - ﻼ */ - 0xff66, 0xff6f, /* ヲ - ッ */ - 0xff71, 0xff9d, /* ア - ン */ - 0xffa0, 0xffbe, /* ᅠ - ᄒ */ - 0xffc2, 0xffc7, /* ᅡ - ᅦ */ - 0xffca, 0xffcf, /* ᅧ - ᅬ */ - 0xffd2, 0xffd7, /* ᅭ - ᅲ */ - 0xffda, 0xffdc, /* ᅳ - ᅵ */ -}; - -/* - * alpha singlets - - * only covers ranges not in lower||upper - */ -static -Rune _alpha1[] = -{ - 0x00aa, 
/* ª */ - 0x00b5, /* µ */ - 0x00ba, /* º */ - 0x03da, /* Ϛ */ - 0x03dc, /* Ϝ */ - 0x03de, /* Ϟ */ - 0x03e0, /* Ϡ */ - 0x06d5, /* ە */ - 0x09b2, /* ল */ - 0x0a5e, /* ਫ਼ */ - 0x0a8d, /* ઍ */ - 0x0ae0, /* ૠ */ - 0x0b9c, /* ஜ */ - 0x0cde, /* ೞ */ - 0x0e4f, /* ๏ */ - 0x0e84, /* ຄ */ - 0x0e8a, /* ຊ */ - 0x0e8d, /* ຍ */ - 0x0ea5, /* ລ */ - 0x0ea7, /* ວ */ - 0x0eb0, /* ະ */ - 0x0ebd, /* ຽ */ - 0x1fbe, /* ι */ - 0x207f, /* ⁿ */ - 0x20a8, /* ₨ */ - 0x2102, /* ℂ */ - 0x2107, /* ℇ */ - 0x2124, /* ℤ */ - 0x2126, /* Ω */ - 0x2128, /* ℨ */ - 0xfb3e, /* מּ */ - 0xfe74, /* ﹴ */ -}; - -/* - * space ranges - */ -static -Rune _space2[] = -{ - 0x0009, 0x000a, /* tab and newline */ - 0x0020, 0x0020, /* space */ - 0x0085, 0x0085, - 0x00a0, 0x00a0, /*   */ - 0x1680, 0x1680, - 0x180e, 0x180e, - 0x2000, 0x200b, /*   - ​ */ - 0x2028, 0x2029, /* 
 - 
 */ - 0x202f, 0x202f, - 0x205f, 0x205f, - 0x3000, 0x3000, /*   */ - 0xfeff, 0xfeff, /*  */ -}; - -/* - * lower case ranges - * 3rd col is conversion excess 500 - */ -static -Rune _toupper2[] = -{ - 0x0061, 0x007a, 468, /* a-z A-Z */ - 0x00e0, 0x00f6, 468, /* à-ö À-Ö */ - 0x00f8, 0x00fe, 468, /* ø-þ Ø-Þ */ - 0x0256, 0x0257, 295, /* ɖ-ɗ Ɖ-Ɗ */ - 0x0258, 0x0259, 298, /* ɘ-ə Ǝ-Ə */ - 0x028a, 0x028b, 283, /* ʊ-ʋ Ʊ-Ʋ */ - 0x03ad, 0x03af, 463, /* έ-ί Έ-Ί */ - 0x03b1, 0x03c1, 468, /* α-ρ Α-Ρ */ - 0x03c3, 0x03cb, 468, /* σ-ϋ Σ-Ϋ */ - 0x03cd, 0x03ce, 437, /* ύ-ώ Ύ-Ώ */ - 0x0430, 0x044f, 468, /* а-я А-Я */ - 0x0451, 0x045c, 420, /* ё-ќ Ё-Ќ */ - 0x045e, 0x045f, 420, /* ў-џ Ў-Џ */ - 0x0561, 0x0586, 452, /* ա-ֆ Ա-Ֆ */ - 0x1f00, 0x1f07, 508, /* ἀ-ἇ Ἀ-Ἇ */ - 0x1f10, 0x1f15, 508, /* ἐ-ἕ Ἐ-Ἕ */ - 0x1f20, 0x1f27, 508, /* ἠ-ἧ Ἠ-Ἧ */ - 0x1f30, 0x1f37, 508, /* ἰ-ἷ Ἰ-Ἷ */ - 0x1f40, 0x1f45, 508, /* ὀ-ὅ Ὀ-Ὅ */ - 0x1f60, 0x1f67, 508, /* ὠ-ὧ Ὠ-Ὧ */ - 0x1f70, 0x1f71, 574, /* ὰ-ά Ὰ-Ά */ - 0x1f72, 0x1f75, 586, /* ὲ-ή Ὲ-Ή */ - 0x1f76, 0x1f77, 600, /* ὶ-ί Ὶ-Ί */ - 0x1f78, 0x1f79, 628, /* ὸ-ό Ὸ-Ό */ - 0x1f7a, 0x1f7b, 612, /* ὺ-ύ Ὺ-Ύ */ - 0x1f7c, 0x1f7d, 626, /* ὼ-ώ Ὼ-Ώ */ - 0x1f80, 0x1f87, 508, /* ᾀ-ᾇ ᾈ-ᾏ */ - 0x1f90, 0x1f97, 508, /* ᾐ-ᾗ ᾘ-ᾟ */ - 0x1fa0, 0x1fa7, 508, /* ᾠ-ᾧ ᾨ-ᾯ */ - 0x1fb0, 0x1fb1, 508, /* ᾰ-ᾱ Ᾰ-Ᾱ */ - 0x1fd0, 0x1fd1, 508, /* ῐ-ῑ Ῐ-Ῑ */ - 0x1fe0, 0x1fe1, 508, /* ῠ-ῡ Ῠ-Ῡ */ - 0x2170, 0x217f, 484, /* ⅰ-ⅿ Ⅰ-Ⅿ */ - 0x24d0, 0x24e9, 474, /* ⓐ-ⓩ Ⓐ-Ⓩ */ - 0xff41, 0xff5a, 468, /* a-z A-Z */ -}; - -/* - * lower case singlets - * 2nd col is conversion excess 500 - */ -static -Rune _toupper1[] = -{ - 0x00ff, 621, /* ÿ Ÿ */ - 0x0101, 499, /* ā Ā */ - 0x0103, 499, /* ă Ă */ - 0x0105, 499, /* ą Ą */ - 0x0107, 499, /* ć Ć */ - 0x0109, 499, /* ĉ Ĉ */ - 0x010b, 499, /* ċ Ċ */ - 0x010d, 499, /* č Č */ - 0x010f, 499, /* ď Ď */ - 0x0111, 499, /* đ Đ */ - 0x0113, 499, /* ē Ē */ - 0x0115, 499, /* ĕ Ĕ */ - 0x0117, 499, /* ė Ė */ - 0x0119, 499, /* ę Ę */ - 0x011b, 499, /* ě Ě */ - 0x011d, 499, /* ĝ Ĝ */ - 
0x011f, 499, /* ğ Ğ */ - 0x0121, 499, /* ġ Ġ */ - 0x0123, 499, /* ģ Ģ */ - 0x0125, 499, /* ĥ Ĥ */ - 0x0127, 499, /* ħ Ħ */ - 0x0129, 499, /* ĩ Ĩ */ - 0x012b, 499, /* ī Ī */ - 0x012d, 499, /* ĭ Ĭ */ - 0x012f, 499, /* į Į */ - 0x0131, 268, /* ı I */ - 0x0133, 499, /* ij IJ */ - 0x0135, 499, /* ĵ Ĵ */ - 0x0137, 499, /* ķ Ķ */ - 0x013a, 499, /* ĺ Ĺ */ - 0x013c, 499, /* ļ Ļ */ - 0x013e, 499, /* ľ Ľ */ - 0x0140, 499, /* ŀ Ŀ */ - 0x0142, 499, /* ł Ł */ - 0x0144, 499, /* ń Ń */ - 0x0146, 499, /* ņ Ņ */ - 0x0148, 499, /* ň Ň */ - 0x014b, 499, /* ŋ Ŋ */ - 0x014d, 499, /* ō Ō */ - 0x014f, 499, /* ŏ Ŏ */ - 0x0151, 499, /* ő Ő */ - 0x0153, 499, /* œ Œ */ - 0x0155, 499, /* ŕ Ŕ */ - 0x0157, 499, /* ŗ Ŗ */ - 0x0159, 499, /* ř Ř */ - 0x015b, 499, /* ś Ś */ - 0x015d, 499, /* ŝ Ŝ */ - 0x015f, 499, /* ş Ş */ - 0x0161, 499, /* š Š */ - 0x0163, 499, /* ţ Ţ */ - 0x0165, 499, /* ť Ť */ - 0x0167, 499, /* ŧ Ŧ */ - 0x0169, 499, /* ũ Ũ */ - 0x016b, 499, /* ū Ū */ - 0x016d, 499, /* ŭ Ŭ */ - 0x016f, 499, /* ů Ů */ - 0x0171, 499, /* ű Ű */ - 0x0173, 499, /* ų Ų */ - 0x0175, 499, /* ŵ Ŵ */ - 0x0177, 499, /* ŷ Ŷ */ - 0x017a, 499, /* ź Ź */ - 0x017c, 499, /* ż Ż */ - 0x017e, 499, /* ž Ž */ - 0x017f, 200, /* ſ S */ - 0x0183, 499, /* ƃ Ƃ */ - 0x0185, 499, /* ƅ Ƅ */ - 0x0188, 499, /* ƈ Ƈ */ - 0x018c, 499, /* ƌ Ƌ */ - 0x0192, 499, /* ƒ Ƒ */ - 0x0199, 499, /* ƙ Ƙ */ - 0x01a1, 499, /* ơ Ơ */ - 0x01a3, 499, /* ƣ Ƣ */ - 0x01a5, 499, /* ƥ Ƥ */ - 0x01a8, 499, /* ƨ Ƨ */ - 0x01ad, 499, /* ƭ Ƭ */ - 0x01b0, 499, /* ư Ư */ - 0x01b4, 499, /* ƴ Ƴ */ - 0x01b6, 499, /* ƶ Ƶ */ - 0x01b9, 499, /* ƹ Ƹ */ - 0x01bd, 499, /* ƽ Ƽ */ - 0x01c5, 499, /* Dž DŽ */ - 0x01c6, 498, /* dž DŽ */ - 0x01c8, 499, /* Lj LJ */ - 0x01c9, 498, /* lj LJ */ - 0x01cb, 499, /* Nj NJ */ - 0x01cc, 498, /* nj NJ */ - 0x01ce, 499, /* ǎ Ǎ */ - 0x01d0, 499, /* ǐ Ǐ */ - 0x01d2, 499, /* ǒ Ǒ */ - 0x01d4, 499, /* ǔ Ǔ */ - 0x01d6, 499, /* ǖ Ǖ */ - 0x01d8, 499, /* ǘ Ǘ */ - 0x01da, 499, /* ǚ Ǚ */ - 0x01dc, 499, /* ǜ Ǜ */ - 0x01df, 499, /* ǟ Ǟ */ - 0x01e1, 
499, /* ǡ Ǡ */ - 0x01e3, 499, /* ǣ Ǣ */ - 0x01e5, 499, /* ǥ Ǥ */ - 0x01e7, 499, /* ǧ Ǧ */ - 0x01e9, 499, /* ǩ Ǩ */ - 0x01eb, 499, /* ǫ Ǫ */ - 0x01ed, 499, /* ǭ Ǭ */ - 0x01ef, 499, /* ǯ Ǯ */ - 0x01f2, 499, /* Dz DZ */ - 0x01f3, 498, /* dz DZ */ - 0x01f5, 499, /* ǵ Ǵ */ - 0x01fb, 499, /* ǻ Ǻ */ - 0x01fd, 499, /* ǽ Ǽ */ - 0x01ff, 499, /* ǿ Ǿ */ - 0x0201, 499, /* ȁ Ȁ */ - 0x0203, 499, /* ȃ Ȃ */ - 0x0205, 499, /* ȅ Ȅ */ - 0x0207, 499, /* ȇ Ȇ */ - 0x0209, 499, /* ȉ Ȉ */ - 0x020b, 499, /* ȋ Ȋ */ - 0x020d, 499, /* ȍ Ȍ */ - 0x020f, 499, /* ȏ Ȏ */ - 0x0211, 499, /* ȑ Ȑ */ - 0x0213, 499, /* ȓ Ȓ */ - 0x0215, 499, /* ȕ Ȕ */ - 0x0217, 499, /* ȗ Ȗ */ - 0x0253, 290, /* ɓ Ɓ */ - 0x0254, 294, /* ɔ Ɔ */ - 0x025b, 297, /* ɛ Ɛ */ - 0x0260, 295, /* ɠ Ɠ */ - 0x0263, 293, /* ɣ Ɣ */ - 0x0268, 291, /* ɨ Ɨ */ - 0x0269, 289, /* ɩ Ɩ */ - 0x026f, 289, /* ɯ Ɯ */ - 0x0272, 287, /* ɲ Ɲ */ - 0x0283, 282, /* ʃ Ʃ */ - 0x0288, 282, /* ʈ Ʈ */ - 0x0292, 281, /* ʒ Ʒ */ - 0x03ac, 462, /* ά Ά */ - 0x03cc, 436, /* ό Ό */ - 0x03d0, 438, /* ϐ Β */ - 0x03d1, 443, /* ϑ Θ */ - 0x03d5, 453, /* ϕ Φ */ - 0x03d6, 446, /* ϖ Π */ - 0x03e3, 499, /* ϣ Ϣ */ - 0x03e5, 499, /* ϥ Ϥ */ - 0x03e7, 499, /* ϧ Ϧ */ - 0x03e9, 499, /* ϩ Ϩ */ - 0x03eb, 499, /* ϫ Ϫ */ - 0x03ed, 499, /* ϭ Ϭ */ - 0x03ef, 499, /* ϯ Ϯ */ - 0x03f0, 414, /* ϰ Κ */ - 0x03f1, 420, /* ϱ Ρ */ - 0x0461, 499, /* ѡ Ѡ */ - 0x0463, 499, /* ѣ Ѣ */ - 0x0465, 499, /* ѥ Ѥ */ - 0x0467, 499, /* ѧ Ѧ */ - 0x0469, 499, /* ѩ Ѩ */ - 0x046b, 499, /* ѫ Ѫ */ - 0x046d, 499, /* ѭ Ѭ */ - 0x046f, 499, /* ѯ Ѯ */ - 0x0471, 499, /* ѱ Ѱ */ - 0x0473, 499, /* ѳ Ѳ */ - 0x0475, 499, /* ѵ Ѵ */ - 0x0477, 499, /* ѷ Ѷ */ - 0x0479, 499, /* ѹ Ѹ */ - 0x047b, 499, /* ѻ Ѻ */ - 0x047d, 499, /* ѽ Ѽ */ - 0x047f, 499, /* ѿ Ѿ */ - 0x0481, 499, /* ҁ Ҁ */ - 0x0491, 499, /* ґ Ґ */ - 0x0493, 499, /* ғ Ғ */ - 0x0495, 499, /* ҕ Ҕ */ - 0x0497, 499, /* җ Җ */ - 0x0499, 499, /* ҙ Ҙ */ - 0x049b, 499, /* қ Қ */ - 0x049d, 499, /* ҝ Ҝ */ - 0x049f, 499, /* ҟ Ҟ */ - 0x04a1, 499, /* ҡ Ҡ */ - 0x04a3, 499, /* ң Ң */ - 
0x04a5, 499, /* ҥ Ҥ */ - 0x04a7, 499, /* ҧ Ҧ */ - 0x04a9, 499, /* ҩ Ҩ */ - 0x04ab, 499, /* ҫ Ҫ */ - 0x04ad, 499, /* ҭ Ҭ */ - 0x04af, 499, /* ү Ү */ - 0x04b1, 499, /* ұ Ұ */ - 0x04b3, 499, /* ҳ Ҳ */ - 0x04b5, 499, /* ҵ Ҵ */ - 0x04b7, 499, /* ҷ Ҷ */ - 0x04b9, 499, /* ҹ Ҹ */ - 0x04bb, 499, /* һ Һ */ - 0x04bd, 499, /* ҽ Ҽ */ - 0x04bf, 499, /* ҿ Ҿ */ - 0x04c2, 499, /* ӂ Ӂ */ - 0x04c4, 499, /* ӄ Ӄ */ - 0x04c8, 499, /* ӈ Ӈ */ - 0x04cc, 499, /* ӌ Ӌ */ - 0x04d1, 499, /* ӑ Ӑ */ - 0x04d3, 499, /* ӓ Ӓ */ - 0x04d5, 499, /* ӕ Ӕ */ - 0x04d7, 499, /* ӗ Ӗ */ - 0x04d9, 499, /* ә Ә */ - 0x04db, 499, /* ӛ Ӛ */ - 0x04dd, 499, /* ӝ Ӝ */ - 0x04df, 499, /* ӟ Ӟ */ - 0x04e1, 499, /* ӡ Ӡ */ - 0x04e3, 499, /* ӣ Ӣ */ - 0x04e5, 499, /* ӥ Ӥ */ - 0x04e7, 499, /* ӧ Ӧ */ - 0x04e9, 499, /* ө Ө */ - 0x04eb, 499, /* ӫ Ӫ */ - 0x04ef, 499, /* ӯ Ӯ */ - 0x04f1, 499, /* ӱ Ӱ */ - 0x04f3, 499, /* ӳ Ӳ */ - 0x04f5, 499, /* ӵ Ӵ */ - 0x04f9, 499, /* ӹ Ӹ */ - 0x1e01, 499, /* ḁ Ḁ */ - 0x1e03, 499, /* ḃ Ḃ */ - 0x1e05, 499, /* ḅ Ḅ */ - 0x1e07, 499, /* ḇ Ḇ */ - 0x1e09, 499, /* ḉ Ḉ */ - 0x1e0b, 499, /* ḋ Ḋ */ - 0x1e0d, 499, /* ḍ Ḍ */ - 0x1e0f, 499, /* ḏ Ḏ */ - 0x1e11, 499, /* ḑ Ḑ */ - 0x1e13, 499, /* ḓ Ḓ */ - 0x1e15, 499, /* ḕ Ḕ */ - 0x1e17, 499, /* ḗ Ḗ */ - 0x1e19, 499, /* ḙ Ḙ */ - 0x1e1b, 499, /* ḛ Ḛ */ - 0x1e1d, 499, /* ḝ Ḝ */ - 0x1e1f, 499, /* ḟ Ḟ */ - 0x1e21, 499, /* ḡ Ḡ */ - 0x1e23, 499, /* ḣ Ḣ */ - 0x1e25, 499, /* ḥ Ḥ */ - 0x1e27, 499, /* ḧ Ḧ */ - 0x1e29, 499, /* ḩ Ḩ */ - 0x1e2b, 499, /* ḫ Ḫ */ - 0x1e2d, 499, /* ḭ Ḭ */ - 0x1e2f, 499, /* ḯ Ḯ */ - 0x1e31, 499, /* ḱ Ḱ */ - 0x1e33, 499, /* ḳ Ḳ */ - 0x1e35, 499, /* ḵ Ḵ */ - 0x1e37, 499, /* ḷ Ḷ */ - 0x1e39, 499, /* ḹ Ḹ */ - 0x1e3b, 499, /* ḻ Ḻ */ - 0x1e3d, 499, /* ḽ Ḽ */ - 0x1e3f, 499, /* ḿ Ḿ */ - 0x1e41, 499, /* ṁ Ṁ */ - 0x1e43, 499, /* ṃ Ṃ */ - 0x1e45, 499, /* ṅ Ṅ */ - 0x1e47, 499, /* ṇ Ṇ */ - 0x1e49, 499, /* ṉ Ṉ */ - 0x1e4b, 499, /* ṋ Ṋ */ - 0x1e4d, 499, /* ṍ Ṍ */ - 0x1e4f, 499, /* ṏ Ṏ */ - 0x1e51, 499, /* ṑ Ṑ */ - 0x1e53, 499, /* ṓ Ṓ */ - 0x1e55, 499, /* ṕ Ṕ */ - 
0x1e57, 499, /* ṗ Ṗ */ - 0x1e59, 499, /* ṙ Ṙ */ - 0x1e5b, 499, /* ṛ Ṛ */ - 0x1e5d, 499, /* ṝ Ṝ */ - 0x1e5f, 499, /* ṟ Ṟ */ - 0x1e61, 499, /* ṡ Ṡ */ - 0x1e63, 499, /* ṣ Ṣ */ - 0x1e65, 499, /* ṥ Ṥ */ - 0x1e67, 499, /* ṧ Ṧ */ - 0x1e69, 499, /* ṩ Ṩ */ - 0x1e6b, 499, /* ṫ Ṫ */ - 0x1e6d, 499, /* ṭ Ṭ */ - 0x1e6f, 499, /* ṯ Ṯ */ - 0x1e71, 499, /* ṱ Ṱ */ - 0x1e73, 499, /* ṳ Ṳ */ - 0x1e75, 499, /* ṵ Ṵ */ - 0x1e77, 499, /* ṷ Ṷ */ - 0x1e79, 499, /* ṹ Ṹ */ - 0x1e7b, 499, /* ṻ Ṻ */ - 0x1e7d, 499, /* ṽ Ṽ */ - 0x1e7f, 499, /* ṿ Ṿ */ - 0x1e81, 499, /* ẁ Ẁ */ - 0x1e83, 499, /* ẃ Ẃ */ - 0x1e85, 499, /* ẅ Ẅ */ - 0x1e87, 499, /* ẇ Ẇ */ - 0x1e89, 499, /* ẉ Ẉ */ - 0x1e8b, 499, /* ẋ Ẋ */ - 0x1e8d, 499, /* ẍ Ẍ */ - 0x1e8f, 499, /* ẏ Ẏ */ - 0x1e91, 499, /* ẑ Ẑ */ - 0x1e93, 499, /* ẓ Ẓ */ - 0x1e95, 499, /* ẕ Ẕ */ - 0x1ea1, 499, /* ạ Ạ */ - 0x1ea3, 499, /* ả Ả */ - 0x1ea5, 499, /* ấ Ấ */ - 0x1ea7, 499, /* ầ Ầ */ - 0x1ea9, 499, /* ẩ Ẩ */ - 0x1eab, 499, /* ẫ Ẫ */ - 0x1ead, 499, /* ậ Ậ */ - 0x1eaf, 499, /* ắ Ắ */ - 0x1eb1, 499, /* ằ Ằ */ - 0x1eb3, 499, /* ẳ Ẳ */ - 0x1eb5, 499, /* ẵ Ẵ */ - 0x1eb7, 499, /* ặ Ặ */ - 0x1eb9, 499, /* ẹ Ẹ */ - 0x1ebb, 499, /* ẻ Ẻ */ - 0x1ebd, 499, /* ẽ Ẽ */ - 0x1ebf, 499, /* ế Ế */ - 0x1ec1, 499, /* ề Ề */ - 0x1ec3, 499, /* ể Ể */ - 0x1ec5, 499, /* ễ Ễ */ - 0x1ec7, 499, /* ệ Ệ */ - 0x1ec9, 499, /* ỉ Ỉ */ - 0x1ecb, 499, /* ị Ị */ - 0x1ecd, 499, /* ọ Ọ */ - 0x1ecf, 499, /* ỏ Ỏ */ - 0x1ed1, 499, /* ố Ố */ - 0x1ed3, 499, /* ồ Ồ */ - 0x1ed5, 499, /* ổ Ổ */ - 0x1ed7, 499, /* ỗ Ỗ */ - 0x1ed9, 499, /* ộ Ộ */ - 0x1edb, 499, /* ớ Ớ */ - 0x1edd, 499, /* ờ Ờ */ - 0x1edf, 499, /* ở Ở */ - 0x1ee1, 499, /* ỡ Ỡ */ - 0x1ee3, 499, /* ợ Ợ */ - 0x1ee5, 499, /* ụ Ụ */ - 0x1ee7, 499, /* ủ Ủ */ - 0x1ee9, 499, /* ứ Ứ */ - 0x1eeb, 499, /* ừ Ừ */ - 0x1eed, 499, /* ử Ử */ - 0x1eef, 499, /* ữ Ữ */ - 0x1ef1, 499, /* ự Ự */ - 0x1ef3, 499, /* ỳ Ỳ */ - 0x1ef5, 499, /* ỵ Ỵ */ - 0x1ef7, 499, /* ỷ Ỷ */ - 0x1ef9, 499, /* ỹ Ỹ */ - 0x1f51, 508, /* ὑ Ὑ */ - 0x1f53, 508, /* ὓ Ὓ */ - 0x1f55, 508, /* ὕ Ὕ */ - 
0x1f57, 508, /* ὗ Ὗ */ - 0x1fb3, 509, /* ᾳ ᾼ */ - 0x1fc3, 509, /* ῃ ῌ */ - 0x1fe5, 507, /* ῥ Ῥ */ - 0x1ff3, 509, /* ῳ ῼ */ -}; - -static Rune __isdigitr[] = { - 0x0030, 0x0039, - 0x0660, 0x0669, - 0x06f0, 0x06f9, - 0x07c0, 0x07c9, - 0x0966, 0x096f, - 0x09e6, 0x09ef, - 0x0a66, 0x0a6f, - 0x0ae6, 0x0aef, - 0x0b66, 0x0b6f, - 0x0be6, 0x0bef, - 0x0c66, 0x0c6f, - 0x0ce6, 0x0cef, - 0x0d66, 0x0d6f, - 0x0e50, 0x0e59, - 0x0ed0, 0x0ed9, - 0x0f20, 0x0f29, - 0x1040, 0x1049, - 0x17e0, 0x17e9, - 0x1810, 0x1819, - 0x1946, 0x194f, - 0x19d0, 0x19d9, - 0x1b50, 0x1b59, - 0xff10, 0xff19, - 0x104a0, 0x104a9, - 0x1d7ce, 0x1d7ff, -}; - -/* - * upper case ranges - * 3rd col is conversion excess 500 - */ -static -Rune _tolower2[] = -{ - 0x0041, 0x005a, 532, /* A-Z a-z */ - 0x00c0, 0x00d6, 532, /* À-Ö à-ö */ - 0x00d8, 0x00de, 532, /* Ø-Þ ø-þ */ - 0x0189, 0x018a, 705, /* Ɖ-Ɗ ɖ-ɗ */ - 0x018e, 0x018f, 702, /* Ǝ-Ə ɘ-ə */ - 0x01b1, 0x01b2, 717, /* Ʊ-Ʋ ʊ-ʋ */ - 0x0388, 0x038a, 537, /* Έ-Ί έ-ί */ - 0x038e, 0x038f, 563, /* Ύ-Ώ ύ-ώ */ - 0x0391, 0x03a1, 532, /* Α-Ρ α-ρ */ - 0x03a3, 0x03ab, 532, /* Σ-Ϋ σ-ϋ */ - 0x0401, 0x040c, 580, /* Ё-Ќ ё-ќ */ - 0x040e, 0x040f, 580, /* Ў-Џ ў-џ */ - 0x0410, 0x042f, 532, /* А-Я а-я */ - 0x0531, 0x0556, 548, /* Ա-Ֆ ա-ֆ */ - 0x10a0, 0x10c5, 548, /* Ⴀ-Ⴥ ა-ჵ */ - 0x1f08, 0x1f0f, 492, /* Ἀ-Ἇ ἀ-ἇ */ - 0x1f18, 0x1f1d, 492, /* Ἐ-Ἕ ἐ-ἕ */ - 0x1f28, 0x1f2f, 492, /* Ἠ-Ἧ ἠ-ἧ */ - 0x1f38, 0x1f3f, 492, /* Ἰ-Ἷ ἰ-ἷ */ - 0x1f48, 0x1f4d, 492, /* Ὀ-Ὅ ὀ-ὅ */ - 0x1f68, 0x1f6f, 492, /* Ὠ-Ὧ ὠ-ὧ */ - 0x1f88, 0x1f8f, 492, /* ᾈ-ᾏ ᾀ-ᾇ */ - 0x1f98, 0x1f9f, 492, /* ᾘ-ᾟ ᾐ-ᾗ */ - 0x1fa8, 0x1faf, 492, /* ᾨ-ᾯ ᾠ-ᾧ */ - 0x1fb8, 0x1fb9, 492, /* Ᾰ-Ᾱ ᾰ-ᾱ */ - 0x1fba, 0x1fbb, 426, /* Ὰ-Ά ὰ-ά */ - 0x1fc8, 0x1fcb, 414, /* Ὲ-Ή ὲ-ή */ - 0x1fd8, 0x1fd9, 492, /* Ῐ-Ῑ ῐ-ῑ */ - 0x1fda, 0x1fdb, 400, /* Ὶ-Ί ὶ-ί */ - 0x1fe8, 0x1fe9, 492, /* Ῠ-Ῡ ῠ-ῡ */ - 0x1fea, 0x1feb, 388, /* Ὺ-Ύ ὺ-ύ */ - 0x1ff8, 0x1ff9, 372, /* Ὸ-Ό ὸ-ό */ - 0x1ffa, 0x1ffb, 374, /* Ὼ-Ώ ὼ-ώ */ - 0x2160, 0x216f, 516, /* Ⅰ-Ⅿ ⅰ-ⅿ */ - 0x24b6, 0x24cf, 526, 
/* Ⓐ-Ⓩ ⓐ-ⓩ */ - 0xff21, 0xff3a, 532, /* A-Z a-z */ -}; - -/* - * upper case singlets - * 2nd col is conversion excess 500 - */ -static -Rune _tolower1[] = -{ - 0x0100, 501, /* Ā ā */ - 0x0102, 501, /* Ă ă */ - 0x0104, 501, /* Ą ą */ - 0x0106, 501, /* Ć ć */ - 0x0108, 501, /* Ĉ ĉ */ - 0x010a, 501, /* Ċ ċ */ - 0x010c, 501, /* Č č */ - 0x010e, 501, /* Ď ď */ - 0x0110, 501, /* Đ đ */ - 0x0112, 501, /* Ē ē */ - 0x0114, 501, /* Ĕ ĕ */ - 0x0116, 501, /* Ė ė */ - 0x0118, 501, /* Ę ę */ - 0x011a, 501, /* Ě ě */ - 0x011c, 501, /* Ĝ ĝ */ - 0x011e, 501, /* Ğ ğ */ - 0x0120, 501, /* Ġ ġ */ - 0x0122, 501, /* Ģ ģ */ - 0x0124, 501, /* Ĥ ĥ */ - 0x0126, 501, /* Ħ ħ */ - 0x0128, 501, /* Ĩ ĩ */ - 0x012a, 501, /* Ī ī */ - 0x012c, 501, /* Ĭ ĭ */ - 0x012e, 501, /* Į į */ - 0x0130, 301, /* İ i */ - 0x0132, 501, /* IJ ij */ - 0x0134, 501, /* Ĵ ĵ */ - 0x0136, 501, /* Ķ ķ */ - 0x0139, 501, /* Ĺ ĺ */ - 0x013b, 501, /* Ļ ļ */ - 0x013d, 501, /* Ľ ľ */ - 0x013f, 501, /* Ŀ ŀ */ - 0x0141, 501, /* Ł ł */ - 0x0143, 501, /* Ń ń */ - 0x0145, 501, /* Ņ ņ */ - 0x0147, 501, /* Ň ň */ - 0x014a, 501, /* Ŋ ŋ */ - 0x014c, 501, /* Ō ō */ - 0x014e, 501, /* Ŏ ŏ */ - 0x0150, 501, /* Ő ő */ - 0x0152, 501, /* Œ œ */ - 0x0154, 501, /* Ŕ ŕ */ - 0x0156, 501, /* Ŗ ŗ */ - 0x0158, 501, /* Ř ř */ - 0x015a, 501, /* Ś ś */ - 0x015c, 501, /* Ŝ ŝ */ - 0x015e, 501, /* Ş ş */ - 0x0160, 501, /* Š š */ - 0x0162, 501, /* Ţ ţ */ - 0x0164, 501, /* Ť ť */ - 0x0166, 501, /* Ŧ ŧ */ - 0x0168, 501, /* Ũ ũ */ - 0x016a, 501, /* Ū ū */ - 0x016c, 501, /* Ŭ ŭ */ - 0x016e, 501, /* Ů ů */ - 0x0170, 501, /* Ű ű */ - 0x0172, 501, /* Ų ų */ - 0x0174, 501, /* Ŵ ŵ */ - 0x0176, 501, /* Ŷ ŷ */ - 0x0178, 379, /* Ÿ ÿ */ - 0x0179, 501, /* Ź ź */ - 0x017b, 501, /* Ż ż */ - 0x017d, 501, /* Ž ž */ - 0x0181, 710, /* Ɓ ɓ */ - 0x0182, 501, /* Ƃ ƃ */ - 0x0184, 501, /* Ƅ ƅ */ - 0x0186, 706, /* Ɔ ɔ */ - 0x0187, 501, /* Ƈ ƈ */ - 0x018b, 501, /* Ƌ ƌ */ - 0x0190, 703, /* Ɛ ɛ */ - 0x0191, 501, /* Ƒ ƒ */ - 0x0193, 705, /* Ɠ ɠ */ - 0x0194, 707, /* Ɣ ɣ */ - 0x0196, 711, 
/* Ɩ ɩ */ - 0x0197, 709, /* Ɨ ɨ */ - 0x0198, 501, /* Ƙ ƙ */ - 0x019c, 711, /* Ɯ ɯ */ - 0x019d, 713, /* Ɲ ɲ */ - 0x01a0, 501, /* Ơ ơ */ - 0x01a2, 501, /* Ƣ ƣ */ - 0x01a4, 501, /* Ƥ ƥ */ - 0x01a7, 501, /* Ƨ ƨ */ - 0x01a9, 718, /* Ʃ ʃ */ - 0x01ac, 501, /* Ƭ ƭ */ - 0x01ae, 718, /* Ʈ ʈ */ - 0x01af, 501, /* Ư ư */ - 0x01b3, 501, /* Ƴ ƴ */ - 0x01b5, 501, /* Ƶ ƶ */ - 0x01b7, 719, /* Ʒ ʒ */ - 0x01b8, 501, /* Ƹ ƹ */ - 0x01bc, 501, /* Ƽ ƽ */ - 0x01c4, 502, /* DŽ dž */ - 0x01c5, 501, /* Dž dž */ - 0x01c7, 502, /* LJ lj */ - 0x01c8, 501, /* Lj lj */ - 0x01ca, 502, /* NJ nj */ - 0x01cb, 501, /* Nj nj */ - 0x01cd, 501, /* Ǎ ǎ */ - 0x01cf, 501, /* Ǐ ǐ */ - 0x01d1, 501, /* Ǒ ǒ */ - 0x01d3, 501, /* Ǔ ǔ */ - 0x01d5, 501, /* Ǖ ǖ */ - 0x01d7, 501, /* Ǘ ǘ */ - 0x01d9, 501, /* Ǚ ǚ */ - 0x01db, 501, /* Ǜ ǜ */ - 0x01de, 501, /* Ǟ ǟ */ - 0x01e0, 501, /* Ǡ ǡ */ - 0x01e2, 501, /* Ǣ ǣ */ - 0x01e4, 501, /* Ǥ ǥ */ - 0x01e6, 501, /* Ǧ ǧ */ - 0x01e8, 501, /* Ǩ ǩ */ - 0x01ea, 501, /* Ǫ ǫ */ - 0x01ec, 501, /* Ǭ ǭ */ - 0x01ee, 501, /* Ǯ ǯ */ - 0x01f1, 502, /* DZ dz */ - 0x01f2, 501, /* Dz dz */ - 0x01f4, 501, /* Ǵ ǵ */ - 0x01fa, 501, /* Ǻ ǻ */ - 0x01fc, 501, /* Ǽ ǽ */ - 0x01fe, 501, /* Ǿ ǿ */ - 0x0200, 501, /* Ȁ ȁ */ - 0x0202, 501, /* Ȃ ȃ */ - 0x0204, 501, /* Ȅ ȅ */ - 0x0206, 501, /* Ȇ ȇ */ - 0x0208, 501, /* Ȉ ȉ */ - 0x020a, 501, /* Ȋ ȋ */ - 0x020c, 501, /* Ȍ ȍ */ - 0x020e, 501, /* Ȏ ȏ */ - 0x0210, 501, /* Ȑ ȑ */ - 0x0212, 501, /* Ȓ ȓ */ - 0x0214, 501, /* Ȕ ȕ */ - 0x0216, 501, /* Ȗ ȗ */ - 0x0386, 538, /* Ά ά */ - 0x038c, 564, /* Ό ό */ - 0x03e2, 501, /* Ϣ ϣ */ - 0x03e4, 501, /* Ϥ ϥ */ - 0x03e6, 501, /* Ϧ ϧ */ - 0x03e8, 501, /* Ϩ ϩ */ - 0x03ea, 501, /* Ϫ ϫ */ - 0x03ec, 501, /* Ϭ ϭ */ - 0x03ee, 501, /* Ϯ ϯ */ - 0x0460, 501, /* Ѡ ѡ */ - 0x0462, 501, /* Ѣ ѣ */ - 0x0464, 501, /* Ѥ ѥ */ - 0x0466, 501, /* Ѧ ѧ */ - 0x0468, 501, /* Ѩ ѩ */ - 0x046a, 501, /* Ѫ ѫ */ - 0x046c, 501, /* Ѭ ѭ */ - 0x046e, 501, /* Ѯ ѯ */ - 0x0470, 501, /* Ѱ ѱ */ - 0x0472, 501, /* Ѳ ѳ */ - 0x0474, 501, /* Ѵ ѵ */ - 0x0476, 501, /* Ѷ ѷ 
*/ - 0x0478, 501, /* Ѹ ѹ */ - 0x047a, 501, /* Ѻ ѻ */ - 0x047c, 501, /* Ѽ ѽ */ - 0x047e, 501, /* Ѿ ѿ */ - 0x0480, 501, /* Ҁ ҁ */ - 0x0490, 501, /* Ґ ґ */ - 0x0492, 501, /* Ғ ғ */ - 0x0494, 501, /* Ҕ ҕ */ - 0x0496, 501, /* Җ җ */ - 0x0498, 501, /* Ҙ ҙ */ - 0x049a, 501, /* Қ қ */ - 0x049c, 501, /* Ҝ ҝ */ - 0x049e, 501, /* Ҟ ҟ */ - 0x04a0, 501, /* Ҡ ҡ */ - 0x04a2, 501, /* Ң ң */ - 0x04a4, 501, /* Ҥ ҥ */ - 0x04a6, 501, /* Ҧ ҧ */ - 0x04a8, 501, /* Ҩ ҩ */ - 0x04aa, 501, /* Ҫ ҫ */ - 0x04ac, 501, /* Ҭ ҭ */ - 0x04ae, 501, /* Ү ү */ - 0x04b0, 501, /* Ұ ұ */ - 0x04b2, 501, /* Ҳ ҳ */ - 0x04b4, 501, /* Ҵ ҵ */ - 0x04b6, 501, /* Ҷ ҷ */ - 0x04b8, 501, /* Ҹ ҹ */ - 0x04ba, 501, /* Һ һ */ - 0x04bc, 501, /* Ҽ ҽ */ - 0x04be, 501, /* Ҿ ҿ */ - 0x04c1, 501, /* Ӂ ӂ */ - 0x04c3, 501, /* Ӄ ӄ */ - 0x04c7, 501, /* Ӈ ӈ */ - 0x04cb, 501, /* Ӌ ӌ */ - 0x04d0, 501, /* Ӑ ӑ */ - 0x04d2, 501, /* Ӓ ӓ */ - 0x04d4, 501, /* Ӕ ӕ */ - 0x04d6, 501, /* Ӗ ӗ */ - 0x04d8, 501, /* Ә ә */ - 0x04da, 501, /* Ӛ ӛ */ - 0x04dc, 501, /* Ӝ ӝ */ - 0x04de, 501, /* Ӟ ӟ */ - 0x04e0, 501, /* Ӡ ӡ */ - 0x04e2, 501, /* Ӣ ӣ */ - 0x04e4, 501, /* Ӥ ӥ */ - 0x04e6, 501, /* Ӧ ӧ */ - 0x04e8, 501, /* Ө ө */ - 0x04ea, 501, /* Ӫ ӫ */ - 0x04ee, 501, /* Ӯ ӯ */ - 0x04f0, 501, /* Ӱ ӱ */ - 0x04f2, 501, /* Ӳ ӳ */ - 0x04f4, 501, /* Ӵ ӵ */ - 0x04f8, 501, /* Ӹ ӹ */ - 0x1e00, 501, /* Ḁ ḁ */ - 0x1e02, 501, /* Ḃ ḃ */ - 0x1e04, 501, /* Ḅ ḅ */ - 0x1e06, 501, /* Ḇ ḇ */ - 0x1e08, 501, /* Ḉ ḉ */ - 0x1e0a, 501, /* Ḋ ḋ */ - 0x1e0c, 501, /* Ḍ ḍ */ - 0x1e0e, 501, /* Ḏ ḏ */ - 0x1e10, 501, /* Ḑ ḑ */ - 0x1e12, 501, /* Ḓ ḓ */ - 0x1e14, 501, /* Ḕ ḕ */ - 0x1e16, 501, /* Ḗ ḗ */ - 0x1e18, 501, /* Ḙ ḙ */ - 0x1e1a, 501, /* Ḛ ḛ */ - 0x1e1c, 501, /* Ḝ ḝ */ - 0x1e1e, 501, /* Ḟ ḟ */ - 0x1e20, 501, /* Ḡ ḡ */ - 0x1e22, 501, /* Ḣ ḣ */ - 0x1e24, 501, /* Ḥ ḥ */ - 0x1e26, 501, /* Ḧ ḧ */ - 0x1e28, 501, /* Ḩ ḩ */ - 0x1e2a, 501, /* Ḫ ḫ */ - 0x1e2c, 501, /* Ḭ ḭ */ - 0x1e2e, 501, /* Ḯ ḯ */ - 0x1e30, 501, /* Ḱ ḱ */ - 0x1e32, 501, /* Ḳ ḳ */ - 0x1e34, 501, /* Ḵ ḵ */ - 0x1e36, 501, /* Ḷ ḷ 
*/ - 0x1e38, 501, /* Ḹ ḹ */ - 0x1e3a, 501, /* Ḻ ḻ */ - 0x1e3c, 501, /* Ḽ ḽ */ - 0x1e3e, 501, /* Ḿ ḿ */ - 0x1e40, 501, /* Ṁ ṁ */ - 0x1e42, 501, /* Ṃ ṃ */ - 0x1e44, 501, /* Ṅ ṅ */ - 0x1e46, 501, /* Ṇ ṇ */ - 0x1e48, 501, /* Ṉ ṉ */ - 0x1e4a, 501, /* Ṋ ṋ */ - 0x1e4c, 501, /* Ṍ ṍ */ - 0x1e4e, 501, /* Ṏ ṏ */ - 0x1e50, 501, /* Ṑ ṑ */ - 0x1e52, 501, /* Ṓ ṓ */ - 0x1e54, 501, /* Ṕ ṕ */ - 0x1e56, 501, /* Ṗ ṗ */ - 0x1e58, 501, /* Ṙ ṙ */ - 0x1e5a, 501, /* Ṛ ṛ */ - 0x1e5c, 501, /* Ṝ ṝ */ - 0x1e5e, 501, /* Ṟ ṟ */ - 0x1e60, 501, /* Ṡ ṡ */ - 0x1e62, 501, /* Ṣ ṣ */ - 0x1e64, 501, /* Ṥ ṥ */ - 0x1e66, 501, /* Ṧ ṧ */ - 0x1e68, 501, /* Ṩ ṩ */ - 0x1e6a, 501, /* Ṫ ṫ */ - 0x1e6c, 501, /* Ṭ ṭ */ - 0x1e6e, 501, /* Ṯ ṯ */ - 0x1e70, 501, /* Ṱ ṱ */ - 0x1e72, 501, /* Ṳ ṳ */ - 0x1e74, 501, /* Ṵ ṵ */ - 0x1e76, 501, /* Ṷ ṷ */ - 0x1e78, 501, /* Ṹ ṹ */ - 0x1e7a, 501, /* Ṻ ṻ */ - 0x1e7c, 501, /* Ṽ ṽ */ - 0x1e7e, 501, /* Ṿ ṿ */ - 0x1e80, 501, /* Ẁ ẁ */ - 0x1e82, 501, /* Ẃ ẃ */ - 0x1e84, 501, /* Ẅ ẅ */ - 0x1e86, 501, /* Ẇ ẇ */ - 0x1e88, 501, /* Ẉ ẉ */ - 0x1e8a, 501, /* Ẋ ẋ */ - 0x1e8c, 501, /* Ẍ ẍ */ - 0x1e8e, 501, /* Ẏ ẏ */ - 0x1e90, 501, /* Ẑ ẑ */ - 0x1e92, 501, /* Ẓ ẓ */ - 0x1e94, 501, /* Ẕ ẕ */ - 0x1ea0, 501, /* Ạ ạ */ - 0x1ea2, 501, /* Ả ả */ - 0x1ea4, 501, /* Ấ ấ */ - 0x1ea6, 501, /* Ầ ầ */ - 0x1ea8, 501, /* Ẩ ẩ */ - 0x1eaa, 501, /* Ẫ ẫ */ - 0x1eac, 501, /* Ậ ậ */ - 0x1eae, 501, /* Ắ ắ */ - 0x1eb0, 501, /* Ằ ằ */ - 0x1eb2, 501, /* Ẳ ẳ */ - 0x1eb4, 501, /* Ẵ ẵ */ - 0x1eb6, 501, /* Ặ ặ */ - 0x1eb8, 501, /* Ẹ ẹ */ - 0x1eba, 501, /* Ẻ ẻ */ - 0x1ebc, 501, /* Ẽ ẽ */ - 0x1ebe, 501, /* Ế ế */ - 0x1ec0, 501, /* Ề ề */ - 0x1ec2, 501, /* Ể ể */ - 0x1ec4, 501, /* Ễ ễ */ - 0x1ec6, 501, /* Ệ ệ */ - 0x1ec8, 501, /* Ỉ ỉ */ - 0x1eca, 501, /* Ị ị */ - 0x1ecc, 501, /* Ọ ọ */ - 0x1ece, 501, /* Ỏ ỏ */ - 0x1ed0, 501, /* Ố ố */ - 0x1ed2, 501, /* Ồ ồ */ - 0x1ed4, 501, /* Ổ ổ */ - 0x1ed6, 501, /* Ỗ ỗ */ - 0x1ed8, 501, /* Ộ ộ */ - 0x1eda, 501, /* Ớ ớ */ - 0x1edc, 501, /* Ờ ờ */ - 0x1ede, 501, /* Ở ở */ - 0x1ee0, 501, /* Ỡ ỡ 
*/ - 0x1ee2, 501, /* Ợ ợ */ - 0x1ee4, 501, /* Ụ ụ */ - 0x1ee6, 501, /* Ủ ủ */ - 0x1ee8, 501, /* Ứ ứ */ - 0x1eea, 501, /* Ừ ừ */ - 0x1eec, 501, /* Ử ử */ - 0x1eee, 501, /* Ữ ữ */ - 0x1ef0, 501, /* Ự ự */ - 0x1ef2, 501, /* Ỳ ỳ */ - 0x1ef4, 501, /* Ỵ ỵ */ - 0x1ef6, 501, /* Ỷ ỷ */ - 0x1ef8, 501, /* Ỹ ỹ */ - 0x1f59, 492, /* Ὑ ὑ */ - 0x1f5b, 492, /* Ὓ ὓ */ - 0x1f5d, 492, /* Ὕ ὕ */ - 0x1f5f, 492, /* Ὗ ὗ */ - 0x1fbc, 491, /* ᾼ ᾳ */ - 0x1fcc, 491, /* ῌ ῃ */ - 0x1fec, 493, /* Ῥ ῥ */ - 0x1ffc, 491, /* ῼ ῳ */ -}; - -/* - * title characters are those between - * upper and lower case. ie DZ Dz dz - */ -static -Rune _totitle1[] = -{ - 0x01c4, 501, /* DŽ Dž */ - 0x01c6, 499, /* dž Dž */ - 0x01c7, 501, /* LJ Lj */ - 0x01c9, 499, /* lj Lj */ - 0x01ca, 501, /* NJ Nj */ - 0x01cc, 499, /* nj Nj */ - 0x01f1, 501, /* DZ Dz */ - 0x01f3, 499, /* dz Dz */ -}; - -static -Rune* -bsearch(Rune c, Rune *t, int n, int ne) -{ - Rune *p; - int m; - - while(n > 1) { - m = n/2; - p = t + m*ne; - if(c >= p[0]) { - t = p; - n = n-m; - } else - n = m; - } - if(n && c >= t[0]) - return t; - return 0; -} - -Rune -tolowerrune(Rune c) -{ - Rune *p; - - p = bsearch(c, _tolower2, nelem(_tolower2)/3, 3); - if(p && c >= p[0] && c <= p[1]) - return c + p[2] - 500; - p = bsearch(c, _tolower1, nelem(_tolower1)/2, 2); - if(p && c == p[0]) - return c + p[1] - 500; - return c; -} - -Rune -toupperrune(Rune c) -{ - Rune *p; - - p = bsearch(c, _toupper2, nelem(_toupper2)/3, 3); - if(p && c >= p[0] && c <= p[1]) - return c + p[2] - 500; - p = bsearch(c, _toupper1, nelem(_toupper1)/2, 2); - if(p && c == p[0]) - return c + p[1] - 500; - return c; -} - -Rune -totitlerune(Rune c) -{ - Rune *p; - - p = bsearch(c, _totitle1, nelem(_totitle1)/2, 2); - if(p && c == p[0]) - return c + p[1] - 500; - return c; -} - -int -islowerrune(Rune c) -{ - Rune *p; - - p = bsearch(c, _toupper2, nelem(_toupper2)/3, 3); - if(p && c >= p[0] && c <= p[1]) - return 1; - p = bsearch(c, _toupper1, nelem(_toupper1)/2, 2); - if(p && c == p[0]) - 
return 1; - return 0; -} - -int -isupperrune(Rune c) -{ - Rune *p; - - p = bsearch(c, _tolower2, nelem(_tolower2)/3, 3); - if(p && c >= p[0] && c <= p[1]) - return 1; - p = bsearch(c, _tolower1, nelem(_tolower1)/2, 2); - if(p && c == p[0]) - return 1; - return 0; -} - -int -isalpharune(Rune c) -{ - Rune *p; - - if(isupperrune(c) || islowerrune(c)) - return 1; - p = bsearch(c, _alpha2, nelem(_alpha2)/2, 2); - if(p && c >= p[0] && c <= p[1]) - return 1; - p = bsearch(c, _alpha1, nelem(_alpha1), 1); - if(p && c == p[0]) - return 1; - return 0; -} - -int -istitlerune(Rune c) -{ - return isupperrune(c) && islowerrune(c); -} - -int -isspacerune(Rune c) -{ - Rune *p; - - p = bsearch(c, _space2, nelem(_space2)/2, 2); - if(p && c >= p[0] && c <= p[1]) - return 1; - return 0; -} - -int -isdigitrune(Rune c) -{ - Rune *p; - - p = bsearch(c, __isdigitr, nelem(__isdigitr)/2, 2); - if(p && c >= p[0] && c <= p[1]) - return 1; - return 0; -} --- a//sys/src/libc/test/mkfile +++ b//sys/src/libc/test/mkfile @@ -3,6 +3,8 @@ TEST=\ date\ pow\ + runebreak\ + runenorm\ strchr\ +#include +#include + +static int +estrtoul(char *s) +{ + char *epr; + Rune code; + + code = strtoul(s, &epr, 16); + if(s == epr) + sysfatal("bad code point hex string"); + return code; +} + +void +run(char *file, Rune* (*fn)(Rune*)) +{ + Biobuf *b; + char *p, *dot; + char *pieces[16]; + int i, j, n; + Rune stack[16], ops[16]; + int nstack, nops; + Rune r, *rp, *rp2; + char *line; + + b = Bopen(file, OREAD); + if(b == nil) + sysfatal("could not load composition exclusions: %r"); + + for(;(p = Brdline(b, '\n')) != nil; free(line)){ + p[Blinelen(b)-1] = 0; + line = strdup(p); + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "#")) != nil) + *dot = 0; + n = getfields(p, pieces, nelem(pieces), 0, " "); + nstack = nops = 0; + for(i = 0; i < n; i++){ + chartorune(&r, pieces[i]); + if(r != L'÷' && r != L'×'){ + r = estrtoul(pieces[i]); + stack[nstack++] = r; + stack[nstack] = 0; + } else { + ops[nops++] = r; 
+ ops[nops] = 0; + } + } + + rp = stack; + for(i = 1; i < nops-1;){ + rp2 = fn(rp); + switch(ops[i]){ + case L'÷': + if(rp2 != rp+1){ + print("break fail %X %X || %s\n", rp[0], rp[1], line); + goto Break; + } + rp++; + i++; + break; + case L'×': + if(rp2 - rp == 0){ + for(j = i; j < nops - 1; j++) + if(ops[j] != L'×') + print("skipped %d %d %s\n", i, nops, line); + goto Break; + } + for(; rp < (rp2-1); rp++, i++){ + if(ops[i] != L'×') + print("skipped %d %d %s\n", i, nops, line); + } + rp = rp2; + i++; + break; + } + } +Break: + ; + } +} + +void +main(int, char) +{ + run("/lib/ucd/GraphemeBreakTest.txt", runegbreak); + run("/lib/ucd/WordBreakTest.txt", runewbreak); + exits(nil); +} --- /dev/null +++ b//sys/src/libc/test/runenorm.c @@ -1,0 +1,92 @@ +#include +#include +#include + +static int +estrtoul(char *s) +{ + char *epr; + Rune code; + + code = strtoul(s, &epr, 16); + if(s == epr) + sysfatal("bad code point hex string"); + return code; +} + +void +main(int, char) +{ + Rune buffer1[64]; + Rune buffer2[64]; + char utfbuff1[128]; + char utfbuff2[128]; + char srctmp[128], tmp1[128], tmp2[128]; + char *fields[10]; + char *runes[32]; + char *p; + int n, n2; + int i; + uint fail; + Biobuf *b; + + b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD); + if(b == nil) + sysfatal("could not load composition exclusions: %r"); + + struct { + Rune src[32]; + Rune nfc[32]; + Rune nfd[32]; + } test; + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#' || p[0] == '@') + continue; + getfields(p, fields, 6 + 1, 0, ";"); + n = getfields(fields[0], runes, nelem(runes), 0, " "); + for(i = 0; i < n; i++) + test.src[i] = estrtoul(runes[i]); + test.src[i] = 0; + + n = getfields(fields[1], runes, nelem(runes), 0, " "); + for(i = 0; i < n; i++) + test.nfc[i] = estrtoul(runes[i]); + test.nfc[i] = 0; + + n = getfields(fields[2], runes, nelem(runes), 0, " "); + for(i = 0; i < n; i++) + test.nfd[i] = estrtoul(runes[i]); + test.nfd[i] = 0; + + n = 
runenorm(buffer1, test.src, nelem(buffer1), 1); + n2 = runenorm(buffer2, test.src, nelem(buffer2), 0); + fail = 0; + + if(runestrcmp(buffer1, test.nfc) != 0) + fail |= 1<<0; + if(runestrcmp(buffer2, test.nfd) != 0) + fail |= 1<<1; + if(fail) + print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2, buffer1); + assert(n == runestrlen(test.nfc)); + assert(n2 == runestrlen(test.nfd)); + + snprint(srctmp, sizeof tmp1, "%S", test.src); + snprint(tmp1, sizeof tmp1, "%S", test.nfc); + snprint(tmp2, sizeof tmp2, "%S", test.nfd); + + n = utfnorm(utfbuff1, srctmp, nelem(utfbuff1), 1); + n2 = utfnorm(utfbuff2, srctmp, nelem(utfbuff2), 0); + + if(strcmp(utfbuff1, tmp1) != 0) + fail |= 1<<2; + if(strcmp(utfbuff2, tmp2) != 0) + fail |= 1<<3; + if(fail) + print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2, utfbuff1); + assert(n == strlen(tmp1)); + assert(n2 == strlen(tmp2)); + } + exits(nil); +}