RUNECOMP(2) RUNECOMP(2) NAME runecomp, runedecomp, fullrunenorm, runegbreak, runewbreak, utfcomp, utfdecomp, fullutfnorm, utfgbreak, utfwbreak - multi-rune graphemes SYNOPSIS #include <u.h> #include <libc.h> int runecomp(Rune *dst, Rune *src, int max) int runedecomp(Rune *dst, Rune *src, int max) Rune* fullrunenorm(Rune *s, int n) Rune* runegbreak(Rune *s) Rune* runewbreak(Rune *s) int utfcomp(char *dst, char *src, int max) int utfdecomp(char *dst, char *src, int max) char* fullutfnorm(char *s, int n) char* utfgbreak(char *s) char* utfwbreak(char *s) DESCRIPTION These routines help in handling graphemes that may span multiple runes. Runecomp, runedecomp, utfcomp, and utfdecomp perform Unicode® normalization on src, storing the result in dst. No more than max elements will be written, and the resulting string will always be null terminated. The return value is always the total number of elements required to store the transformation. If this value is larger than the supplied max, the caller can assume the result has been truncated. Runecomp and utfcomp perform NFC normalization while runedecomp and utfdecomp perform NFD normalization. Fullrunenorm and fullutfnorm determine if enough elements are present in s to perform normalization. If enough are present, a pointer is returned to the first element that begins the next context. Otherwise s is returned. No more than n elements will be read. Runegbreak and utfgbreak search s for the next grapheme break opportunity. If none are found before the end of the string, s is returned. Runewbreak and utfwbreak search s for the next word break opportunity. If none are found before the end of the string, s is returned. SOURCE /sys/src/libc/port/mkrunetype.c /sys/src/libc/port/runenorm.c /sys/src/libc/port/runebreak.c SEE ALSO Unicode® Standard Annex #15 Unicode® Standard Annex #29 rune(2), utf(6), tcs(1)
Fri Mar 24 09:32:37 EDT 2023
JOiDoeivtOWFq+mBkyTvvIjnrKzkuIDniYjvvIlieSDkuIvljYjkuInngrnljYoKCjIwMjAuOS4yOO+8jOS4pOS6uuato+W8j+W8gOWni+WQjOahjOOAguS4pOS6uuWQjOahjOebtOWIsDIwMjAuMTAuMTnjgIIKCuS6i+WunuS4iuS4pOS6uuiupOivhuaYr+eUseS6juWcqOS/oeaBr+mYn+mbhuiureOAguesrOS4gOWkqTE144CBMzDjgIE0NuS4ieS6uuWOu+aJvum7hOafkOmcnuS4reWNiOivt+WBh+WOu+acuuaIv+WBmumimO+8jOivt+WBh+adoeWunumZheS4iuaYrzMw5YaZ55qE44CC5L2G5ZCO5p2lMTXnlLHkuo7lpKrllpzmrKLkuK3ljYjnnaHop4nkuobvvIzmiYDku6XlsLHmsqHmnInlho3ljrvkuobjgIIxNemCo+S4gOWkqei/mOmXruS6hjMw77yM6ZeuNDbnmoRpZOaYr+S7gOS5iOOAguS9huaYrzMw5oeS5b6X55CGMTXvvIzmiYDku6XlsLHlj6sxNeiHquW3seWOu+mXrjQ244CC77yI5aW977yM5beu54K55b+Y5LqG6L+Z5pys5Lic6KW/5Y+r6IOh6K+05YWr6YGT77yM5YGP6aKY5LqG77yJCgrlkIzmoYzpgqPkvJrlhL/vvIzmgbDpgKJDU1DliJ3otZvvvIwxNeW3ruWNiuWIhui/h+S9huaYrzMw6L+H57q/5LqG44CCMTXor7Toh6rlt7E2N+WIhu+8jOeEtuWQjjMw6ams5LiK6Lez6LW35p2l6K+05ZOH5aGe5L2g5aW95by65ZWK5q+UNDbpq5jkuoY45YiG77yB54S25ZCO5bCx56qc6L+H5Y675ZGK6K+JNDbkuobvvIHvvIjmnInmhI/mgJ3lkJfvvIzosIHmg7PlkYror4k0NuWViu+8iQoK5pyJ5LiA6IqC57u85ZCI6K++77yM6ICB5biI6YCa6L+H5oq95omR5YWL54mM55qE5pa55byP5Yaz5a6a5YiG57uE44CC6YKj5aSpMzDmir3liLDkuobkuIDlvKDpu5HmoYM544CC6ICB5biI6K+05LiN6IO95pKV6L+Z5Liq5aSn5Z6L55qE5omR5YWL54mM77yM5LiN6IO95Zyo5LiK6Z2i5YaZ5YaZ55S755S777yM5LiN6IO95oqY5Y+g5a6D44CC5LqO5pivMzDmiormiZHlhYvniYzlkIPkuobvvIznhLblkI7lkYror4nkuoYxNe+8jOi/mOivtOi/meagt+W5tuS4jei/neazleOAguWboOS4uuS7luayoeacieaKmOWPoOOAgeaSleavgeaIluiAheWcqOS4iumdouWGmeWGmeeUu+eUu+OAguS7luWPquaYr+aKiuaJkeWFi+eJjOWQg+S6huOAgu+8iOWGmei/meS4queahOaXtuWAmeecn+eahOeskeW+l+aIkeacieeCueW0qea6g++8jOi/meWwseaYr3d4ZuivtOeahOKAnOWGmeW+l+iuqeS6uuS4gOeci+WwseefpemBk+aYr3Rh4oCd55qE5paH56ug5ZCX77yM5oeC5LqG77yJCgrmnInkuIDlpKnliY3lkI7lt6blj7Plh6DkurrorqjorrrnlJ/ml6XvvIwzMOivtOiHquW3seaYrzA45bm0OeaciDTml6XnlJ/ml6XnmoTvvIwxNeWQrOWujOaDiuWPueS6huS4gOWjsO+8jOeEtuWQjuivtOS6huS4gOWPpeKAlOKAlArigJzkvaDnlJ/ml6Xlkoznjovlv4Plh4zlsLHlt67kuIDlpKnor7bvvIHigJ0KMzDvvJrvvJ8K77yIb2ggYmFieeaDheivneWkmuivtOS4gOeCue+8ie+8iOS4jeaYr++8iQrlhbblrp7lkI7mnaUxNeiAgeaYr+iusOmUmTMw55Sf5pel5bCx5piv
5Zug5Li6546L5b+D5YeM77yI5LiN5piv77yJ44CCCuWFtuWunuS5i+aJgOS7pTE15Lya6L+Z5LmI6K+05piv5Zug5Li65Yia5Yia5p+l5a6M546L5b+D5YeM55Sf5pel44CCCgrpgZPlvrfkuI7ms5Xmsrvor77vvIzogIHluIjorqnlkIzmoYzorqjorrrmnIDnvqHmhZXmlofkuK3kuKTkvY3kurrnianvvIhta3MmZWdz77yJ55qE5ZOq5pa56Z2i55qE5Y+L6LCK77yM54S25ZCOMzDot58xNeivtO+8jOacgOe+oeaFleKAnGVnc+e7meS6iG1rc+W+iOWkmumSseKAneOAgu+8iOWlveWQp++8jOS7luWcqOmihOiogDE1NDblhavkuIvliafmg4XlkJfvvJ/vvIkKCuWOhuWPsuivvu+8jOiusueZvuWutuS6iem4o++8jOivvuWJjeivu+S5pu+8jDMw5LiOMTXnibnliKvlnLDlpKflo7DvvIzlvojpu5jlpZHlnLDmiormiYDmnInlkI3or43pg73lv7XmiJDigJzlpKflpYbigJ3vvIznhLblkI7ooqszOOiusOWQjeS6hu+8jOWXr++8jOS4gOi1t+iusOeahOOAggoK5pWw5a2m6K++77yM6buE5p+Q6Zye6KeB5LqM5Lq65bm25LiN6K6k55yf5ZCs6K++5Y+q5Zyo5Y+R5ZGG77yM6K+077ya4oCc6K+25L2g5Lus5a2m5L+h5oGv55qE5piv5LiN5piv5bCx5piv5Zac5qyi5oqK5LiA5aCG56yU5pGK5Zyo5qGM6Z2i54S25ZCO5ZWl5Lmf5LiN5bmy5ZWK77yf4oCd54S25ZCO5Lik5Lq66Lqr5ZCO55qENDflub3lub3lnLDmnaXkuobkuIDlj6XvvJrigJx6YXPourrmnqrkuobjgILigJ0KCuacieS4gOWkqeWNiOS8ke+8jDQ35pCeMzDnmoTmpIXlrZDvvIzlm6DkuLrljYjkvJHnmoTml7blgJkzMOWOu+S6huacuuaIv++8jOS4jeWcqOOAgueEtuWQjjQ35bCx5oqK5ZyG6KeE5o+S5YiwMzDluqfkvY3nmoTkuIvovrnvvIjorrDlvpfliJ3kuIDml7blgJnnmoTmpIXlrZDlkJfvvJ/mnInnvJ3nmoTvvIzliJrlpb3lj6/ku6XloZ7lvpfkuIvlnIbop4TvvInvvIznhLblkI7lgbflgbfot58xNeivtOi/meagt+WPr+S7peW6n+aOiTMw5ZCO5Y2K55Sf44CC44CC44CC5b2T54S2MTXlvojlv6vlsLHor7Tlh7rljrvkuobvvIHvvIjlpb3lkKfvvIzmiJHlv5jkuobmmK8zMOaQnjQ355qE6L+Y5pivNDfmkJ4zMOeahO+8jOWPjeato+W3ruS4jeWkmu+8ge+8iQoK5ZCM5qGM5pyA5ZCO5LiA5aSp77yMMTXnnIvnnYDmlrDnmoTluqfkvY3ooajlj5HlkYbvvIzpg4Hpl7flnLDnlKjpkqLlsLrliK7msLTlo7bvvIzliK7lh7rkuIDloIbloZHmlpnlsZHmnaXvvIjpmaTmraTku6XlpJbvvIzov5jliK7kuobmoYzlrZDjgIHmpIXlrZDvvInjgILnhLblkI4zMOW+iOW0qea6g++8jOaIkeS7rOS5n+S4jeefpemBk+S7luWIsOW6leWcqOW0qea6g+S7gOS5iO+8jOS4gOebtOi3nzE16K+077yM5L2g6IO95LiN6IO95LiN6KaB5Yiu5LqG77yM5L2g6IO95LiN6IO95LiN6KaB5Yiu5LqG44CCMTXnv7vkuobkuKrnmb3nnLzor7TkuI3og73vvIzkvYbmmK/lhbblrp7lgZzmiYvkuobjgILvvIjku47mraTku6XlkI4xNeaDszMw5LqG5bCx5Lya5Yiu5rC05aO244CC77yJCgo55pyI6IezMTHmnIjvvIwz
MOWSjDQ25LiN5YGc5Zyw5py65oOoMTXvvIznhLblkI4zMOi/mOi3keWIsOeUqOaIt+S4vuaKpeS4k+WMuuS4vuaKpTE155qE6L+d6KeE6KGM5Li677yI5YW25a6e6L+d6KeE6KGM5Li65bCx5pivMzDmi78xNeeahOWPt+aQnueahO+8ieOAgjE15ZCO5p2l6LSo6Zeu5piv6LCB5bmy55qE77yMMzDor7TmmK80NuW5sueahO+8jDQ26K+05pivMzDlubLnmoTvvIzkuozkurrlnYfljYHliIbnnJ/mjJrjgILvvIjlhbblrp7ov5nmmK/ku5bku6zkv6nlr7kxNeacgOecn+aMmueahOS4gOasoe+8iOS4jeaYr++8ie+8ieWboOS4uuW9k+aXtjE155u45L+hMzDvvIzmiYDku6XorqTkuLrov5nku7bkuovmg4XkuIDlrprmmK80NuW5sueahOOAgu+8iOi/meaYr+S4gOacrOato+e7j+adkOaWmeWvueWQp+OAgu+8ie+8iOWFtuWunuacieWlveWHoOasoeexu+S8vOeahO+8jOWPquS4jei/h+W5tOS7o+S5hei/nDE15Lmf5b+Y5LqG77yJCgrlkI7mnaXvvIwxNeWWnOasojQ277yI5L2g56Gu5a6a6L+Z5piv6IOh6K+05YWr6YGT77yf5LiN5piv5LiA5pys5q2j57uP77yf77yJ77yM5b6I5b+r5bCx5ZGK6K+J5LqGMzDjgILmiJHor7TkvaDku6zlj6/og73kuI3kv6HigKbigKYxNei3nzMw5ZCM5qGM55qE5pe25YCZ6L+Y5b6I5oOz6LCD6LWw5pyJ5LiA5Liq5aWz5ZCM5qGM77yM5L2G5piv5o2i5bqn5L2N5LqG5Lul5ZCO5Y+I5aW95YOP5pyJ54K55oOz5b+1MzDvvIzmiYDku6XlsLHljrvmib4zMOOAgueEtuWQjjMw5aiB6IOBMTXkuIDlrpropoHor7TvvIwxNeivtHbmiJE1MOaIkeWwseivtO+8iOWOn+ivneW9k+eEtuS4jeaYr+i/meagt+eahOWVpu+8jHbmiJE1MOi/meaYrzIwMjLmiY3mnInnmoTor43or63llabjgILlj43mraMxNeWwsei/meS4quaEj+aAne+8ieOAgjMw5b2T5pe25bCx5oqKNDbnmoTppa3ljaHpgJLov4fmnaXor7TkvaDmi7/ov5nkuKrljaHljrvliLc1MHLigKbigKYxNeiCr+WumuWPjeW6lOi/h+adpeS6hui/meS4jeaYrzMw55qE6aWt5Y2h77yM54S25ZCO5LuU57uG5LiA55yL6L+Z5aW95YOP5pivNDbnmoTigKbigKbnhLblkI7lkI7mnaXlkYror4nkuoYzMOOAgui/meS4quWRiuivieeahOi/h+eoi+S5n+ibruacieaEj+aAne+8mjE155So55y856We56S65oSPMzDlpbnor7TnmoTmmK80Nu+8jOeEtuiAjDMw56ys5LiA5qyh5rKh5pyJZ2V05Yiw77yMMTXor7TkvaDnn6XpgZPkuoblkKfvvIwzMOivtOaIkeS4jeefpemBk+OAgueEtuWQjjE15Y+I5oyH5LqG5LiA5qyh77yMMzDnn6XpgZPkuobvvJsxNeaAlTMw6K6k6ZSZ54S25ZCO6KaBMzDmjIfkuIDkuIvigJzkvaDmjIfkuIDkuIvmiJHnnIvkvaDmnInmsqHmnInorqTplJnjgILigJ3lvZPml7YzMOWSjDQ2562J5Lq65Zyo6K6y5Y+w5LiK6Z2i546p6K6y5Y+w55qE55S16ISR77yM54S25ZCOMzDlsLHpobrlir/mjIfkuobmjIc0NuOAgjE16K+05piv5LuW5ZOI5ZOI5ZOI5ZOI5ZOI5ZOI5LiN6KaB5ZGK6K+J5LuW77yM54S25ZCO6LeR5byA5LqG44CC77yI5oiR5aOw5piO77yMMTXkuI3l
lpzmrKI0NuOAguS9huaYr+mCo+S4quaXtuWAmeWPr+iDveaYr+WWnOasoueahOOAgu+8iQoK54S25ZCO6YKj5aSp5L2T6ZS76K++77yM55Sx5LqO5a2m5Y+355qE5Y6f5Zug4oCm4oCmMTXlj4jlnZAzMOaXgei+ueOAgueEtuWQjjE15b2T5pe25Y+q5piv5oOz6ZqP5L6/5om+MzDorrLor53orrLngrnku4DkuYjvvIjlm6DkuLrpgqPkuKrml7blgJkxNeeci+edgDMw5bCx5oOz6Lef5LuW6K6y6K+d77yM5oiR5Y+R6KqT5piv6L+Z5qC355qE77yM6LefNDbmsqHmnInlhbPns7vvvIzll6/jgILvvInvvIzkuo7mmK/lpbnljrvpmo/lj6Ppl67kuobkuIDlj6XvvJrigJzkvaDmsqHmnInlkYror4nku5blkKfvvJ/igJ3nu5PmnpwzMOaWqemSieaIqumTgeWcsOivtO+8muKAnOaIkeWRiuivieS6huWViu+8geKAnTE177ya77yf77yf77yf6YKj5LuW5oCO5LmI6K+077yfMzDlvZPml7blsLHkuIDohLjlkIPnk5zvvIjvvJ/vvInnmoTmoLflrZDot58xNeivtO+8muKAnOS7luivtOS4jeWPr+iDve+8geKAneeEtuWQjuingTE155aR5oOR77yMMzDlj4jmnaXkuobkuIDlj6XvvJrigJznnJ/nmoTvvIHmiJHor7TkuobkuKTpgY3ku5bpg73or7TkuI3lj6/og73vvIHigJ0xNe+8mu+8n++8iOeUseatpOWPr+ingTMw5Lmf5piv5Liq5Zi05ryP55qE77yM5YW25a6e5pys6LSo5LiK6LefMDjmsqHku4DkuYjljLrliKvjgILvvIkKCjIwMjHlubQx5pyINeaXpe+8jOWPiOWQjuadpe+8jDE15Z2Q5ZyoMzDnmoTlkI7pnaLvvIwxNemCo+S4quaXtuWAmQ==
Fri Mar 24 03:11:11 EDT 2023
RUNECOMP(2) RUNECOMP(2) NAME runecomp, runedecomp, fullrunenorm, runegbreak, runewbreak, utfcomp, utfdecomp, fullutfnorm, utfgbreak, utfwbreak - multi-rune graphemes SYNOPSIS #include <u.h> #include <libc.h> int runecomp(Rune *dst, Rune *src, int max) int runedecomp(Rune *dst, Rune *src, int max) Rune* fullrunenorm(Rune *s, int n) Rune* runegbreak(Rune *s) Rune* runewbreak(Rune *s) int utfcomp(char *dst, char *src, int max) int utfdecomp(char *dst, char *src, int max) char* fullutfnorm(char *s, int n) char* utfgbreak(char *s) char* utfwbreak(char *s) DESCRIPTION These routines help in handling graphemes that may span multiple runes. Runecomp, runedecomp, utfcomp, and utfdecomp perform Unicode® normalization on src, storing the result in dst. No more than max elements will be written, and the resulting string will always be null terminated. The return value is always the total number of elements required to store the transformation. If this value is larger than the supplied max, the caller can assume the result has been truncated. Runecomp and utfcomp perform NFC normalization while runedecomp and utfdecomp perform NFD normalization. Fullrunenorm and fullutfnorm determine if enough elements are present in s to perform normalization. If enough are present, a pointer is returned to the first element that begins the next context. Otherwise s is returned. No more than n elements will be read. Runegbreak and utfgbreak search s for the next grapheme break opportunity. If none are found before the end of the string, s is returned. Runewbreak and utfwbreak search s for the next word break opportunity. If none are found before the end of the string, s is returned. SOURCE /sys/src/libc/port/mkrunetype.c /sys/src/libc/port/runenorm.c /sys/src/libc/port/runebreak.c SEE ALSO Unicode® Standard Annex #15 Unicode® Standard Annex #29 rune(2), utf(6), tcs(1)
Thu Mar 23 21:12:35 EDT 2023
RUNENORM(2) RUNENORM(2) NAME runedecomp, runerecomp, runenorm, utfnorm, fullrunenorm, fullutfnorm, runegbreak, runewbreak - multi-rune graphemes SYNOPSIS #include <u.h> #include <libc.h> void runedecomp(Rune dst[2], Rune src) Rune runerecomp(Rune r[2]) int runenorm(Rune *dst, Rune *src, int max, int compose) int utfnorm(char *dst, char *src, int max, int compose) Rune* fullrunenorm(Rune *s, int n) char* fullutfnorm(char *s, int n) Rune* runegbreak(Rune *r) Rune* runewbreak(Rune *r) DESCRIPTION These routines help in handling graphemes that may span multiple runes. Runedecomp decomposes the rune src and places the two decomposed runes into dst. If no decomposition is found, dst is zeroed. Runerecomp composes the two runes provided in r and returns the result. If no composition is found, 0 is returned. Runenorm (utfnorm) copies the rune (UTF) sequence src to dst while performing Unicode normalization. No more than max runes (bytes) will be copied, and the result is always null terminated. If compose is non-zero, NFC normalization is performed, otherwise NFD normalization is performed. Fullrunenorm (fullutfnorm) determines if enough runes (bytes) are present in s to perform normalization. If enough are present, a pointer is returned to the first rune (byte) that begins the next context, otherwise s is returned. No more than n runes (bytes) will be read. Runegbreak (runewbreak) searches r for the next grapheme (word) break opportunity. If none are found before the end of the string, r is returned. SOURCE /sys/src/libc/port/mkrunetype.c /sys/src/libc/port/runenorm.c /sys/src/libc/port/runebreak.c SEE ALSO Unicode® Standard Annex #15 Unicode® Standard Annex #29 rune(2), utf(6), tcs(1)
Thu Mar 23 14:58:42 EDT 2023
diff 4c1d2d44db117f01d4af59d18d06c5fc0b20f3c1 uncommitted --- a/tree.c +++ b/tree.c @@ -654,7 +654,6 @@ if(d == l) if((i == b->nval-2) || (i >= 2 && copied >= halfsz)){ d = r; - full = 0; spc = Leafspc - (halfsz + Msgmax); getval(b, i, mid); }
Thu Mar 23 08:36:43 EDT 2023
From 8afe25e31e00575ae9e123348bbcf9d9cb948bfd Mon Sep 17 00:00:00 2001 From: Lennart Jablonka <humm@ljabl.com> Date: Thu, 23 Mar 2023 13:30:17 +0100 Subject: [PATCH gridirc] don't require PRIVMSGs to have a colon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When relaying an IRC message to gridchat, everything up to and including the first colon is stripped: PRIVMSG #chat :hello there becomes humm → hello there If a message does not contain whitespace, it need not start in a colon, though: PRIVMSG #chat hello would get relayed as humm → PRIVMSG chat hello --- gridirc.rc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gridirc.rc b/gridirc.rc index a8338cc..d7cfb7a 100755 --- a/gridirc.rc +++ b/gridirc.rc @@ -129,7 +129,9 @@ $1 ~ /^JOIN$/ { $1 ~ /^PRIVMSG$/ { sub(/^#/, "", $2) chan=$2 - sub(/^[^:]*:/, "") + $1 = "" + $2 = "" + sub(/^ *:?/, "") file=sprintf("/n/chat/%s", chan) printf "%s → %s\n", nick, $0 >> file fflush -- 2.39.2
Thu Mar 23 07:12:21 EDT 2023
diff --git a/Makefile b/Makefile index f455219..801b634 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -PROGS=9pex 9gc +PROG=9pex 9gc PREFIX?=/usr/local BIN=${DESTDIR}${PREFIX}/bin CFLAGS?=-g -O2 @@ -17,7 +17,7 @@ default: 9pex 9gc 9gc: ${COMMON_O} ${GC_O} ${CC} -o $@ ${COMMON_O} ${GC_O} -install: progs +install: all install -d ${BIN} install -m 755 ${PROG} ${BIN}
Thu Mar 23 07:11:39 EDT 2023
diff --git a/Makefile b/Makefile index f455219..801b634 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -PROGS=9pex 9gc +PROG=9pex 9gc PREFIX?=/usr/local BIN=${DESTDIR}${PREFIX}/bin CFLAGS?=-g -O2 @@ -17,7 +17,7 @@ default: 9pex 9gc 9gc: ${COMMON_O} ${GC_O} ${CC} -o $@ ${COMMON_O} ${GC_O} -install: progs +install: all install -d ${BIN} install -m 755 ${PROG} ${BIN}
Thu Mar 23 01:17:08 EDT 2023
RUNENORM(2) RUNENORM(2) NAME runedecomp, runerecomp, runenorm, utfnorm, fullrunenorm, fullutfnorm, runegbreak, runewbreak - multi-rune graphemes SYNOPSIS #include <u.h> #include <libc.h> void runedecomp(Rune dst[2], Rune src) Rune runerecomp(Rune r[2]) int runenorm(Rune *dst, Rune *src, int max, int compose) int utfnorm(char *dst, char *src, int max, int compose) Rune* fullrunenorm(Rune *s, int n) char* fullutfnorm(char *s, int n) Rune* runegbreak(Rune *r) Rune* runewbreak(Rune *r) DESCRIPTION These routines help in handling graphemes that may span multiple runes. Runedecomp decomposes the rune src and places the two decomposed runes into dst. If no decomposition is found, dst is zeroed. Runerecomp composes the two runes provided in r and returns the result. If no composition is found, 0 is returned. Runenorm (utfnorm) copies the rune (UTF) sequence src to dst while performing Unicode normalization. No more than max runes (bytes) will be copied, and the result is always null terminated. If compose is non-zero, NFC normalization is performed, otherwise NFD normalization is performed. Fullrunenorm (fullutfnorm) determines if enough runes (bytes) are present in s to perform normalization. If enough are present, a pointer is returned to the first rune (byte) that begins the next context, otherwise s is returned. No more than n runes (bytes) will be read. Runegbreak (runewbreak) searches r for the next grapheme (word) break opportunity. If none are found before the end of the string, r is returned. SOURCE /sys/src/libc/port/mkrunetype.c /sys/src/libc/port/runenorm.c /sys/src/libc/port/runebreak.c SEE ALSO rune(2), utf(6), tcs(1)
Wed Mar 22 21:17:40 EDT 2023
diff b8ae7708fb3ef3acbb30ccf3181897f8157c18de uncommitted --- /dev/null +++ b//lib/ucd/mkfile @@ -1,0 +1,70 @@ +</$objtype/mkfile + +VERSION='15.0.0' +URL='https://www.unicode.org/Public/'$VERSION'/ucd/' + +TXT=\ + ArabicShaping.txt\ + BidiBrackets.txt\ + BidiMirroring.txt\ + BidiTest.txt\ + Blocks.txt\ + CJKRadicals.txt\ + CaseFolding.txt\ + CompositionExclusions.txt\ + DerivedAge.txt\ + DerivedCoreProperties.txt\ + DerivedNormalizationProps.txt\ + EastAsianWidth.txt\ + EmojiSources.txt\ + EquivalentUnifiedIdeograph.txt\ + HangulSyllableType.txt\ + Index.txt\ + IndicPositionalCategory.txt\ + IndicSyllabicCategory.txt\ + Jamo.txt\ + LineBreak.txt\ + NameAliases.txt\ + NamedSequences.txt\ + NamedSequencesProv.txt\ + NamesList.txt\ + NormalizationCorrections.txt\ + NushuSources.txt\ + PropList.txt\ + PropertyAliases.txt\ + PropertyValueAliases.txt\ + ScriptExtensions.txt\ + Scripts.txt\ + SpecialCasing.txt\ + StandardizedVariants.txt\ + TangutSources.txt\ + USourceData.txt\ + UnicodeData.txt\ + VerticalOrientation.txt\ + +TEST=\ + NormalizationTest.txt\ + BidiCharacterTest.txt\ + +PDF=\ + USourceGlyphs.pdf\ + USourceRSChart.pdf\ + +AUX=\ + WordBreakProperty.txt\ + GraphemeBreakProperty.txt\ + +ucd:V: UnicodeData.txt + +%.txt: + hget $URL^$target > $target >[2]/dev/null || hget $URL^'auxiliary/'^$target > $target +%.pdf: + hget $URL^$target > $target + +txt:V: $TXT + +pdf:V: $PDF + +test:V: $TEST + +all:V: $TXT $PDF $TEST --- a//sys/include/libc.h +++ b//sys/include/libc.h @@ -77,6 +77,14 @@ extern long runestrlen(Rune*); extern Rune* runestrstr(Rune*, Rune*); +extern int runenorm(Rune*, Rune*, int, int); +extern int utfnorm(char*,char*,int,int); +extern char* fullutfnorm(char*,int); +extern Rune* fullrunenorm(Rune*,int); + +extern Rune* runewbreak(Rune*); +extern Rune* runegbreak(Rune*); + extern Rune tolowerrune(Rune); extern Rune totitlerune(Rune); extern Rune toupperrune(Rune); @@ -404,7 +412,7 @@ extern int enc16chr(int); extern int encodefmt(Fmt*); -extern void 
exits(char*); +extern _Noreturn void exits(char*); extern double frexp(double, int*); extern uintptr getcallerpc(void*); extern char* getenv(char*); @@ -431,7 +439,7 @@ extern ulong strtoul(char*, char**, int); extern vlong strtoll(char*, char**, int); extern uvlong strtoull(char*, char**, int); -extern void sysfatal(char*, ...); +extern _Noreturn void sysfatal(char*, ...); #pragma varargck argpos sysfatal 1 extern void syslog(int, char*, char*, ...); #pragma varargck argpos syslog 3 @@ -677,7 +685,7 @@ ulong len; } IOchunk; -extern void _exits(char*); +extern _Noreturn void _exits(char*); extern void abort(void); extern int access(char*, int); --- a//sys/src/cmd/tcs/hdr.h +++ b//sys/src/cmd/tcs/hdr.h @@ -23,6 +23,8 @@ void utf_in(int, long *, struct convert *); void utf_out(Rune *, int, long *); +void utfnfc_out(Rune *, int, long *); +void utfnfd_out(Rune *, int, long *); void isoutf_in(int, long *, struct convert *); void isoutf_out(Rune *, int, long *); --- a//sys/src/cmd/tcs/tcs.c +++ b//sys/src/cmd/tcs/tcs.c @@ -613,6 +613,10 @@ { "utf-16be", "alias for unicode-be (MIME)", Func, 0, (Fnptr)unicode_out_be }, { "utf-16le", "alias for unicode-le (MIME)", From|Func, 0, (Fnptr)unicode_in_le }, { "utf-16le", "alias for unicode-le (MIME)", Func, 0, (Fnptr)unicode_out_le }, + { "nfc", "UTF Normalization Form C", From|Func, 0, (Fnptr)utf_in }, + { "nfc", "UTF Normalization Form C", Func, 0, (Fnptr)utfnfc_out }, + { "nfd", "UTF Normalization Form D", From|Func, 0, (Fnptr)utf_in }, + { "nfd", "UTF Normalization Form D", Func, 0, (Fnptr)utfnfd_out }, { "viet1", "Vietnamese VSCII-1 (1993)", Table, (void *)tabviet1 }, { "viet2", "Vietnamese VSCII-2 (1993)", Table, (void *)tabviet2 }, { "vscii", "Vietnamese VISCII 1.1 (1992)", Table, (void *)tabviscii }, --- a//sys/src/cmd/tcs/utf.c +++ b//sys/src/cmd/tcs/utf.c @@ -19,38 +19,27 @@ void utf_in(int fd, long *, struct convert *out) { - char buf[N]; - int i, j, c, n, tot; - unsigned long l; + char buf[N + 1]; + Rune r; + char *p; 
+ int n, tot, j; tot = 0; + j = 0; while((n = read(fd, buf+tot, N-tot)) >= 0){ tot += n; - for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){ - c = our_mbtowc(&l, buf+i, tot-i); - if(c == -1){ - if(squawk) - warn("bad UTF sequence near byte %ld in input", ninput+i); - if(clean){ - i++; - continue; - } - nerrors++; - l = Runeerror; - c = 1; - } - runes[j++] = l; - i += c; + if(fullutfnorm(buf, tot) == buf) + continue; + /* fullutfnorm ensures rune boundary */ + for(p = buf; p < buf + tot;){ + p += chartorune(&r, p); + runes[j++] = r; + runes[j] = 0; } OUT(out, runes, j); - tot -= i; - ninput += i; - if(tot) - memmove(buf, buf+i, tot); - if(n == 0) - break; + j = 0; + tot = 0; } - OUT(out, runes, 0); } void @@ -66,6 +55,26 @@ noutput += p-obuf; if(p > obuf) write(1, obuf, p-obuf); +} + +void +utfnfc_out(Rune *base, int n, long *) +{ + Rune buf[N + 1]; + int w; + + w = runenorm(buf, base, n + 1, 1); + utf_out(buf, w, nil); +} + +void +utfnfd_out(Rune *base, int n, long *) +{ + Rune buf[N + 1]; + int w; + + w = runenorm(buf, base, n + 1, 0); + utf_out(buf, w, nil); } void --- a//sys/src/libc/port/mkfile +++ b//sys/src/libc/port/mkfile @@ -62,6 +62,9 @@ rand.c\ readn.c\ rune.c\ + runebreak.c\ + runeistype.c\ + runenorm.c\ runestrcat.c\ runestrchr.c\ runestrcmp.c\ @@ -74,7 +77,7 @@ runestrrchr.c\ runestrlen.c\ runestrstr.c\ - runetype.c\ + runetotype.c\ sin.c\ sinh.c\ sqrt.c\ @@ -127,3 +130,16 @@ </sys/src/cmd/mksyslib profile.$O: /sys/include/tos.h + +runenorm.$O: runenormdata runenorm.c +runetotype.$O: runetotypedata runetotype.c +runeistype.$O: runeistypedata runeistype.c +runebreak.$O: runebreakdata runebreak.c + +runenormdata runetotypedata runeistypedata runebreakdata: mkrunetype.c + @{ + eval `{grep '^[A-Z]' /$cputype/mkfile} + $CC $CFLAGS -o mkrunetype.$O $prereq + $LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O + $O.mkrunetype + } --- /dev/null +++ b//sys/src/libc/port/mkrunetype.c @@ -1,0 +1,761 @@ +#include <u.h> +#include <libc.h> 
+#include <bio.h> + +enum{ + NRUNES = 1<<21 +}; + +typedef struct Param Param; +typedef struct Lvl Lvl; +struct Lvl{ + int bits; + int max; + int mask; +}; +struct Param{ + Lvl idx1; + Lvl idx2; + Lvl data; + + int round1max; +}; + +static void +derive(Lvl *l) +{ + l->max = 1 << l->bits; + l->mask = l->max - 1; +} + +static void +param(Param *p, int idx1, int idx2) +{ + + assert(idx1 + idx2 < 21); + p->idx1.bits = idx1; + p->idx2.bits = idx2; + p->data.bits = 21 - idx1 - idx2; + derive(&p->idx1); + derive(&p->idx2); + derive(&p->data); + + p->round1max = NRUNES/p->data.max; +} + +static int +lkup(Param *p, int *idx1, int *idx2, int *data, int x) +{ + int y, z; + + y = (((x)>>(p->data.bits+p->idx2.bits))&p->idx1.mask); + z = (((x)>>p->data.bits)&p->idx2.mask); + return data[idx2[idx1[y] + z] + (x&p->data.mask)]; +} + +static int +mkarrvar(int fd, char *name, int *d, int len) +{ + int i, sz; + int max, min; + char *t; + + max = min = 0; + for(i = 0; i < len; i++){ + if(d[i] > max) + max = d[i]; + if(d[i] < min) + min = d[i]; + } + if(min == 0){ + if(max < (uchar)~0) + t = "uchar", sz = 1; + else if(max < 0xFFFF) + t = "ushort", sz = 2; + else + t = "uint", sz = 4; + } else { + if(max < 1<<7) + t = "char", sz = 1; + else if(max < 1<<15) + t = "short", sz = 2; + else + t = "int", sz = 4; + } + if(fd < 0) + return sz * len; + + fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len); + for(i = 0; i < len; i++){ + fprint(fd, "%d,", d[i]); + if((i+1) % 16 == 0) + fprint(fd, "\n\t"); + } + fprint(fd, "\n};\n"); + + return sz * len; +} + +static int +mkexceptarr(int fd, char *name, int *d, int n, int all) +{ + int i; + fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 
3 : 2); + for(i = 0; i < n*3; i += 3){ + if(all && d[i] != 0) + fprint(fd, "{0x%X, 0x%X, 0x%X},", d[i], d[i+1], d[i+2]); + else if(!all) + fprint(fd, "{0x%X, 0x%X},", d[i+1], d[i+2]); + if((i+3) % (8*3) == 0) + fprint(fd, "\n\t"); + } + fprint(fd, "\n};\n"); + return n * sizeof(Rune) * 2; +} + +static int +compact(int *data, int *idx, int nidx, int *src, int chunksize) +{ + int i, n, ndata, best; + int *dot, *lp, *rp; + + dot = src; + ndata = 0; + idx[0] = 0; + for(i = 1; i <= nidx; i++){ + rp = dot + chunksize; + lp = rp - 1; + + for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){ + if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0) + best = n+1; + } + memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]); + ndata += (chunksize - best); + idx[i] = idx[i - 1] + (chunksize - best); + dot = rp; + } + return ndata; +} + + +static int +mklkup(int fd, char *label, int *map, Param *p) +{ + static int data[NRUNES]; + static int idx2[NRUNES]; + static int idx2dest[NRUNES]; + static int idx1[NRUNES]; + int i, nidx2, ndata; + int size; + + ndata = compact(data, idx2, p->round1max, map, p->data.max); + nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max); + + if(fd >= 0){ + for(i = 0; i < NRUNES; i++) + if(map[i] != lkup(p, idx1, idx2dest, data, i)) + sysfatal("mismatch in %s at %d %d %d\n", label, i, map[i], lkup(p, idx1, idx2dest, data, i)); + } + + size = mkarrvar(fd, smprint("_%sdata", label), data, ndata); + size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2); + size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max); + if(fd >= 0){ + fprint(fd, "\n"); + fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask); + fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask); + fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask); + fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] 
)\n\n", + label, label, label, label, label, label, label); + } + return size; +} + +static void +mklkupmatrix(char *label, int *map, Param *p) +{ + int bestsize, size, bestx, besty; + int x, y; + + bestsize = bestx = besty = -1; + for(x = 4; x <= 12; x++) + for(y=4; y <= (19 - x); y++){ + param(p, x, y); + size = mklkup(-1, label, map, p); + if(bestsize == -1 || size < bestsize){ + bestx = x; + besty = y; + bestsize = size; + } + } + + assert(bestsize != -1); + fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize); + param(p, bestx, besty); +} + +static int myismerged[NRUNES]; +static int mytoupper[NRUNES]; +static int mytolower[NRUNES]; +static int mytotitle[NRUNES]; +static int mybreak[NRUNES]; + +enum{ DSTART = 0xEEEE }; +static int mydecomp[NRUNES]; +static int mydespecial[256*3]; +static int nspecial; +static int myccc[NRUNES]; + +typedef struct KV KV; +struct KV{ + uint key; + uint val; + ushort next; +}; + +static KV myrecomp[2000]; +static int nrecomp; + +static int recompext[256*3]; +static int nrecompext; + +static uint +hash(uint x) +{ + x ^= x >> 16; + x *= 0x21f0aaad; + x ^= x >> 15; + x *= 0xd35a2d97; + x ^= x >> 15; + return x; +} + +static void +mkrecomp(int fd) +{ + int i; + KV *p; + static KV vals[512]; + static KV coll[1000]; + int over; + int maxchain; + + for(i = 0; i < nelem(vals); i++) + vals[i] = (KV){0, 0, 0}; + for(i = 0; i < nelem(coll); i++) + coll[i] = (KV){0, 0, 0}; + over = 1; + for(i = 0; i < nrecomp; i++){ + p = vals + (hash(myrecomp[i].key) % nelem(vals)); + maxchain = 0; + while(p->key != 0){ + maxchain++; + if(p->next == 0){ + p->next = over; + p = coll + over - 1; + over++; + } else + p = coll + p->next - 1; + } + p->key = myrecomp[i].key; + p->val = myrecomp[i].val; + } + fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2)); + fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t"); + for(p = vals, i = 0;; i++){ + assert(p->val < 0xFFFF); + assert(p->next < 0xFFFF); + 
fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16)); + if((i+1) % 8 == 0) + fprint(fd, "\n\t"); + + if(p == vals+nelem(vals)-1) + p = coll; + else if(p == coll + over - 2) + break; + else + p++; + } + fprint(fd, "\n};\n"); + fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals)); + /* + fprint(fd, + " x ^= x >> 16;\n" + " x *= 0x21f0aaad;\n" + " x ^= x >> 15;\n" + " x *= 0xd35a2d97;\n" + " x ^= x >> 15;\n" + " p = _recompdata + (x%%%d)*2;\n" + "}\n", nelem(vals)); + */ +} + +static void +mktables(void) +{ + Param p; + int tofd, isfd, normfd, breakfd; + int size; + + tofd = create("runetotypedata", OWRITE, 0664); + if(tofd < 0) + sysfatal("could not create runetotypedata: %r"); + param(&p, 10, 7); + size = mklkup(tofd, "upper", mytoupper, &p); + fprint(2, "%s: %d\n", "upper", size); + + size = mklkup(tofd, "lower", mytolower, &p); + fprint(2, "%s: %d\n", "lower", size); + + size = mklkup(tofd, "title", mytotitle, &p); + fprint(2, "%s: %d\n", "title", size); + close(tofd); + + isfd = create("runeistypedata", OWRITE, 0664); + if(isfd < 0) + sysfatal("could not create runeistypedata: %r"); + param(&p, 11, 6); + size = mklkup(isfd, "merged", myismerged, &p); + fprint(2, "%s: %d\n", "merged", size); + fprint(isfd, "static\nenum {\n"); + fprint(isfd, "\tL%s = %s,\n", "space", "1<<0"); + fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1"); + fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2"); + fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3"); + fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4"); + fprint(isfd, "\tL%s = %s,\n", "title", "1<<5"); + fprint(isfd, "};\n"); + close(isfd); + + normfd = create("runenormdata", OWRITE, 0664); + if(normfd < 0) + sysfatal("could not create runenormdata: %r"); + param(&p, 10, 7); + size = mklkup(normfd, "decomp", mydecomp, &p); + fprint(2, "%s: %d\n", "decomp", size); + + param(&p, 9, 7); + size = mklkup(normfd, "ccc", myccc, &p); + fprint(2, "%s: %d\n", "ccc", size); + + mkexceptarr(normfd, "_decompexceptions", mydespecial, 
nspecial, 0); + mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1); + mkrecomp(normfd); + close(normfd); + + param(&p, 10, 6); + breakfd = create("runebreakdata", OWRITE, 0644); + if(breakfd < 0) + sysfatal("could not create runebreakdata: %r"); + size = mklkup(breakfd, "break", mybreak, &p); + fprint(2, "%s: %d\n", "break", size); +} + +enum { + FIELD_CODE, + FIELD_NAME, + FIELD_CATEGORY, + FIELD_COMBINING, + FIELD_BIDIR, + FIELD_DECOMP, + FIELD_DECIMAL_DIG, + FIELD_DIG, + FIELD_NUMERIC_VAL, + FIELD_MIRRORED, + FIELD_UNICODE_1_NAME, + FIELD_COMMENT, + FIELD_UPPER, + FIELD_LOWER, + FIELD_TITLE, + NFIELDS, +}; + +static int +getunicodeline(Biobuf *in, char **fields) +{ + char *p; + + if((p = Brdline(in, '\n')) == nil) + return 0; + + p[Blinelen(in)-1] = '\0'; + + if (getfields(p, fields, NFIELDS + 1, 0, ";") != NFIELDS) + sysfatal("bad number of fields"); + + return 1; +} + +static int +estrtoul(char *s, int base) +{ + char *epr; + Rune code; + + code = strtoul(s, &epr, base); + if(s == epr) + sysfatal("bad code point hex string"); + return code; +} + +enum { + OTHER, + Hebrew_Letter, Newline, Extend, Format, + Katakana, ALetter, MidLetter, MidNum, + MidNumLet, Numeric, ExtendNumLet, WSegSpace, + PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40, + L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0, + EMOJIEX = 0xB0, +}; + +static void +markbreak(void) +{ + Biobuf *b; + char *p, *dot; + int i, s, e; + uchar v; + + b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD); + if(b == nil) + sysfatal("could not load word breaks: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "..")) != nil){ + *dot = 0; + dot += 2; + s = estrtoul(p, 16); + e = estrtoul(dot, 16); + } else { + s = e = estrtoul(p, 16); + dot = p; + } + v = 0; + if(strstr(dot, "ExtendNumLet") != nil) + v = ExtendNumLet; + else if(strstr(dot, "Hebrew_Letter") != nil) + v = 
Hebrew_Letter;
+		else if(strstr(dot, "Newline") != nil)
+			v = Newline;
+		else if(strstr(dot, "Extend") != nil)
+			v = Extend;
+		else if(strstr(dot, "Format") != nil)
+			v = Format;
+		else if(strstr(dot, "Katakana") != nil)
+			v = Katakana;
+		else if(strstr(dot, "ALetter") != nil)
+			v = ALetter;
+		else if(strstr(dot, "MidLetter") != nil)
+			v = MidLetter;
+		/*
+		 * "MidNum" is a prefix of "MidNumLet", so the longer name
+		 * has to be tested first; otherwise every MidNumLet line is
+		 * recorded as MidNum and the MidNumLet class is never used.
+		 */
+		else if(strstr(dot, "MidNumLet") != nil)
+			v = MidNumLet;
+		else if(strstr(dot, "MidNum") != nil)
+			v = MidNum;
+		else if(strstr(dot, "Numeric") != nil)
+			v = Numeric;
+		else if(strstr(dot, "WSegSpace") != nil)
+			v = WSegSpace;
+		/* the word-break pass assigns; the later passes OR into the same byte */
+		for(i = s; i <= e; i++)
+			mybreak[i] = v;
+	}
+	Bterm(b);
+	b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load Grapheme breaks: %r");
+
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		if((dot = strstr(p, "..")) != nil){
+			*dot = 0;
+			dot += 2;
+			s = estrtoul(p, 16);
+			e = estrtoul(dot, 16);
+		} else {
+			s = e = estrtoul(p, 16);
+			dot = p;
+		}
+		v = 0;
+		/* the "; name #" form keeps short names such as L, V, T from matching longer ones */
+		if(strstr(dot, "; Prepend #") != nil)
+			v = PREPEND;
+		else if(strstr(dot, "; Control #") != nil)
+			v = CONTROL;
+		else if(strstr(dot, "; Extend #") != nil)
+			v = EXTEND;
+		else if(strstr(dot, "; Regional_Indicator #") != nil)
+			v = REGION;
+		else if(strstr(dot, "; SpacingMark #") != nil)
+			v = SPACEMK;
+		else if(strstr(dot, "; L #") != nil)
+			v = L;
+		else if(strstr(dot, "; V #") != nil)
+			v = V;
+		else if(strstr(dot, "; T #") != nil)
+			v = T;
+		else if(strstr(dot, "; LV #") != nil)
+			v = LV;
+		else if(strstr(dot, "; LVT #") != nil)
+			v = LVT;
+		for(i = s; i <= e; i++)
+			mybreak[i] |= v;
+	}
+	Bterm(b);
+
+	b = Bopen("/lib/ucd/emoji-data.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load emoji-data: %r");
+
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		if((dot = strstr(p, "..")) != nil){
+			*dot = 0;
+			dot += 2;
+			s = estrtoul(p, 16);
+			e = estrtoul(dot, 16);
+		} else {
+			s = e = estrtoul(p, 16);
+			dot = p;
+		}
+		v = 0;
+
if(strstr(dot, "; Extended_Pictographic") != nil)
+			v = EMOJIEX;
+		for(i = s; i <= e; i++)
+			mybreak[i] |= v;
+	}
+	Bterm(b);
+}
+
+/*
+ * Apply /lib/ucd/CompositionExclusions.txt: for every code point
+ * listed, zero the matching entry in myrecomp (its val) or, failing
+ * that, in recompext (the first of its three ints).
+ */
+static void
+markexclusions(void)
+{
+	Biobuf *b;
+	char *p;
+	int i;
+	uint x;
+
+	b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load composition exclusions: %r");
+
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		x = estrtoul(p, 16);
+		for(i = 0; i < nrecomp; i++){
+			if(myrecomp[i].val == x){
+				myrecomp[i].val = 0;
+				break;
+			}
+		}
+		/* not among the 16-bit pairs: try the wide exceptions */
+		if(i == nrecomp){
+			for(i = 0; i < nrecompext; i++){
+				if(recompext[i*3] == x){
+					recompext[i*3] = 0;
+					break;
+				}
+			}
+		}
+	}
+	Bterm(b);
+}
+
+/*
+ * Read /lib/ucd/UnicodeData.txt, derive the case, class, decomposition
+ * and combining-class tables, then write them out with mktables.
+ * Arguments are unused; the second parameter is declared char** to
+ * match the usual main signature (it was previously a bare char).
+ */
+void
+main(int, char**)
+{
+	static char myisspace[NRUNES];
+	static char myisalpha[NRUNES];
+	static char myisdigit[NRUNES];
+	static char myisupper[NRUNES];
+	static char myislower[NRUNES];
+	static char myistitle[NRUNES];
+	Biobuf *in;
+	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
+	char *p, *d;
+	int i, code, last;
+	int decomp[2], *ip;
+
+	in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
+	if(in == nil)
+		sysfatal("can't open UnicodeData.txt: %r");
+
+	/* -1 marks "no case mapping defined" until the fix-up pass later in main */
+	for(i = 0; i < NRUNES; i++){
+		mytoupper[i] = -1;
+		mytolower[i] = -1;
+		mytotitle[i] = -1;
+		mydecomp[i] = 0;
+		myccc[i] = 0;
+		mybreak[i] = 0;
+	}
+
+	/* spaces set by hand, in addition to the 'Z' categories handled in the loop below */
+	myisspace['\t'] = 1;
+	myisspace['\n'] = 1;
+	myisspace['\r'] = 1;
+	myisspace['\f'] = 1;
+	myisspace['\v'] = 1;
+	myisspace[0x85] = 1;	/* control char, "next line" */
+	myisspace[0xfeff] = 1;	/* zero-width non-break space */
+
+	last = -1;
+	nspecial = nrecomp = nrecompext = 0;
+	while(getunicodeline(in, fields)){
+		code = estrtoul(fields[FIELD_CODE], 16);
+		if (code >= NRUNES)
+			sysfatal("code-point value too big: %x", code);
+		if(code <= last)
+			sysfatal("bad code sequence: %x then %x", last, code);
+		last = code;
+
+		p = fields[FIELD_CATEGORY];
+		/* a "<..., First>" entry is followed by its "<..., Last>" line */
+		if(strstr(fields[FIELD_NAME], ", First>") != nil){
+			if(!getunicodeline(in, fields2))
+
sysfatal("range start at eof");
+			if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
+				sysfatal("range start not followed by range end");
+			/* last now ends the range; the per-code loop below covers code..last */
+			last = estrtoul(fields2[FIELD_CODE], 16);
+			if(last <= code)
+				sysfatal("range out of sequence: %x then %x", code, last);
+			/* NOTE(review): p and fields still point into the buffer of the previous Brdline; confirm they stay valid after the second read */
+			if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
+				sysfatal("range with mismatched category");
+		}
+
+		/* only decompositions without a "<tag>" are recorded */
+		d = fields[FIELD_DECOMP];
+		if(strlen(d) > 0 && strstr(d, "<") == nil){
+			decomp[0] = estrtoul(d, 16);
+			d = strstr(d, " ");
+			if(d == nil){
+				/* singleton: recorded for decomposition only, never added to the recomposition tables */
+				decomp[1] = 0;
+				if(decomp[0] > 0xFFFF){
+					/* does not fit in 16 bits: store an index (DSTART+n) into mydespecial instead */
+					//fprint(2, "case1 %X %X\n", code, decomp[0]);
+					ip = mydespecial + nspecial*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = 0;
+					mydecomp[code] = (DSTART+nspecial)<<16;
+					nspecial++;
+				} else
+					mydecomp[code] = decomp[0]<<16;
+			} else {
+				d++;
+				decomp[1] = estrtoul(d, 16);
+				if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
+					/* wide pair: goes into both exception arrays as (code, first, second) */
+					//fprint(2, "case2 %X %X %X\n", code, decomp[0], decomp[1]);
+					ip = mydespecial + nspecial*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = decomp[1];
+					mydecomp[code] = (DSTART+nspecial)<<16;
+					nspecial++;
+					ip = recompext + nrecompext*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = decomp[1];
+					nrecompext++;
+				} else {
+					/* both halves fit in 16 bits: pack as first<<16 | second */
+					mydecomp[code] = decomp[0]<<16 | decomp[1];
+					myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0};
+				}
+			}
+		}
+
+		/* apply the category and case fields to every code point of the entry (or range) */
+		for (; code <= last; code++){
+			if(p[0] == 'L')
+				myisalpha[code] = 1;
+			if(p[0] == 'Z')
+				myisspace[code] = 1;
+
+			if(strcmp(p, "Lu") == 0)
+				myisupper[code] = 1;
+			if(strcmp(p, "Ll") == 0)
+				myislower[code] = 1;
+
+			if(strcmp(p, "Lt") == 0)
+				myistitle[code] = 1;
+
+			if(strcmp(p, "Nd") == 0)
+				myisdigit[code] = 1;
+
+			if(fields[FIELD_UPPER][0] != '\0')
+				mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16);
+
+			if(fields[FIELD_LOWER][0] != '\0')
+				mytolower[code] = estrtoul(fields[FIELD_LOWER], 16);
+
+			if(fields[FIELD_TITLE][0] != '\0')
+				mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16);
+
+			/* the combining class is the only decimal field parsed */
+			myccc[code] =
estrtoul(fields[FIELD_COMBINING], 10);
+		}
+	}
+
+	Bterm(in);
+
+	markexclusions();
+
+	/*
+	 * according to standard, if totitle(x) is not defined in ucd
+	 * but toupper(x) is, then totitle is defined to be toupper(x)
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytotitle[i] == -1
+		&& mytoupper[i] != -1
+		&& !myistitle[i])
+			mytotitle[i] = mytoupper[i];
+	}
+
+	/*
+	 * A couple corrections:
+	 * is*(to*(x)) should be true.
+	 * restore undefined transformations.
+	 * store offset instead of value, makes them sparse.
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytoupper[i] != -1)
+			myisupper[mytoupper[i]] = 1;
+		else
+			mytoupper[i] = i;
+
+		if(mytolower[i] != -1)
+			myislower[mytolower[i]] = 1;
+		else
+			mytolower[i] = i;
+
+		if(mytotitle[i] != -1)
+			myistitle[mytotitle[i]] = 1;
+		else
+			mytotitle[i] = i;
+
+		mytoupper[i] = mytoupper[i] - i;
+		mytolower[i] = mytolower[i] - i;
+		mytotitle[i] = mytotitle[i] - i;
+	}
+
+	/* pack the six class flags into one byte per rune; bit order matches the L* enum mktables emits */
+	uchar b;
+	for(i = 0; i < NRUNES; i++){
+		b = 0;
+		if(myisspace[i])
+			b |= 1<<0;
+		if(myisalpha[i])
+			b |= 1<<1;
+		if(myisdigit[i])
+			b |= 1<<2;
+		if(myisupper[i])
+			b |= 1<<3;
+		if(myislower[i])
+			b |= 1<<4;
+		if(myistitle[i])
+			b |= 1<<5;
+
+		myismerged[i] = b;
+	}
+
+	markbreak();
+	mktables();
+	exits(nil);
+}
--- /dev/null
+++ b//sys/src/libc/port/runebreak.c
@@ -1,0 +1,149 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runebreakdata"
+
+/*
+ * Break classes as found in the generated table: the unnumbered
+ * constants are word-break classes (tested with IS), the 0x10-spaced
+ * ones grapheme-break classes (tested with ISG).  The class values
+ * repeat the enum in mkrunetype.c and must be kept identical to it.
+ */
+enum {
+	OTHER,
+	Hebrew_Letter, Newline, Extend, Format,
+	Katakana, ALetter, MidLetter, MidNum,
+	MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+	PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+	L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+	EMOJIEX = 0xB0,
+
+	ZWJ = 0x200DU,
+	LINETAB = 0xB,
+};
+
+/* IS compares the low nibble of a class byte, ISG the high nibble */
+#define IS(x, y) ((x&0xf) == y)
+#define ISG(x, y) ((x&0xf0) == y)
+
+/*
+ * Examine the first two runes of s and return a pointer to the next
+ * grapheme break opportunity, or s itself when the string ends before
+ * one is found.
+ */
+Rune*
+runegbreak(Rune *s)
+{
+	Rune l, r;
+	uchar lt, rt;
+	Rune *p;
+
+	p = s;
+	/* fewer than two runes: nothing to decide */
+	if((l = *p++) == 0)
+		return s;
+	if((r = *p) == 0)
+		return s;
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	if(l ==
'\r' && r == '\n')
+		goto Done;
+	/* always break after and before a control, CR or LF */
+	if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
+		return p;
+	if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
+		return p;
+	/* Hangul syllable sequences stay together */
+	if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
+		goto Done;
+	if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
+		goto Done;
+	/* (LVT | T) x T; the original tested ISG(rt, T) twice */
+	if((ISG(lt, LVT) || ISG(lt, T)) && ISG(rt, T))
+		goto Done;
+	if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
+		goto Done;
+	/* pictographic, any number of extends, then ZWJ + pictographic */
+	if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
+		while(ISG(rt, EXTEND)){
+			p++;
+			if((r = *p) == 0)
+				return s;
+			rt = breaklkup(r);
+		}
+		if(r != ZWJ)
+			return p;
+		p++;
+		if((r = *p) == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, EMOJIEX))
+			goto Done;
+		return p;
+	}
+	if(ISG(rt, EXTEND) || r == ZWJ)
+		goto Done;
+	if(ISG(lt, REGION) && ISG(rt, REGION))
+		goto Done;
+
+	return p;
+
+Done:
+	/* l and r are not split: answer the position after r, or s if r ends the string */
+	if(p[1] == 0)
+		return s;
+	return p + 1;
+}
+
+/* AH: class is ALetter or Hebrew_Letter */
+#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter))
+/*
+ * MNLQ: MidNumLet class or an apostrophe.  The apostrophe must be
+ * compared against the rune itself; the one-argument form compared the
+ * class byte with '\'', which never matches an actual apostrophe.
+ */
+#define MNLQ(t, c) (IS(t, MidNumLet) || (c) == '\'')
+
+/*
+ * Examine the first runes of s and return a pointer to the next word
+ * break opportunity, or s itself when the string ends before one is
+ * found.
+ */
+Rune*
+runewbreak(Rune *s)
+{
+	Rune l, r;
+	uchar lt, rt;
+	Rune *p;
+
+	p = s;
+	if((l = *p++) == 0)
+		return s;
+	if((r = *p) == 0)
+		return s;
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	if(l == '\r' && r == '\n')
+		goto Done;
+	/* break after a newline character ... */
+	if(l == '\r' || l == '\n' || l == LINETAB)
+		return p;
+	/* ... and before one; the original tested l == LINETAB here a second time */
+	if(r == '\r' || r == '\n' || r == LINETAB)
+		return p;
+	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+		goto Done;
+	if(IS(rt, Format) || IS(rt, Extend))
+		goto Done;
+	if(AH(lt)){
+		if(AH(rt))
+			goto Done;
+		/* letter, mid-word punctuation, letter: needs one rune of lookahead */
+		if((IS(rt, MidLetter) || MNLQ(rt, r)) && p[1] != 0 && AH(breaklkup(p[1])))
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '\'')
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter))
+			goto Done;
+		if(IS(rt, Numeric))
+			goto Done;
+	}
+	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+		goto Done;
+	/* digit, separator, digit */
+	if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt, r)) && p[1] != 0 && IS(breaklkup(p[1]), Numeric))
+		goto Done;
+	if(IS(lt,
Katakana) && IS(rt, Katakana))
+		goto Done;
+	/* ExtendNumLet joins letters, digits, Katakana and itself on either side */
+	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+		if(IS(rt, ExtendNumLet))
+			goto Done;
+	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+		goto Done;
+	/* regional indicators pair up, optionally across a ZWJ */
+	if(ISG(lt, REGION)){
+		if(ISG(rt, REGION))
+			goto Done;
+		if(r != ZWJ)
+			return p;
+		p++;
+		if((r = *p) == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, REGION))
+			goto Done;
+	}
+
+	return p;
+
+Done:
+	/* no break before r: answer the position after it, or s if r ends the string */
+	if(p[1] == 0)
+		return s;
+	return p + 1;
+}
--- /dev/null
+++ b//sys/src/libc/port/runeistype.c
@@ -1,0 +1,40 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runeistypedata"
+
+/*
+ * Each predicate below looks c up with mergedlkup and returns 1 when
+ * the corresponding L* bit is set in the result, 0 otherwise.
+ */
+int
+isspacerune(Rune c)
+{
+	return (mergedlkup(c) & Lspace) == Lspace;
+}
+
+int
+isalpharune(Rune c)
+{
+	return (mergedlkup(c) & Lalpha) == Lalpha;
+}
+
+int
+isdigitrune(Rune c)
+{
+	return (mergedlkup(c) & Ldigit) == Ldigit;
+}
+
+int
+isupperrune(Rune c)
+{
+	return (mergedlkup(c) & Lupper) == Lupper;
+}
+
+int
+islowerrune(Rune c)
+{
+	return (mergedlkup(c) & Llower) == Llower;
+}
+
+int
+istitlerune(Rune c)
+{
+	return (mergedlkup(c) & Ltitle) == Ltitle;
+}
--- /dev/null
+++ b//sys/src/libc/port/runenorm.c
@@ -1,0 +1,328 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runenormdata"
+
+//Unicode Standard: Section 3.12 Conjoining Jamo Behavior
+enum {
+	SBase = 0xAC00,
+	LBase = 0x1100,
+	VBase = 0x1161,
+	TBase = 0x11A7,
+
+	LCount = 19,
+	VCount = 21,
+	TCount = 28,
+	NCount = VCount * TCount,
+	SCount = LCount * NCount,
+
+	LLast = LBase + LCount - 1,
+	SLast = SBase + SCount - 1,
+	VLast = VBase + VCount - 1,
+	TLast = TBase + TCount - 1,
+};
+
+/*
+ * Decompose c by one step into dst[0], dst[1].  Hangul syllables are
+ * split arithmetically; everything else goes through decomplkup.
+ * dst[1] is 0 for a one-rune result; both are 0 when the table holds
+ * no decomposition for c.
+ */
+void
+decomposerune(Rune c, Rune dst[2])
+{
+	uint x;
+
+	if(c >= SBase && c <= SLast){
+		c -= SBase;
+		x = c % TCount;
+		if(x){
+			/* syllable with a trailing consonant: the same syllable without it, plus the consonant */
+			dst[0] = SBase + ((c / TCount) * TCount);
+			dst[1] = TBase + x;
+			return;
+		}
+		/* no trailing consonant: leading consonant plus vowel */
+		dst[0] = LBase + (c / NCount);
+		dst[1] = VBase + ((c % NCount) / TCount);
+		return;
+	}
+	x = decomplkup(c);
+	/* (ushort)~0 is the low 16-bit mask; a non-zero low half means a packed pair */
+	if((x & (ushort)~0) !=
0){
+		dst[0] = x>>16;
+		dst[1] = x & (ushort)~0;
+		return;
+	}
+	x >>= 16;
+	/* values in [0xEEEE, 0xF8FF) are indices into _decompexceptions, not code points */
+	if(x >= 0xEEEE && x <0xF8FF){
+		memmove(dst, _decompexceptions[x - 0xEEEE], sizeof(Rune)*2);
+		return;
+	}
+	dst[0] = x;
+	dst[1] = 0;
+}
+
+/*
+ * Compose the pair r[0], r[1] into a single rune; returns 0 when the
+ * pair has no composition.  Hangul is handled arithmetically, pairs
+ * with a member above 0xFFFF by a linear scan of _recompexceptions,
+ * and all other pairs by a hashed lookup in _recompdata with chaining
+ * through _recompcoll.
+ */
+Rune
+composerune(Rune r[2])
+{
+	uint x, y, *p, next;
+
+	if(r[0] >= LBase && r[0] <= LLast){
+		if(r[1] < VBase || r[1] > VLast)
+			return 0;
+		x = (r[0] - LBase) * NCount + (r[1] - VBase) * TCount;
+		return SBase + x;
+	}
+	/* a syllable without trailing consonant may take one */
+	if(r[0] >= SBase && r[0] <= SLast && (r[0] - SBase) % TCount == 0){
+		if(r[1] > TBase && r[1] <= TLast)
+			return r[0] + (r[1] - TBase);
+		return 0;
+	}
+	if(r[0] > (ushort)~0 || r[1] > (ushort)~0){
+		for(x = 0; x < nelem(_recompexceptions); x++)
+			if(r[0] == _recompexceptions[x][1] && r[1] == _recompexceptions[x][2])
+				return _recompexceptions[x][0];
+		return 0;
+	}
+	/* key is first<<16 | second; y keeps it while x is hashed */
+	y = x = r[0]<<16 | r[1];
+	x ^= x >> 16;
+	x *= 0x21f0aaad;
+	x ^= x >> 15;
+	x *= 0xd35a2d97;
+	x ^= x >> 15;
+	/* NOTE(review): 512 is assumed to equal the number of primary slots the generator emitted - confirm against mkrunetype.c */
+	p = _recompdata + (x%512)*2;
+	/* each entry is (key, value | next<<16); next is a 1-based index into _recompcoll, 0 ends the chain */
+	while(p[0] != y){
+		next = p[1]>>16;
+		if(!next)
+			return 0;
+		p = _recompcoll + (next-1)*2;
+	}
+	return p[1] & 0xFFFF;
+}
+
+/* combining class of c, from the generated table */
+int
+runeccc(Rune c)
+{
+	return ccclkup(c);
+}
+
+/*
+ * Put the len runes at a into canonical order: adjacent runes are
+ * exchanged when the first has a higher combining class than the
+ * second and the second's class is not zero.  Repeats until a pass
+ * makes no exchange, so runes of equal class keep their order.
+ */
+void
+runecccsort(Rune *a, int len)
+{
+	Rune r;
+	int i;
+	int fail;
+	int ca, cb;
+
+	do {
+		fail = 0;
+		for(i = 0; i < len - 1; i++){
+			ca = runeccc(a[i]);
+			cb = runeccc(a[i+1]);
+			/*
+			 * "ca > cb > 0" parses as "(ca > cb) > 0", which drops the
+			 * cb > 0 test and lets a class-0 rune be moved.
+			 */
+			if(cb > 0 && ca > cb){
+				r = a[i];
+				a[i] = a[i+1];
+				a[i + 1] = r;
+				fail = 1;
+			}
+		}
+	} while(fail);
+}
+
+/*
+ * Decide whether the n bytes at s hold a complete normalization
+ * context.  Returns a pointer to the start of the next context when
+ * they do and s when more input is needed.
+ */
+char*
+fullutfnorm(char *s, int n)
+{
+	Rune r, peek;
+	char *p, *p2;
+
+	p = s;
+	if(fullrune(p, n) == 0)
+		return s;
+
+	p += chartorune(&r, p);
+	n -= (p - s);
+
+	if((r >= LBase && r <= LLast) || (r >= SBase && r <= SLast)){
+		/* Hangul: consume following vowels and trailing consonants */
+		do {
+			if(fullrune(p, n) == 0)
+				return s;
+			p2 = p + chartorune(&peek, p);
+			n -= (p2 - p);
+			p = p2;
+		/* n > 0 must guard both range tests; && binds tighter than || */
+		} while(n > 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast)));
+		if(n <= 0)
+			return s;
+		return p;
+	}
+
+	/* otherwise the context ends at the next rune of combining class 0 */
+	do {
+		if(fullrune(p, n) == 0)
+			return s;
+		p2 = p + chartorune(&peek, p);
+		n -= (p2 - p);
+		p = p2;
+		if(runeccc(peek) == 0)
+			return p;
+
} while(n > 0);
+
+	return s;
+}
+
+/*
+ * Rune-string counterpart of fullutfnorm: decide whether the n runes
+ * at r hold a complete normalization context.  Returns a pointer to
+ * the start of the next context when they do and r when more input is
+ * needed.  Never reads at or beyond r + n.
+ */
+Rune*
+fullrunenorm(Rune *r, int n)
+{
+	Rune *e, *p;
+
+	/* nothing may be read from an empty window */
+	if(n <= 0)
+		return r;
+
+	p = r;
+	e = p + n;
+
+	if((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast)){
+		p++;
+		/*
+		 * p < e must guard both range tests; without the extra
+		 * parentheses the second test read *p past the end.
+		 */
+		while(p < e && ((*p >= VBase && *p <= VLast) || (*p > TBase && *p <= TLast)))
+			p++;
+
+		if(p >= e)
+			return r;
+		return p;
+	}
+
+	/* otherwise the context ends at the next rune of combining class 0 */
+	for(; p < e && p + 1 < e; p++)
+		if(runeccc(p[1]) == 0)
+			return p + 1;
+
+	return r;
+}
+
+/*
+ * Shared worker for runenorm and utfnorm.  With src set it reads runes
+ * from src and writes runes to dst; otherwise it reads UTF from ssrc
+ * and writes UTF to sdst.  Each context is fully decomposed onto a
+ * small stack, put into canonical order, optionally recomposed, and
+ * copied out.  At most max elements including the terminator are
+ * stored; the return value counts every element produced, stored or
+ * not.
+ */
+int
+_runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose)
+{
+	Rune c, r[2], _stack[32];
+	Rune *p, *stack, *sp, *tp;
+	char *strp, *strstop;
+	Rune *rp, *rrp;
+	Rune *stop;
+	Rune peek;
+	int w, w2, size;
+	int mode;
+
+	/* the input not in use points at an empty string so the loop test works for both modes */
+	if(src){
+		mode = 1;
+		p = src;
+		stop = dst + (max - 1);
+		strp = "";
+		strstop = nil;
+	} else {
+		mode = 0;
+		p = L"";
+		stop = nil;
+		strp = ssrc;
+		strstop = sdst + (max - 1);
+	}
+
+	/* the decomposition of the current rune grows downward from the middle (sp), following runes upward (tp) */
+	stack = _stack + nelem(_stack)/2;
+	size = 0;
+	w = w2 = 0;
+	while(*strp || *p){
+		if(mode)
+			c = *p;
+		else
+			w = chartorune(&c, strp);
+
+		sp = stack - 1;
+		tp = stack;
+		/* decompose repeatedly, pushing each second half below the previous one */
+		decomposerune(c, r);
+		while(r[0] != 0){
+			c = r[0];
+			if(r[1] != 0){
+				*sp-- = r[1];
+				if(sp == _stack)
+					break;
+			}
+			decomposerune(c, r);
+		}
+
+		*sp = c;
+		if(mode)
+			peek = p[1];
+		else
+			w2 = chartorune(&peek, strp+w);
+
+		if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <= SLast)){
+			/* Hangul: pull in following vowels and trailing consonants */
+			while(peek != 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast))){
+				*tp++ = peek;
+				if(mode){
+					p++;
+					peek = p[1];
+				} else {
+					strp += w;
+					w = w2;
+					w2 = chartorune(&peek, strp+w);
+				}
+				if(tp == _stack + nelem(_stack))
+					break;
+			}
+		}
+		/* pull in following runes with a non-zero combining class, decomposed one step */
+		while(peek != 0 && runeccc(peek) != 0){
+			decomposerune(peek, r);
+			if(r[1] != 0){
+				if(tp+1 >= _stack + nelem(_stack))
+					break;
+				*tp++ = r[0];
+				*tp++ = r[1];
+			} else if(r[0] != 0)
+				*tp++ = r[0];
+			else
+				*tp++ = peek;
+
+			if(mode){
+				p++;
+				peek = p[1];
+			} else {
+				strp += w;
+				w = w2;
+				w2 = chartorune(&peek, strp+w);
+			}
+			if(tp == _stack + nelem(_stack))
+				break;
+		}
+		runecccsort(sp, tp - sp);
+
+		/* recompose onto the first rune, but only when it has combining class 0 */
+		if(compose && runeccc(*sp) == 0){
+			for(rp = sp + 1; rp < tp; rp++){
+				r[0] = *sp;
+				r[1] = *rp;
+				c = composerune(r);
+				if(c != 0){
+					/* store the composite, shift the runes before rp up one slot and drop the old first slot */
+					*sp = c;
+					for(rrp = rp; rrp > sp; rrp--)
+						*rrp = rrp[-1];
+					sp++;
+				} else while(rp + 1 < tp && runeccc(*rp) == runeccc(*(rp+1)))
+					rp++;
+			}
+		}
+
+		/* copy the context out; size counts every element even when the output is full */
+		for(; sp < tp; sp++){
+			if(mode){
+				if(dst < stop)
+					*dst++ = *sp;
+				size++;
+			} else {
+				w2 = runelen(*sp);
+				if(sdst+w2 < strstop)
+					sdst += runetochar(sdst, sp);
+				size += w2;
+			}
+		}
+		if(mode)
+			p++;
+		else
+			strp += w;
+	}
+	if(mode)
+		*dst = 0;
+	else
+		*sdst = 0;
+	return size;
+}
+
+/* normalize the rune string src into dst; thin wrapper around _runenorm with the UTF arguments nil */
+int
+runenorm(Rune *dst, Rune *src, int max, int compose)
+{
+	return _runenorm(dst, src, nil, nil, max, compose);
+}
+
+/* normalize the UTF string src into dst; thin wrapper around _runenorm with the rune arguments nil */
+int
+utfnorm(char *dst, char *src, int max, int compose)
+{
+	return _runenorm(nil, nil, dst, src, max, compose);
+}
--- /dev/null
+++ b//sys/src/libc/port/runetotype.c
@@ -1,0 +1,22 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runetotypedata"
+
+/*
+ * Case conversions: the generated tables hold the offset from c to
+ * its mapping, so the result is c plus the looked-up value.
+ */
+Rune
+toupperrune(Rune c)
+{
+	return c + upperlkup(c);
+}
+
+Rune
+tolowerrune(Rune c)
+{
+	return c + lowerlkup(c);
+}
+
+Rune
+totitlerune(Rune c)
+{
+	return c + titlelkup(c);
+}
--- a//sys/src/libc/port/runetype.c +++ /dev/null @@ -1,1181 +1,0 @@ -#include <u.h> -#include <libc.h> - -/* - * alpha ranges - - * only covers ranges not in lower||upper - */ -static -Rune _alpha2[] = -{ - 0x00d8, 0x00f6, /* Ø - ö */ - 0x00f8, 0x01f5, /* ø - ǵ */ - 0x0250, 0x02a8, /* ɐ - ʨ */ - 0x038e, 0x03a1, /* Ύ - Ρ */ - 0x03a3, 0x03ce, /* Σ - ώ */ - 0x03d0, 0x03d6, /* ϐ - ϖ */ - 0x03e2, 0x03f3, /* Ϣ - ϳ */ - 0x0490, 0x04c4, /* Ґ - ӄ */ - 0x0561, 0x0587, /* ա - և */ - 0x05d0, 0x05ea, /* א - ת */ - 0x05f0, 0x05f2, /* װ - ײ */ - 0x0621, 0x063a, /* ء - غ */ - 0x0640, 0x064a, /* ـ - ي */ - 0x0671, 0x06b7, /* ٱ - ڷ */ - 0x06ba, 0x06be, /* ں - ھ */ - 0x06c0, 0x06ce, /* ۀ - ێ */ - 0x06d0, 0x06d3, /* ې - ۓ */ - 0x0905, 0x0939, /* अ - ह */ - 0x0958, 0x0961, /* क़ - ॡ */ - 0x0985, 0x098c, /* অ - ঌ */ - 0x098f, 0x0990, /* এ - ঐ */ -
0x0993, 0x09a8, /* ও - ন */ - 0x09aa, 0x09b0, /* প - র */ - 0x09b6, 0x09b9, /* শ - হ */ - 0x09dc, 0x09dd, /* ড় - ঢ় */ - 0x09df, 0x09e1, /* য় - ৡ */ - 0x09f0, 0x09f1, /* ৰ - ৱ */ - 0x0a05, 0x0a0a, /* ਅ - ਊ */ - 0x0a0f, 0x0a10, /* ਏ - ਐ */ - 0x0a13, 0x0a28, /* ਓ - ਨ */ - 0x0a2a, 0x0a30, /* ਪ - ਰ */ - 0x0a32, 0x0a33, /* ਲ - ਲ਼ */ - 0x0a35, 0x0a36, /* ਵ - ਸ਼ */ - 0x0a38, 0x0a39, /* ਸ - ਹ */ - 0x0a59, 0x0a5c, /* ਖ਼ - ੜ */ - 0x0a85, 0x0a8b, /* અ - ઋ */ - 0x0a8f, 0x0a91, /* એ - ઑ */ - 0x0a93, 0x0aa8, /* ઓ - ન */ - 0x0aaa, 0x0ab0, /* પ - ર */ - 0x0ab2, 0x0ab3, /* લ - ળ */ - 0x0ab5, 0x0ab9, /* વ - હ */ - 0x0b05, 0x0b0c, /* ଅ - ଌ */ - 0x0b0f, 0x0b10, /* ଏ - ଐ */ - 0x0b13, 0x0b28, /* ଓ - ନ */ - 0x0b2a, 0x0b30, /* ପ - ର */ - 0x0b32, 0x0b33, /* ଲ - ଳ */ - 0x0b36, 0x0b39, /* ଶ - ହ */ - 0x0b5c, 0x0b5d, /* ଡ଼ - ଢ଼ */ - 0x0b5f, 0x0b61, /* ୟ - ୡ */ - 0x0b85, 0x0b8a, /* அ - ஊ */ - 0x0b8e, 0x0b90, /* எ - ஐ */ - 0x0b92, 0x0b95, /* ஒ - க */ - 0x0b99, 0x0b9a, /* ங - ச */ - 0x0b9e, 0x0b9f, /* ஞ - ட */ - 0x0ba3, 0x0ba4, /* ண - த */ - 0x0ba8, 0x0baa, /* ந - ப */ - 0x0bae, 0x0bb5, /* ம - வ */ - 0x0bb7, 0x0bb9, /* ஷ - ஹ */ - 0x0c05, 0x0c0c, /* అ - ఌ */ - 0x0c0e, 0x0c10, /* ఎ - ఐ */ - 0x0c12, 0x0c28, /* ఒ - న */ - 0x0c2a, 0x0c33, /* ప - ళ */ - 0x0c35, 0x0c39, /* వ - హ */ - 0x0c60, 0x0c61, /* ౠ - ౡ */ - 0x0c85, 0x0c8c, /* ಅ - ಌ */ - 0x0c8e, 0x0c90, /* ಎ - ಐ */ - 0x0c92, 0x0ca8, /* ಒ - ನ */ - 0x0caa, 0x0cb3, /* ಪ - ಳ */ - 0x0cb5, 0x0cb9, /* ವ - ಹ */ - 0x0ce0, 0x0ce1, /* ೠ - ೡ */ - 0x0d05, 0x0d0c, /* അ - ഌ */ - 0x0d0e, 0x0d10, /* എ - ഐ */ - 0x0d12, 0x0d28, /* ഒ - ന */ - 0x0d2a, 0x0d39, /* പ - ഹ */ - 0x0d60, 0x0d61, /* ൠ - ൡ */ - 0x0e01, 0x0e30, /* ก - ะ */ - 0x0e32, 0x0e33, /* า - ำ */ - 0x0e40, 0x0e46, /* เ - ๆ */ - 0x0e5a, 0x0e5b, /* ๚ - ๛ */ - 0x0e81, 0x0e82, /* ກ - ຂ */ - 0x0e87, 0x0e88, /* ງ - ຈ */ - 0x0e94, 0x0e97, /* ດ - ທ */ - 0x0e99, 0x0e9f, /* ນ - ຟ */ - 0x0ea1, 0x0ea3, /* ມ - ຣ */ - 0x0eaa, 0x0eab, /* ສ - ຫ */ - 0x0ead, 0x0eae, /* ອ - ຮ */ - 0x0eb2, 0x0eb3, /* າ - ຳ */ - 0x0ec0, 
0x0ec4, /* ເ - ໄ */ - 0x0edc, 0x0edd, /* ໜ - ໝ */ - 0x0f18, 0x0f19, /* ༘ - ༙ */ - 0x0f40, 0x0f47, /* ཀ - ཇ */ - 0x0f49, 0x0f69, /* ཉ - ཀྵ */ - 0x10d0, 0x10f6, /* ა - ჶ */ - 0x1100, 0x1159, /* ᄀ - ᅙ */ - 0x115f, 0x11a2, /* ᅟ - ᆢ */ - 0x11a8, 0x11f9, /* ᆨ - ᇹ */ - 0x1e00, 0x1e9b, /* Ḁ - ẛ */ - 0x1f50, 0x1f57, /* ὐ - ὗ */ - 0x1f80, 0x1fb4, /* ᾀ - ᾴ */ - 0x1fb6, 0x1fbc, /* ᾶ - ᾼ */ - 0x1fc2, 0x1fc4, /* ῂ - ῄ */ - 0x1fc6, 0x1fcc, /* ῆ - ῌ */ - 0x1fd0, 0x1fd3, /* ῐ - ΐ */ - 0x1fd6, 0x1fdb, /* ῖ - Ί */ - 0x1fe0, 0x1fec, /* ῠ - Ῥ */ - 0x1ff2, 0x1ff4, /* ῲ - ῴ */ - 0x1ff6, 0x1ffc, /* ῶ - ῼ */ - 0x210a, 0x2113, /* ℊ - ℓ */ - 0x2115, 0x211d, /* ℕ - ℝ */ - 0x2120, 0x2122, /* ℠ - ™ */ - 0x212a, 0x2131, /* K - ℱ */ - 0x2133, 0x2138, /* ℳ - ℸ */ - 0x3041, 0x3094, /* ぁ - ゔ */ - 0x30a1, 0x30fa, /* ァ - ヺ */ - 0x3105, 0x312c, /* ㄅ - ㄬ */ - 0x3131, 0x318e, /* ㄱ - ㆎ */ - 0x3192, 0x319f, /* ㆒ - ㆟ */ - 0x3260, 0x327b, /* ㉠ - ㉻ */ - 0x328a, 0x32b0, /* ㊊ - ㊰ */ - 0x32d0, 0x32fe, /* ㋐ - ㋾ */ - 0x3300, 0x3357, /* ㌀ - ㍗ */ - 0x3371, 0x3376, /* ㍱ - ㍶ */ - 0x337b, 0x3394, /* ㍻ - ㎔ */ - 0x3399, 0x339e, /* ㎙ - ㎞ */ - 0x33a9, 0x33ad, /* ㎩ - ㎭ */ - 0x33b0, 0x33c1, /* ㎰ - ㏁ */ - 0x33c3, 0x33c5, /* ㏃ - ㏅ */ - 0x33c7, 0x33d7, /* ㏇ - ㏗ */ - 0x33d9, 0x33dd, /* ㏙ - ㏝ */ - 0x4e00, 0x9fff, /* 一 - 鿿 */ - 0xac00, 0xd7a3, /* 가 - 힣 */ - 0xf900, 0xfb06, /* 豈 - st */ - 0xfb13, 0xfb17, /* ﬓ - ﬗ */ - 0xfb1f, 0xfb28, /* ײַ - ﬨ */ - 0xfb2a, 0xfb36, /* שׁ - זּ */ - 0xfb38, 0xfb3c, /* טּ - לּ */ - 0xfb40, 0xfb41, /* נּ - סּ */ - 0xfb43, 0xfb44, /* ףּ - פּ */ - 0xfb46, 0xfbb1, /* צּ - ﮱ */ - 0xfbd3, 0xfd3d, /* ﯓ - ﴽ */ - 0xfd50, 0xfd8f, /* ﵐ - ﶏ */ - 0xfd92, 0xfdc7, /* ﶒ - ﷇ */ - 0xfdf0, 0xfdf9, /* ﷰ - ﷹ */ - 0xfe70, 0xfe72, /* ﹰ - ﹲ */ - 0xfe76, 0xfefc, /* ﹶ - ﻼ */ - 0xff66, 0xff6f, /* ヲ - ッ */ - 0xff71, 0xff9d, /* ア - ン */ - 0xffa0, 0xffbe, /* ᅠ - ᄒ */ - 0xffc2, 0xffc7, /* ᅡ - ᅦ */ - 0xffca, 0xffcf, /* ᅧ - ᅬ */ - 0xffd2, 0xffd7, /* ᅭ - ᅲ */ - 0xffda, 0xffdc, /* ᅳ - ᅵ */ -}; - -/* - * alpha singlets - - * only covers 
ranges not in lower||upper - */ -static -Rune _alpha1[] = -{ - 0x00aa, /* ª */ - 0x00b5, /* µ */ - 0x00ba, /* º */ - 0x03da, /* Ϛ */ - 0x03dc, /* Ϝ */ - 0x03de, /* Ϟ */ - 0x03e0, /* Ϡ */ - 0x06d5, /* ە */ - 0x09b2, /* ল */ - 0x0a5e, /* ਫ਼ */ - 0x0a8d, /* ઍ */ - 0x0ae0, /* ૠ */ - 0x0b9c, /* ஜ */ - 0x0cde, /* ೞ */ - 0x0e4f, /* ๏ */ - 0x0e84, /* ຄ */ - 0x0e8a, /* ຊ */ - 0x0e8d, /* ຍ */ - 0x0ea5, /* ລ */ - 0x0ea7, /* ວ */ - 0x0eb0, /* ະ */ - 0x0ebd, /* ຽ */ - 0x1fbe, /* ι */ - 0x207f, /* ⁿ */ - 0x20a8, /* ₨ */ - 0x2102, /* ℂ */ - 0x2107, /* ℇ */ - 0x2124, /* ℤ */ - 0x2126, /* Ω */ - 0x2128, /* ℨ */ - 0xfb3e, /* מּ */ - 0xfe74, /* ﹴ */ -}; - -/* - * space ranges - */ -static -Rune _space2[] = -{ - 0x0009, 0x000a, /* tab and newline */ - 0x0020, 0x0020, /* space */ - 0x0085, 0x0085, - 0x00a0, 0x00a0, /* */ - 0x1680, 0x1680, - 0x180e, 0x180e, - 0x2000, 0x200b, /* - */ - 0x2028, 0x2029, /* - */ - 0x202f, 0x202f, - 0x205f, 0x205f, - 0x3000, 0x3000, /* */ - 0xfeff, 0xfeff, /* */ -}; - -/* - * lower case ranges - * 3rd col is conversion excess 500 - */ -static -Rune _toupper2[] = -{ - 0x0061, 0x007a, 468, /* a-z A-Z */ - 0x00e0, 0x00f6, 468, /* à-ö À-Ö */ - 0x00f8, 0x00fe, 468, /* ø-þ Ø-Þ */ - 0x0256, 0x0257, 295, /* ɖ-ɗ Ɖ-Ɗ */ - 0x0258, 0x0259, 298, /* ɘ-ə Ǝ-Ə */ - 0x028a, 0x028b, 283, /* ʊ-ʋ Ʊ-Ʋ */ - 0x03ad, 0x03af, 463, /* έ-ί Έ-Ί */ - 0x03b1, 0x03c1, 468, /* α-ρ Α-Ρ */ - 0x03c3, 0x03cb, 468, /* σ-ϋ Σ-Ϋ */ - 0x03cd, 0x03ce, 437, /* ύ-ώ Ύ-Ώ */ - 0x0430, 0x044f, 468, /* а-я А-Я */ - 0x0451, 0x045c, 420, /* ё-ќ Ё-Ќ */ - 0x045e, 0x045f, 420, /* ў-џ Ў-Џ */ - 0x0561, 0x0586, 452, /* ա-ֆ Ա-Ֆ */ - 0x1f00, 0x1f07, 508, /* ἀ-ἇ Ἀ-Ἇ */ - 0x1f10, 0x1f15, 508, /* ἐ-ἕ Ἐ-Ἕ */ - 0x1f20, 0x1f27, 508, /* ἠ-ἧ Ἠ-Ἧ */ - 0x1f30, 0x1f37, 508, /* ἰ-ἷ Ἰ-Ἷ */ - 0x1f40, 0x1f45, 508, /* ὀ-ὅ Ὀ-Ὅ */ - 0x1f60, 0x1f67, 508, /* ὠ-ὧ Ὠ-Ὧ */ - 0x1f70, 0x1f71, 574, /* ὰ-ά Ὰ-Ά */ - 0x1f72, 0x1f75, 586, /* ὲ-ή Ὲ-Ή */ - 0x1f76, 0x1f77, 600, /* ὶ-ί Ὶ-Ί */ - 0x1f78, 0x1f79, 628, /* ὸ-ό Ὸ-Ό */ - 0x1f7a, 0x1f7b, 612, 
/* ὺ-ύ Ὺ-Ύ */ - 0x1f7c, 0x1f7d, 626, /* ὼ-ώ Ὼ-Ώ */ - 0x1f80, 0x1f87, 508, /* ᾀ-ᾇ ᾈ-ᾏ */ - 0x1f90, 0x1f97, 508, /* ᾐ-ᾗ ᾘ-ᾟ */ - 0x1fa0, 0x1fa7, 508, /* ᾠ-ᾧ ᾨ-ᾯ */ - 0x1fb0, 0x1fb1, 508, /* ᾰ-ᾱ Ᾰ-Ᾱ */ - 0x1fd0, 0x1fd1, 508, /* ῐ-ῑ Ῐ-Ῑ */ - 0x1fe0, 0x1fe1, 508, /* ῠ-ῡ Ῠ-Ῡ */ - 0x2170, 0x217f, 484, /* ⅰ-ⅿ Ⅰ-Ⅿ */ - 0x24d0, 0x24e9, 474, /* ⓐ-ⓩ Ⓐ-Ⓩ */ - 0xff41, 0xff5a, 468, /* a-z A-Z */ -}; - -/* - * lower case singlets - * 2nd col is conversion excess 500 - */ -static -Rune _toupper1[] = -{ - 0x00ff, 621, /* ÿ Ÿ */ - 0x0101, 499, /* ā Ā */ - 0x0103, 499, /* ă Ă */ - 0x0105, 499, /* ą Ą */ - 0x0107, 499, /* ć Ć */ - 0x0109, 499, /* ĉ Ĉ */ - 0x010b, 499, /* ċ Ċ */ - 0x010d, 499, /* č Č */ - 0x010f, 499, /* ď Ď */ - 0x0111, 499, /* đ Đ */ - 0x0113, 499, /* ē Ē */ - 0x0115, 499, /* ĕ Ĕ */ - 0x0117, 499, /* ė Ė */ - 0x0119, 499, /* ę Ę */ - 0x011b, 499, /* ě Ě */ - 0x011d, 499, /* ĝ Ĝ */ - 0x011f, 499, /* ğ Ğ */ - 0x0121, 499, /* ġ Ġ */ - 0x0123, 499, /* ģ Ģ */ - 0x0125, 499, /* ĥ Ĥ */ - 0x0127, 499, /* ħ Ħ */ - 0x0129, 499, /* ĩ Ĩ */ - 0x012b, 499, /* ī Ī */ - 0x012d, 499, /* ĭ Ĭ */ - 0x012f, 499, /* į Į */ - 0x0131, 268, /* ı I */ - 0x0133, 499, /* ij IJ */ - 0x0135, 499, /* ĵ Ĵ */ - 0x0137, 499, /* ķ Ķ */ - 0x013a, 499, /* ĺ Ĺ */ - 0x013c, 499, /* ļ Ļ */ - 0x013e, 499, /* ľ Ľ */ - 0x0140, 499, /* ŀ Ŀ */ - 0x0142, 499, /* ł Ł */ - 0x0144, 499, /* ń Ń */ - 0x0146, 499, /* ņ Ņ */ - 0x0148, 499, /* ň Ň */ - 0x014b, 499, /* ŋ Ŋ */ - 0x014d, 499, /* ō Ō */ - 0x014f, 499, /* ŏ Ŏ */ - 0x0151, 499, /* ő Ő */ - 0x0153, 499, /* œ Œ */ - 0x0155, 499, /* ŕ Ŕ */ - 0x0157, 499, /* ŗ Ŗ */ - 0x0159, 499, /* ř Ř */ - 0x015b, 499, /* ś Ś */ - 0x015d, 499, /* ŝ Ŝ */ - 0x015f, 499, /* ş Ş */ - 0x0161, 499, /* š Š */ - 0x0163, 499, /* ţ Ţ */ - 0x0165, 499, /* ť Ť */ - 0x0167, 499, /* ŧ Ŧ */ - 0x0169, 499, /* ũ Ũ */ - 0x016b, 499, /* ū Ū */ - 0x016d, 499, /* ŭ Ŭ */ - 0x016f, 499, /* ů Ů */ - 0x0171, 499, /* ű Ű */ - 0x0173, 499, /* ų Ų */ - 0x0175, 499, /* ŵ Ŵ */ - 0x0177, 499, /* ŷ Ŷ */ - 
0x017a, 499, /* ź Ź */ - 0x017c, 499, /* ż Ż */ - 0x017e, 499, /* ž Ž */ - 0x017f, 200, /* ſ S */ - 0x0183, 499, /* ƃ Ƃ */ - 0x0185, 499, /* ƅ Ƅ */ - 0x0188, 499, /* ƈ Ƈ */ - 0x018c, 499, /* ƌ Ƌ */ - 0x0192, 499, /* ƒ Ƒ */ - 0x0199, 499, /* ƙ Ƙ */ - 0x01a1, 499, /* ơ Ơ */ - 0x01a3, 499, /* ƣ Ƣ */ - 0x01a5, 499, /* ƥ Ƥ */ - 0x01a8, 499, /* ƨ Ƨ */ - 0x01ad, 499, /* ƭ Ƭ */ - 0x01b0, 499, /* ư Ư */ - 0x01b4, 499, /* ƴ Ƴ */ - 0x01b6, 499, /* ƶ Ƶ */ - 0x01b9, 499, /* ƹ Ƹ */ - 0x01bd, 499, /* ƽ Ƽ */ - 0x01c5, 499, /* Dž DŽ */ - 0x01c6, 498, /* dž DŽ */ - 0x01c8, 499, /* Lj LJ */ - 0x01c9, 498, /* lj LJ */ - 0x01cb, 499, /* Nj NJ */ - 0x01cc, 498, /* nj NJ */ - 0x01ce, 499, /* ǎ Ǎ */ - 0x01d0, 499, /* ǐ Ǐ */ - 0x01d2, 499, /* ǒ Ǒ */ - 0x01d4, 499, /* ǔ Ǔ */ - 0x01d6, 499, /* ǖ Ǖ */ - 0x01d8, 499, /* ǘ Ǘ */ - 0x01da, 499, /* ǚ Ǚ */ - 0x01dc, 499, /* ǜ Ǜ */ - 0x01df, 499, /* ǟ Ǟ */ - 0x01e1, 499, /* ǡ Ǡ */ - 0x01e3, 499, /* ǣ Ǣ */ - 0x01e5, 499, /* ǥ Ǥ */ - 0x01e7, 499, /* ǧ Ǧ */ - 0x01e9, 499, /* ǩ Ǩ */ - 0x01eb, 499, /* ǫ Ǫ */ - 0x01ed, 499, /* ǭ Ǭ */ - 0x01ef, 499, /* ǯ Ǯ */ - 0x01f2, 499, /* Dz DZ */ - 0x01f3, 498, /* dz DZ */ - 0x01f5, 499, /* ǵ Ǵ */ - 0x01fb, 499, /* ǻ Ǻ */ - 0x01fd, 499, /* ǽ Ǽ */ - 0x01ff, 499, /* ǿ Ǿ */ - 0x0201, 499, /* ȁ Ȁ */ - 0x0203, 499, /* ȃ Ȃ */ - 0x0205, 499, /* ȅ Ȅ */ - 0x0207, 499, /* ȇ Ȇ */ - 0x0209, 499, /* ȉ Ȉ */ - 0x020b, 499, /* ȋ Ȋ */ - 0x020d, 499, /* ȍ Ȍ */ - 0x020f, 499, /* ȏ Ȏ */ - 0x0211, 499, /* ȑ Ȑ */ - 0x0213, 499, /* ȓ Ȓ */ - 0x0215, 499, /* ȕ Ȕ */ - 0x0217, 499, /* ȗ Ȗ */ - 0x0253, 290, /* ɓ Ɓ */ - 0x0254, 294, /* ɔ Ɔ */ - 0x025b, 297, /* ɛ Ɛ */ - 0x0260, 295, /* ɠ Ɠ */ - 0x0263, 293, /* ɣ Ɣ */ - 0x0268, 291, /* ɨ Ɨ */ - 0x0269, 289, /* ɩ Ɩ */ - 0x026f, 289, /* ɯ Ɯ */ - 0x0272, 287, /* ɲ Ɲ */ - 0x0283, 282, /* ʃ Ʃ */ - 0x0288, 282, /* ʈ Ʈ */ - 0x0292, 281, /* ʒ Ʒ */ - 0x03ac, 462, /* ά Ά */ - 0x03cc, 436, /* ό Ό */ - 0x03d0, 438, /* ϐ Β */ - 0x03d1, 443, /* ϑ Θ */ - 0x03d5, 453, /* ϕ Φ */ - 0x03d6, 446, /* ϖ Π */ - 0x03e3, 
499, /* ϣ Ϣ */ - 0x03e5, 499, /* ϥ Ϥ */ - 0x03e7, 499, /* ϧ Ϧ */ - 0x03e9, 499, /* ϩ Ϩ */ - 0x03eb, 499, /* ϫ Ϫ */ - 0x03ed, 499, /* ϭ Ϭ */ - 0x03ef, 499, /* ϯ Ϯ */ - 0x03f0, 414, /* ϰ Κ */ - 0x03f1, 420, /* ϱ Ρ */ - 0x0461, 499, /* ѡ Ѡ */ - 0x0463, 499, /* ѣ Ѣ */ - 0x0465, 499, /* ѥ Ѥ */ - 0x0467, 499, /* ѧ Ѧ */ - 0x0469, 499, /* ѩ Ѩ */ - 0x046b, 499, /* ѫ Ѫ */ - 0x046d, 499, /* ѭ Ѭ */ - 0x046f, 499, /* ѯ Ѯ */ - 0x0471, 499, /* ѱ Ѱ */ - 0x0473, 499, /* ѳ Ѳ */ - 0x0475, 499, /* ѵ Ѵ */ - 0x0477, 499, /* ѷ Ѷ */ - 0x0479, 499, /* ѹ Ѹ */ - 0x047b, 499, /* ѻ Ѻ */ - 0x047d, 499, /* ѽ Ѽ */ - 0x047f, 499, /* ѿ Ѿ */ - 0x0481, 499, /* ҁ Ҁ */ - 0x0491, 499, /* ґ Ґ */ - 0x0493, 499, /* ғ Ғ */ - 0x0495, 499, /* ҕ Ҕ */ - 0x0497, 499, /* җ Җ */ - 0x0499, 499, /* ҙ Ҙ */ - 0x049b, 499, /* қ Қ */ - 0x049d, 499, /* ҝ Ҝ */ - 0x049f, 499, /* ҟ Ҟ */ - 0x04a1, 499, /* ҡ Ҡ */ - 0x04a3, 499, /* ң Ң */ - 0x04a5, 499, /* ҥ Ҥ */ - 0x04a7, 499, /* ҧ Ҧ */ - 0x04a9, 499, /* ҩ Ҩ */ - 0x04ab, 499, /* ҫ Ҫ */ - 0x04ad, 499, /* ҭ Ҭ */ - 0x04af, 499, /* ү Ү */ - 0x04b1, 499, /* ұ Ұ */ - 0x04b3, 499, /* ҳ Ҳ */ - 0x04b5, 499, /* ҵ Ҵ */ - 0x04b7, 499, /* ҷ Ҷ */ - 0x04b9, 499, /* ҹ Ҹ */ - 0x04bb, 499, /* һ Һ */ - 0x04bd, 499, /* ҽ Ҽ */ - 0x04bf, 499, /* ҿ Ҿ */ - 0x04c2, 499, /* ӂ Ӂ */ - 0x04c4, 499, /* ӄ Ӄ */ - 0x04c8, 499, /* ӈ Ӈ */ - 0x04cc, 499, /* ӌ Ӌ */ - 0x04d1, 499, /* ӑ Ӑ */ - 0x04d3, 499, /* ӓ Ӓ */ - 0x04d5, 499, /* ӕ Ӕ */ - 0x04d7, 499, /* ӗ Ӗ */ - 0x04d9, 499, /* ә Ә */ - 0x04db, 499, /* ӛ Ӛ */ - 0x04dd, 499, /* ӝ Ӝ */ - 0x04df, 499, /* ӟ Ӟ */ - 0x04e1, 499, /* ӡ Ӡ */ - 0x04e3, 499, /* ӣ Ӣ */ - 0x04e5, 499, /* ӥ Ӥ */ - 0x04e7, 499, /* ӧ Ӧ */ - 0x04e9, 499, /* ө Ө */ - 0x04eb, 499, /* ӫ Ӫ */ - 0x04ef, 499, /* ӯ Ӯ */ - 0x04f1, 499, /* ӱ Ӱ */ - 0x04f3, 499, /* ӳ Ӳ */ - 0x04f5, 499, /* ӵ Ӵ */ - 0x04f9, 499, /* ӹ Ӹ */ - 0x1e01, 499, /* ḁ Ḁ */ - 0x1e03, 499, /* ḃ Ḃ */ - 0x1e05, 499, /* ḅ Ḅ */ - 0x1e07, 499, /* ḇ Ḇ */ - 0x1e09, 499, /* ḉ Ḉ */ - 0x1e0b, 499, /* ḋ Ḋ */ - 0x1e0d, 499, /* ḍ Ḍ */ - 0x1e0f, 
499, /* ḏ Ḏ */ - 0x1e11, 499, /* ḑ Ḑ */ - 0x1e13, 499, /* ḓ Ḓ */ - 0x1e15, 499, /* ḕ Ḕ */ - 0x1e17, 499, /* ḗ Ḗ */ - 0x1e19, 499, /* ḙ Ḙ */ - 0x1e1b, 499, /* ḛ Ḛ */ - 0x1e1d, 499, /* ḝ Ḝ */ - 0x1e1f, 499, /* ḟ Ḟ */ - 0x1e21, 499, /* ḡ Ḡ */ - 0x1e23, 499, /* ḣ Ḣ */ - 0x1e25, 499, /* ḥ Ḥ */ - 0x1e27, 499, /* ḧ Ḧ */ - 0x1e29, 499, /* ḩ Ḩ */ - 0x1e2b, 499, /* ḫ Ḫ */ - 0x1e2d, 499, /* ḭ Ḭ */ - 0x1e2f, 499, /* ḯ Ḯ */ - 0x1e31, 499, /* ḱ Ḱ */ - 0x1e33, 499, /* ḳ Ḳ */ - 0x1e35, 499, /* ḵ Ḵ */ - 0x1e37, 499, /* ḷ Ḷ */ - 0x1e39, 499, /* ḹ Ḹ */ - 0x1e3b, 499, /* ḻ Ḻ */ - 0x1e3d, 499, /* ḽ Ḽ */ - 0x1e3f, 499, /* ḿ Ḿ */ - 0x1e41, 499, /* ṁ Ṁ */ - 0x1e43, 499, /* ṃ Ṃ */ - 0x1e45, 499, /* ṅ Ṅ */ - 0x1e47, 499, /* ṇ Ṇ */ - 0x1e49, 499, /* ṉ Ṉ */ - 0x1e4b, 499, /* ṋ Ṋ */ - 0x1e4d, 499, /* ṍ Ṍ */ - 0x1e4f, 499, /* ṏ Ṏ */ - 0x1e51, 499, /* ṑ Ṑ */ - 0x1e53, 499, /* ṓ Ṓ */ - 0x1e55, 499, /* ṕ Ṕ */ - 0x1e57, 499, /* ṗ Ṗ */ - 0x1e59, 499, /* ṙ Ṙ */ - 0x1e5b, 499, /* ṛ Ṛ */ - 0x1e5d, 499, /* ṝ Ṝ */ - 0x1e5f, 499, /* ṟ Ṟ */ - 0x1e61, 499, /* ṡ Ṡ */ - 0x1e63, 499, /* ṣ Ṣ */ - 0x1e65, 499, /* ṥ Ṥ */ - 0x1e67, 499, /* ṧ Ṧ */ - 0x1e69, 499, /* ṩ Ṩ */ - 0x1e6b, 499, /* ṫ Ṫ */ - 0x1e6d, 499, /* ṭ Ṭ */ - 0x1e6f, 499, /* ṯ Ṯ */ - 0x1e71, 499, /* ṱ Ṱ */ - 0x1e73, 499, /* ṳ Ṳ */ - 0x1e75, 499, /* ṵ Ṵ */ - 0x1e77, 499, /* ṷ Ṷ */ - 0x1e79, 499, /* ṹ Ṹ */ - 0x1e7b, 499, /* ṻ Ṻ */ - 0x1e7d, 499, /* ṽ Ṽ */ - 0x1e7f, 499, /* ṿ Ṿ */ - 0x1e81, 499, /* ẁ Ẁ */ - 0x1e83, 499, /* ẃ Ẃ */ - 0x1e85, 499, /* ẅ Ẅ */ - 0x1e87, 499, /* ẇ Ẇ */ - 0x1e89, 499, /* ẉ Ẉ */ - 0x1e8b, 499, /* ẋ Ẋ */ - 0x1e8d, 499, /* ẍ Ẍ */ - 0x1e8f, 499, /* ẏ Ẏ */ - 0x1e91, 499, /* ẑ Ẑ */ - 0x1e93, 499, /* ẓ Ẓ */ - 0x1e95, 499, /* ẕ Ẕ */ - 0x1ea1, 499, /* ạ Ạ */ - 0x1ea3, 499, /* ả Ả */ - 0x1ea5, 499, /* ấ Ấ */ - 0x1ea7, 499, /* ầ Ầ */ - 0x1ea9, 499, /* ẩ Ẩ */ - 0x1eab, 499, /* ẫ Ẫ */ - 0x1ead, 499, /* ậ Ậ */ - 0x1eaf, 499, /* ắ Ắ */ - 0x1eb1, 499, /* ằ Ằ */ - 0x1eb3, 499, /* ẳ Ẳ */ - 0x1eb5, 499, /* ẵ Ẵ */ - 0x1eb7, 499, /* ặ Ặ */ - 0x1eb9, 
499, /* ẹ Ẹ */ - 0x1ebb, 499, /* ẻ Ẻ */ - 0x1ebd, 499, /* ẽ Ẽ */ - 0x1ebf, 499, /* ế Ế */ - 0x1ec1, 499, /* ề Ề */ - 0x1ec3, 499, /* ể Ể */ - 0x1ec5, 499, /* ễ Ễ */ - 0x1ec7, 499, /* ệ Ệ */ - 0x1ec9, 499, /* ỉ Ỉ */ - 0x1ecb, 499, /* ị Ị */ - 0x1ecd, 499, /* ọ Ọ */ - 0x1ecf, 499, /* ỏ Ỏ */ - 0x1ed1, 499, /* ố Ố */ - 0x1ed3, 499, /* ồ Ồ */ - 0x1ed5, 499, /* ổ Ổ */ - 0x1ed7, 499, /* ỗ Ỗ */ - 0x1ed9, 499, /* ộ Ộ */ - 0x1edb, 499, /* ớ Ớ */ - 0x1edd, 499, /* ờ Ờ */ - 0x1edf, 499, /* ở Ở */ - 0x1ee1, 499, /* ỡ Ỡ */ - 0x1ee3, 499, /* ợ Ợ */ - 0x1ee5, 499, /* ụ Ụ */ - 0x1ee7, 499, /* ủ Ủ */ - 0x1ee9, 499, /* ứ Ứ */ - 0x1eeb, 499, /* ừ Ừ */ - 0x1eed, 499, /* ử Ử */ - 0x1eef, 499, /* ữ Ữ */ - 0x1ef1, 499, /* ự Ự */ - 0x1ef3, 499, /* ỳ Ỳ */ - 0x1ef5, 499, /* ỵ Ỵ */ - 0x1ef7, 499, /* ỷ Ỷ */ - 0x1ef9, 499, /* ỹ Ỹ */ - 0x1f51, 508, /* ὑ Ὑ */ - 0x1f53, 508, /* ὓ Ὓ */ - 0x1f55, 508, /* ὕ Ὕ */ - 0x1f57, 508, /* ὗ Ὗ */ - 0x1fb3, 509, /* ᾳ ᾼ */ - 0x1fc3, 509, /* ῃ ῌ */ - 0x1fe5, 507, /* ῥ Ῥ */ - 0x1ff3, 509, /* ῳ ῼ */ -}; - -static Rune __isdigitr[] = { - 0x0030, 0x0039, - 0x0660, 0x0669, - 0x06f0, 0x06f9, - 0x07c0, 0x07c9, - 0x0966, 0x096f, - 0x09e6, 0x09ef, - 0x0a66, 0x0a6f, - 0x0ae6, 0x0aef, - 0x0b66, 0x0b6f, - 0x0be6, 0x0bef, - 0x0c66, 0x0c6f, - 0x0ce6, 0x0cef, - 0x0d66, 0x0d6f, - 0x0e50, 0x0e59, - 0x0ed0, 0x0ed9, - 0x0f20, 0x0f29, - 0x1040, 0x1049, - 0x17e0, 0x17e9, - 0x1810, 0x1819, - 0x1946, 0x194f, - 0x19d0, 0x19d9, - 0x1b50, 0x1b59, - 0xff10, 0xff19, - 0x104a0, 0x104a9, - 0x1d7ce, 0x1d7ff, -}; - -/* - * upper case ranges - * 3rd col is conversion excess 500 - */ -static -Rune _tolower2[] = -{ - 0x0041, 0x005a, 532, /* A-Z a-z */ - 0x00c0, 0x00d6, 532, /* À-Ö à-ö */ - 0x00d8, 0x00de, 532, /* Ø-Þ ø-þ */ - 0x0189, 0x018a, 705, /* Ɖ-Ɗ ɖ-ɗ */ - 0x018e, 0x018f, 702, /* Ǝ-Ə ɘ-ə */ - 0x01b1, 0x01b2, 717, /* Ʊ-Ʋ ʊ-ʋ */ - 0x0388, 0x038a, 537, /* Έ-Ί έ-ί */ - 0x038e, 0x038f, 563, /* Ύ-Ώ ύ-ώ */ - 0x0391, 0x03a1, 532, /* Α-Ρ α-ρ */ - 0x03a3, 0x03ab, 532, /* Σ-Ϋ σ-ϋ */ - 0x0401, 0x040c, 
580, /* Ё-Ќ ё-ќ */ - 0x040e, 0x040f, 580, /* Ў-Џ ў-џ */ - 0x0410, 0x042f, 532, /* А-Я а-я */ - 0x0531, 0x0556, 548, /* Ա-Ֆ ա-ֆ */ - 0x10a0, 0x10c5, 548, /* Ⴀ-Ⴥ ა-ჵ */ - 0x1f08, 0x1f0f, 492, /* Ἀ-Ἇ ἀ-ἇ */ - 0x1f18, 0x1f1d, 492, /* Ἐ-Ἕ ἐ-ἕ */ - 0x1f28, 0x1f2f, 492, /* Ἠ-Ἧ ἠ-ἧ */ - 0x1f38, 0x1f3f, 492, /* Ἰ-Ἷ ἰ-ἷ */ - 0x1f48, 0x1f4d, 492, /* Ὀ-Ὅ ὀ-ὅ */ - 0x1f68, 0x1f6f, 492, /* Ὠ-Ὧ ὠ-ὧ */ - 0x1f88, 0x1f8f, 492, /* ᾈ-ᾏ ᾀ-ᾇ */ - 0x1f98, 0x1f9f, 492, /* ᾘ-ᾟ ᾐ-ᾗ */ - 0x1fa8, 0x1faf, 492, /* ᾨ-ᾯ ᾠ-ᾧ */ - 0x1fb8, 0x1fb9, 492, /* Ᾰ-Ᾱ ᾰ-ᾱ */ - 0x1fba, 0x1fbb, 426, /* Ὰ-Ά ὰ-ά */ - 0x1fc8, 0x1fcb, 414, /* Ὲ-Ή ὲ-ή */ - 0x1fd8, 0x1fd9, 492, /* Ῐ-Ῑ ῐ-ῑ */ - 0x1fda, 0x1fdb, 400, /* Ὶ-Ί ὶ-ί */ - 0x1fe8, 0x1fe9, 492, /* Ῠ-Ῡ ῠ-ῡ */ - 0x1fea, 0x1feb, 388, /* Ὺ-Ύ ὺ-ύ */ - 0x1ff8, 0x1ff9, 372, /* Ὸ-Ό ὸ-ό */ - 0x1ffa, 0x1ffb, 374, /* Ὼ-Ώ ὼ-ώ */ - 0x2160, 0x216f, 516, /* Ⅰ-Ⅿ ⅰ-ⅿ */ - 0x24b6, 0x24cf, 526, /* Ⓐ-Ⓩ ⓐ-ⓩ */ - 0xff21, 0xff3a, 532, /* A-Z a-z */ -}; - -/* - * upper case singlets - * 2nd col is conversion excess 500 - */ -static -Rune _tolower1[] = -{ - 0x0100, 501, /* Ā ā */ - 0x0102, 501, /* Ă ă */ - 0x0104, 501, /* Ą ą */ - 0x0106, 501, /* Ć ć */ - 0x0108, 501, /* Ĉ ĉ */ - 0x010a, 501, /* Ċ ċ */ - 0x010c, 501, /* Č č */ - 0x010e, 501, /* Ď ď */ - 0x0110, 501, /* Đ đ */ - 0x0112, 501, /* Ē ē */ - 0x0114, 501, /* Ĕ ĕ */ - 0x0116, 501, /* Ė ė */ - 0x0118, 501, /* Ę ę */ - 0x011a, 501, /* Ě ě */ - 0x011c, 501, /* Ĝ ĝ */ - 0x011e, 501, /* Ğ ğ */ - 0x0120, 501, /* Ġ ġ */ - 0x0122, 501, /* Ģ ģ */ - 0x0124, 501, /* Ĥ ĥ */ - 0x0126, 501, /* Ħ ħ */ - 0x0128, 501, /* Ĩ ĩ */ - 0x012a, 501, /* Ī ī */ - 0x012c, 501, /* Ĭ ĭ */ - 0x012e, 501, /* Į į */ - 0x0130, 301, /* İ i */ - 0x0132, 501, /* IJ ij */ - 0x0134, 501, /* Ĵ ĵ */ - 0x0136, 501, /* Ķ ķ */ - 0x0139, 501, /* Ĺ ĺ */ - 0x013b, 501, /* Ļ ļ */ - 0x013d, 501, /* Ľ ľ */ - 0x013f, 501, /* Ŀ ŀ */ - 0x0141, 501, /* Ł ł */ - 0x0143, 501, /* Ń ń */ - 0x0145, 501, /* Ņ ņ */ - 0x0147, 501, /* Ň ň */ - 0x014a, 501, /* Ŋ ŋ */ - 0x014c, 501, /* Ō ō 
*/ - 0x014e, 501, /* Ŏ ŏ */ - 0x0150, 501, /* Ő ő */ - 0x0152, 501, /* Œ œ */ - 0x0154, 501, /* Ŕ ŕ */ - 0x0156, 501, /* Ŗ ŗ */ - 0x0158, 501, /* Ř ř */ - 0x015a, 501, /* Ś ś */ - 0x015c, 501, /* Ŝ ŝ */ - 0x015e, 501, /* Ş ş */ - 0x0160, 501, /* Š š */ - 0x0162, 501, /* Ţ ţ */ - 0x0164, 501, /* Ť ť */ - 0x0166, 501, /* Ŧ ŧ */ - 0x0168, 501, /* Ũ ũ */ - 0x016a, 501, /* Ū ū */ - 0x016c, 501, /* Ŭ ŭ */ - 0x016e, 501, /* Ů ů */ - 0x0170, 501, /* Ű ű */ - 0x0172, 501, /* Ų ų */ - 0x0174, 501, /* Ŵ ŵ */ - 0x0176, 501, /* Ŷ ŷ */ - 0x0178, 379, /* Ÿ ÿ */ - 0x0179, 501, /* Ź ź */ - 0x017b, 501, /* Ż ż */ - 0x017d, 501, /* Ž ž */ - 0x0181, 710, /* Ɓ ɓ */ - 0x0182, 501, /* Ƃ ƃ */ - 0x0184, 501, /* Ƅ ƅ */ - 0x0186, 706, /* Ɔ ɔ */ - 0x0187, 501, /* Ƈ ƈ */ - 0x018b, 501, /* Ƌ ƌ */ - 0x0190, 703, /* Ɛ ɛ */ - 0x0191, 501, /* Ƒ ƒ */ - 0x0193, 705, /* Ɠ ɠ */ - 0x0194, 707, /* Ɣ ɣ */ - 0x0196, 711, /* Ɩ ɩ */ - 0x0197, 709, /* Ɨ ɨ */ - 0x0198, 501, /* Ƙ ƙ */ - 0x019c, 711, /* Ɯ ɯ */ - 0x019d, 713, /* Ɲ ɲ */ - 0x01a0, 501, /* Ơ ơ */ - 0x01a2, 501, /* Ƣ ƣ */ - 0x01a4, 501, /* Ƥ ƥ */ - 0x01a7, 501, /* Ƨ ƨ */ - 0x01a9, 718, /* Ʃ ʃ */ - 0x01ac, 501, /* Ƭ ƭ */ - 0x01ae, 718, /* Ʈ ʈ */ - 0x01af, 501, /* Ư ư */ - 0x01b3, 501, /* Ƴ ƴ */ - 0x01b5, 501, /* Ƶ ƶ */ - 0x01b7, 719, /* Ʒ ʒ */ - 0x01b8, 501, /* Ƹ ƹ */ - 0x01bc, 501, /* Ƽ ƽ */ - 0x01c4, 502, /* DŽ dž */ - 0x01c5, 501, /* Dž dž */ - 0x01c7, 502, /* LJ lj */ - 0x01c8, 501, /* Lj lj */ - 0x01ca, 502, /* NJ nj */ - 0x01cb, 501, /* Nj nj */ - 0x01cd, 501, /* Ǎ ǎ */ - 0x01cf, 501, /* Ǐ ǐ */ - 0x01d1, 501, /* Ǒ ǒ */ - 0x01d3, 501, /* Ǔ ǔ */ - 0x01d5, 501, /* Ǖ ǖ */ - 0x01d7, 501, /* Ǘ ǘ */ - 0x01d9, 501, /* Ǚ ǚ */ - 0x01db, 501, /* Ǜ ǜ */ - 0x01de, 501, /* Ǟ ǟ */ - 0x01e0, 501, /* Ǡ ǡ */ - 0x01e2, 501, /* Ǣ ǣ */ - 0x01e4, 501, /* Ǥ ǥ */ - 0x01e6, 501, /* Ǧ ǧ */ - 0x01e8, 501, /* Ǩ ǩ */ - 0x01ea, 501, /* Ǫ ǫ */ - 0x01ec, 501, /* Ǭ ǭ */ - 0x01ee, 501, /* Ǯ ǯ */ - 0x01f1, 502, /* DZ dz */ - 0x01f2, 501, /* Dz dz */ - 0x01f4, 501, /* Ǵ ǵ */ - 
0x01fa, 501, /* Ǻ ǻ */ - 0x01fc, 501, /* Ǽ ǽ */ - 0x01fe, 501, /* Ǿ ǿ */ - 0x0200, 501, /* Ȁ ȁ */ - 0x0202, 501, /* Ȃ ȃ */ - 0x0204, 501, /* Ȅ ȅ */ - 0x0206, 501, /* Ȇ ȇ */ - 0x0208, 501, /* Ȉ ȉ */ - 0x020a, 501, /* Ȋ ȋ */ - 0x020c, 501, /* Ȍ ȍ */ - 0x020e, 501, /* Ȏ ȏ */ - 0x0210, 501, /* Ȑ ȑ */ - 0x0212, 501, /* Ȓ ȓ */ - 0x0214, 501, /* Ȕ ȕ */ - 0x0216, 501, /* Ȗ ȗ */ - 0x0386, 538, /* Ά ά */ - 0x038c, 564, /* Ό ό */ - 0x03e2, 501, /* Ϣ ϣ */ - 0x03e4, 501, /* Ϥ ϥ */ - 0x03e6, 501, /* Ϧ ϧ */ - 0x03e8, 501, /* Ϩ ϩ */ - 0x03ea, 501, /* Ϫ ϫ */ - 0x03ec, 501, /* Ϭ ϭ */ - 0x03ee, 501, /* Ϯ ϯ */ - 0x0460, 501, /* Ѡ ѡ */ - 0x0462, 501, /* Ѣ ѣ */ - 0x0464, 501, /* Ѥ ѥ */ - 0x0466, 501, /* Ѧ ѧ */ - 0x0468, 501, /* Ѩ ѩ */ - 0x046a, 501, /* Ѫ ѫ */ - 0x046c, 501, /* Ѭ ѭ */ - 0x046e, 501, /* Ѯ ѯ */ - 0x0470, 501, /* Ѱ ѱ */ - 0x0472, 501, /* Ѳ ѳ */ - 0x0474, 501, /* Ѵ ѵ */ - 0x0476, 501, /* Ѷ ѷ */ - 0x0478, 501, /* Ѹ ѹ */ - 0x047a, 501, /* Ѻ ѻ */ - 0x047c, 501, /* Ѽ ѽ */ - 0x047e, 501, /* Ѿ ѿ */ - 0x0480, 501, /* Ҁ ҁ */ - 0x0490, 501, /* Ґ ґ */ - 0x0492, 501, /* Ғ ғ */ - 0x0494, 501, /* Ҕ ҕ */ - 0x0496, 501, /* Җ җ */ - 0x0498, 501, /* Ҙ ҙ */ - 0x049a, 501, /* Қ қ */ - 0x049c, 501, /* Ҝ ҝ */ - 0x049e, 501, /* Ҟ ҟ */ - 0x04a0, 501, /* Ҡ ҡ */ - 0x04a2, 501, /* Ң ң */ - 0x04a4, 501, /* Ҥ ҥ */ - 0x04a6, 501, /* Ҧ ҧ */ - 0x04a8, 501, /* Ҩ ҩ */ - 0x04aa, 501, /* Ҫ ҫ */ - 0x04ac, 501, /* Ҭ ҭ */ - 0x04ae, 501, /* Ү ү */ - 0x04b0, 501, /* Ұ ұ */ - 0x04b2, 501, /* Ҳ ҳ */ - 0x04b4, 501, /* Ҵ ҵ */ - 0x04b6, 501, /* Ҷ ҷ */ - 0x04b8, 501, /* Ҹ ҹ */ - 0x04ba, 501, /* Һ һ */ - 0x04bc, 501, /* Ҽ ҽ */ - 0x04be, 501, /* Ҿ ҿ */ - 0x04c1, 501, /* Ӂ ӂ */ - 0x04c3, 501, /* Ӄ ӄ */ - 0x04c7, 501, /* Ӈ ӈ */ - 0x04cb, 501, /* Ӌ ӌ */ - 0x04d0, 501, /* Ӑ ӑ */ - 0x04d2, 501, /* Ӓ ӓ */ - 0x04d4, 501, /* Ӕ ӕ */ - 0x04d6, 501, /* Ӗ ӗ */ - 0x04d8, 501, /* Ә ә */ - 0x04da, 501, /* Ӛ ӛ */ - 0x04dc, 501, /* Ӝ ӝ */ - 0x04de, 501, /* Ӟ ӟ */ - 0x04e0, 501, /* Ӡ ӡ */ - 0x04e2, 501, /* Ӣ ӣ */ - 0x04e4, 501, /* Ӥ ӥ */ - 
0x04e6, 501, /* Ӧ ӧ */ - 0x04e8, 501, /* Ө ө */ - 0x04ea, 501, /* Ӫ ӫ */ - 0x04ee, 501, /* Ӯ ӯ */ - 0x04f0, 501, /* Ӱ ӱ */ - 0x04f2, 501, /* Ӳ ӳ */ - 0x04f4, 501, /* Ӵ ӵ */ - 0x04f8, 501, /* Ӹ ӹ */ - 0x1e00, 501, /* Ḁ ḁ */ - 0x1e02, 501, /* Ḃ ḃ */ - 0x1e04, 501, /* Ḅ ḅ */ - 0x1e06, 501, /* Ḇ ḇ */ - 0x1e08, 501, /* Ḉ ḉ */ - 0x1e0a, 501, /* Ḋ ḋ */ - 0x1e0c, 501, /* Ḍ ḍ */ - 0x1e0e, 501, /* Ḏ ḏ */ - 0x1e10, 501, /* Ḑ ḑ */ - 0x1e12, 501, /* Ḓ ḓ */ - 0x1e14, 501, /* Ḕ ḕ */ - 0x1e16, 501, /* Ḗ ḗ */ - 0x1e18, 501, /* Ḙ ḙ */ - 0x1e1a, 501, /* Ḛ ḛ */ - 0x1e1c, 501, /* Ḝ ḝ */ - 0x1e1e, 501, /* Ḟ ḟ */ - 0x1e20, 501, /* Ḡ ḡ */ - 0x1e22, 501, /* Ḣ ḣ */ - 0x1e24, 501, /* Ḥ ḥ */ - 0x1e26, 501, /* Ḧ ḧ */ - 0x1e28, 501, /* Ḩ ḩ */ - 0x1e2a, 501, /* Ḫ ḫ */ - 0x1e2c, 501, /* Ḭ ḭ */ - 0x1e2e, 501, /* Ḯ ḯ */ - 0x1e30, 501, /* Ḱ ḱ */ - 0x1e32, 501, /* Ḳ ḳ */ - 0x1e34, 501, /* Ḵ ḵ */ - 0x1e36, 501, /* Ḷ ḷ */ - 0x1e38, 501, /* Ḹ ḹ */ - 0x1e3a, 501, /* Ḻ ḻ */ - 0x1e3c, 501, /* Ḽ ḽ */ - 0x1e3e, 501, /* Ḿ ḿ */ - 0x1e40, 501, /* Ṁ ṁ */ - 0x1e42, 501, /* Ṃ ṃ */ - 0x1e44, 501, /* Ṅ ṅ */ - 0x1e46, 501, /* Ṇ ṇ */ - 0x1e48, 501, /* Ṉ ṉ */ - 0x1e4a, 501, /* Ṋ ṋ */ - 0x1e4c, 501, /* Ṍ ṍ */ - 0x1e4e, 501, /* Ṏ ṏ */ - 0x1e50, 501, /* Ṑ ṑ */ - 0x1e52, 501, /* Ṓ ṓ */ - 0x1e54, 501, /* Ṕ ṕ */ - 0x1e56, 501, /* Ṗ ṗ */ - 0x1e58, 501, /* Ṙ ṙ */ - 0x1e5a, 501, /* Ṛ ṛ */ - 0x1e5c, 501, /* Ṝ ṝ */ - 0x1e5e, 501, /* Ṟ ṟ */ - 0x1e60, 501, /* Ṡ ṡ */ - 0x1e62, 501, /* Ṣ ṣ */ - 0x1e64, 501, /* Ṥ ṥ */ - 0x1e66, 501, /* Ṧ ṧ */ - 0x1e68, 501, /* Ṩ ṩ */ - 0x1e6a, 501, /* Ṫ ṫ */ - 0x1e6c, 501, /* Ṭ ṭ */ - 0x1e6e, 501, /* Ṯ ṯ */ - 0x1e70, 501, /* Ṱ ṱ */ - 0x1e72, 501, /* Ṳ ṳ */ - 0x1e74, 501, /* Ṵ ṵ */ - 0x1e76, 501, /* Ṷ ṷ */ - 0x1e78, 501, /* Ṹ ṹ */ - 0x1e7a, 501, /* Ṻ ṻ */ - 0x1e7c, 501, /* Ṽ ṽ */ - 0x1e7e, 501, /* Ṿ ṿ */ - 0x1e80, 501, /* Ẁ ẁ */ - 0x1e82, 501, /* Ẃ ẃ */ - 0x1e84, 501, /* Ẅ ẅ */ - 0x1e86, 501, /* Ẇ ẇ */ - 0x1e88, 501, /* Ẉ ẉ */ - 0x1e8a, 501, /* Ẋ ẋ */ - 0x1e8c, 501, /* Ẍ ẍ */ - 0x1e8e, 501, /* Ẏ ẏ */ - 
0x1e90, 501, /* Ẑ ẑ */ - 0x1e92, 501, /* Ẓ ẓ */ - 0x1e94, 501, /* Ẕ ẕ */ - 0x1ea0, 501, /* Ạ ạ */ - 0x1ea2, 501, /* Ả ả */ - 0x1ea4, 501, /* Ấ ấ */ - 0x1ea6, 501, /* Ầ ầ */ - 0x1ea8, 501, /* Ẩ ẩ */ - 0x1eaa, 501, /* Ẫ ẫ */ - 0x1eac, 501, /* Ậ ậ */ - 0x1eae, 501, /* Ắ ắ */ - 0x1eb0, 501, /* Ằ ằ */ - 0x1eb2, 501, /* Ẳ ẳ */ - 0x1eb4, 501, /* Ẵ ẵ */ - 0x1eb6, 501, /* Ặ ặ */ - 0x1eb8, 501, /* Ẹ ẹ */ - 0x1eba, 501, /* Ẻ ẻ */ - 0x1ebc, 501, /* Ẽ ẽ */ - 0x1ebe, 501, /* Ế ế */ - 0x1ec0, 501, /* Ề ề */ - 0x1ec2, 501, /* Ể ể */ - 0x1ec4, 501, /* Ễ ễ */ - 0x1ec6, 501, /* Ệ ệ */ - 0x1ec8, 501, /* Ỉ ỉ */ - 0x1eca, 501, /* Ị ị */ - 0x1ecc, 501, /* Ọ ọ */ - 0x1ece, 501, /* Ỏ ỏ */ - 0x1ed0, 501, /* Ố ố */ - 0x1ed2, 501, /* Ồ ồ */ - 0x1ed4, 501, /* Ổ ổ */ - 0x1ed6, 501, /* Ỗ ỗ */ - 0x1ed8, 501, /* Ộ ộ */ - 0x1eda, 501, /* Ớ ớ */ - 0x1edc, 501, /* Ờ ờ */ - 0x1ede, 501, /* Ở ở */ - 0x1ee0, 501, /* Ỡ ỡ */ - 0x1ee2, 501, /* Ợ ợ */ - 0x1ee4, 501, /* Ụ ụ */ - 0x1ee6, 501, /* Ủ ủ */ - 0x1ee8, 501, /* Ứ ứ */ - 0x1eea, 501, /* Ừ ừ */ - 0x1eec, 501, /* Ử ử */ - 0x1eee, 501, /* Ữ ữ */ - 0x1ef0, 501, /* Ự ự */ - 0x1ef2, 501, /* Ỳ ỳ */ - 0x1ef4, 501, /* Ỵ ỵ */ - 0x1ef6, 501, /* Ỷ ỷ */ - 0x1ef8, 501, /* Ỹ ỹ */ - 0x1f59, 492, /* Ὑ ὑ */ - 0x1f5b, 492, /* Ὓ ὓ */ - 0x1f5d, 492, /* Ὕ ὕ */ - 0x1f5f, 492, /* Ὗ ὗ */ - 0x1fbc, 491, /* ᾼ ᾳ */ - 0x1fcc, 491, /* ῌ ῃ */ - 0x1fec, 493, /* Ῥ ῥ */ - 0x1ffc, 491, /* ῼ ῳ */ -}; - -/* - * title characters are those between - * upper and lower case. 
ie DZ Dz dz
- */
-static
-Rune _totitle1[] =
-{
-	0x01c4, 501, /* DŽ Dž */
-	0x01c6, 499, /* dž Dž */
-	0x01c7, 501, /* LJ Lj */
-	0x01c9, 499, /* lj Lj */
-	0x01ca, 501, /* NJ Nj */
-	0x01cc, 499, /* nj Nj */
-	0x01f1, 501, /* DZ Dz */
-	0x01f3, 499, /* dz Dz */
-};
-
-static
-Rune*
-bsearch(Rune c, Rune *t, int n, int ne)
-{
-	Rune *p;
-	int m;
-
-	while(n > 1) {
-		m = n/2;
-		p = t + m*ne;
-		if(c >= p[0]) {
-			t = p;
-			n = n-m;
-		} else
-			n = m;
-	}
-	if(n && c >= t[0])
-		return t;
-	return 0;
-}
-
-Rune
-tolowerrune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _tolower2, nelem(_tolower2)/3, 3);
-	if(p && c >= p[0] && c <= p[1])
-		return c + p[2] - 500;
-	p = bsearch(c, _tolower1, nelem(_tolower1)/2, 2);
-	if(p && c == p[0])
-		return c + p[1] - 500;
-	return c;
-}
-
-Rune
-toupperrune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _toupper2, nelem(_toupper2)/3, 3);
-	if(p && c >= p[0] && c <= p[1])
-		return c + p[2] - 500;
-	p = bsearch(c, _toupper1, nelem(_toupper1)/2, 2);
-	if(p && c == p[0])
-		return c + p[1] - 500;
-	return c;
-}
-
-Rune
-totitlerune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _totitle1, nelem(_totitle1)/2, 2);
-	if(p && c == p[0])
-		return c + p[1] - 500;
-	return c;
-}
-
-int
-islowerrune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _toupper2, nelem(_toupper2)/3, 3);
-	if(p && c >= p[0] && c <= p[1])
-		return 1;
-	p = bsearch(c, _toupper1, nelem(_toupper1)/2, 2);
-	if(p && c == p[0])
-		return 1;
-	return 0;
-}
-
-int
-isupperrune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _tolower2, nelem(_tolower2)/3, 3);
-	if(p && c >= p[0] && c <= p[1])
-		return 1;
-	p = bsearch(c, _tolower1, nelem(_tolower1)/2, 2);
-	if(p && c == p[0])
-		return 1;
-	return 0;
-}
-
-int
-isalpharune(Rune c)
-{
-	Rune *p;
-
-	if(isupperrune(c) || islowerrune(c))
-		return 1;
-	p = bsearch(c, _alpha2, nelem(_alpha2)/2, 2);
-	if(p && c >= p[0] && c <= p[1])
-		return 1;
-	p = bsearch(c, _alpha1, nelem(_alpha1), 1);
-	if(p && c == p[0])
-		return 1;
-	return 0;
-}
-
-int
-istitlerune(Rune c)
-{
-	return isupperrune(c) && islowerrune(c);
-}
-
-int
-isspacerune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _space2, nelem(_space2)/2, 2);
-	if(p && c >= p[0] && c <= p[1])
-		return 1;
-	return 0;
-}
-
-int
-isdigitrune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, __isdigitr, nelem(__isdigitr)/2, 2);
-	if(p && c >= p[0] && c <= p[1])
-		return 1;
-	return 0;
-}
--- a//sys/src/libc/test/mkfile
+++ b//sys/src/libc/test/mkfile
@@ -3,6 +3,8 @@
 TEST=\
 	date\
 	pow\
+	runebreak\
+	runenorm\
 	strchr\
 
 </sys/src/cmd/mktest
--- /dev/null
+++ b//sys/src/libc/test/runebreak.c
@@ -1,0 +1,105 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+/* Parse s as a hexadecimal code point; exit if no digits were consumed. */
+static int
+estrtoul(char *s)
+{
+	char *epr;
+	Rune code;
+
+	code = strtoul(s, &epr, 16);
+	if(s == epr)
+		sysfatal("bad code point hex string: %s", s);
+	return code;
+}
+
+/*
+ * Check fn against every test line in file.  A line alternates
+ * break marks with hex code points: ÷ says fn must report a break
+ * there, × says it must not.  Text from # to end of line is ignored.
+ * The first and last marks of a line are not checked.
+ */
+static void
+run(char *file, Rune* (*fn)(Rune*))
+{
+	Biobuf *b;
+	char *p, *dot;
+	char *pieces[16];
+	int i, j, n;
+	/* one slot more than pieces: both arrays are kept zero terminated */
+	Rune stack[16+1], ops[16+1];
+	int nstack, nops;
+	Rune r, *rp, *rp2;
+	char *line;
+
+	b = Bopen(file, OREAD);
+	if(b == nil)
+		sysfatal("could not open %s: %r", file);
+
+	/* line keeps an intact copy for messages; p is split in place */
+	for(;(p = Brdline(b, '\n')) != nil; free(line)){
+		p[Blinelen(b)-1] = 0;
+		line = strdup(p);
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		if((dot = strstr(p, "#")) != nil)
+			*dot = 0;
+		n = getfields(p, pieces, nelem(pieces), 0, " ");
+		nstack = nops = 0;
+		for(i = 0; i < n; i++){
+			chartorune(&r, pieces[i]);
+			if(r != L'÷' && r != L'×'){
+				r = estrtoul(pieces[i]);
+				stack[nstack++] = r;
+				stack[nstack] = 0;
+			} else {
+				ops[nops++] = r;
+				ops[nops] = 0;
+			}
+		}
+
+		rp = stack;
+		for(i = 1; i < nops-1;){
+			rp2 = fn(rp);
+			switch(ops[i]){
+			case L'÷':
+				if(rp2 != rp+1){
+					print("break fail %X %X || %s\n", rp[0], rp[1], line);
+					goto Break;
+				}
+				rp++;
+				i++;
+				break;
+			case L'×':
+				/* fn handed back its argument: every remaining mark must be × */
+				if(rp2 - rp == 0){
+					for(j = i; j < nops - 1; j++)
+						if(ops[j] != L'×')
+							print("skipped %d %d %s\n", i, nops, line);
+					goto Break;
+				}
+				for(; rp < (rp2-1); rp++, i++){
+					if(ops[i] != L'×')
+						print("skipped %d %d %s\n", i, nops, line);
+				}
+				rp = rp2;
+				i++;
+				break;
+			}
+		}
+Break:
+		;
+	}
+	Bterm(b);
+}
+
+/* Run the grapheme and word break test files through the two break finders. */
+void
+main(int, char**)
+{
+	run("/lib/ucd/GraphemeBreakTest.txt", runegbreak);
+	run("/lib/ucd/WordBreakTest.txt", runewbreak);
+	exits(nil);
+}
--- /dev/null
+++ b//sys/src/libc/test/runenorm.c
@@ -1,0 +1,101 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+/* Parse s as a hexadecimal code point; exit if no digits were consumed. */
+static int
+estrtoul(char *s)
+{
+	char *epr;
+	Rune code;
+
+	code = strtoul(s, &epr, 16);
+	if(s == epr)
+		sysfatal("bad code point hex string: %s", s);
+	return code;
+}
+
+/*
+ * For each data line of NormalizationTest.txt, normalize the
+ * first field with runenorm and utfnorm and compare the results
+ * with the second field (flag 1, nfc) and the third (flag 0, nfd).
+ * Mismatches are printed with a bit mask:
+ * 1 rune nfc, 2 rune nfd, 4 utf nfc, 8 utf nfd.
+ */
+void
+main(int, char**)
+{
+	Rune buffer1[64];
+	Rune buffer2[64];
+	char utfbuff1[128];
+	char utfbuff2[128];
+	char srctmp[128], tmp1[128], tmp2[128];
+	char *fields[10];
+	char *runes[32];
+	char *p;
+	int n, n2;
+	int i;
+	uint fail;
+	Biobuf *b;
+
+	b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not open NormalizationTest.txt: %r");
+
+	/* one slot more than runes[] so the terminator always fits */
+	struct {
+		Rune src[32+1];
+		Rune nfc[32+1];
+		Rune nfd[32+1];
+	} test;
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#' || p[0] == '@')
+			continue;
+		getfields(p, fields, 6 + 1, 0, ";");
+		n = getfields(fields[0], runes, nelem(runes), 0, " ");
+		for(i = 0; i < n; i++)
+			test.src[i] = estrtoul(runes[i]);
+		test.src[i] = 0;
+
+		n = getfields(fields[1], runes, nelem(runes), 0, " ");
+		for(i = 0; i < n; i++)
+			test.nfc[i] = estrtoul(runes[i]);
+		test.nfc[i] = 0;
+
+		n = getfields(fields[2], runes, nelem(runes), 0, " ");
+		for(i = 0; i < n; i++)
+			test.nfd[i] = estrtoul(runes[i]);
+		test.nfd[i] = 0;
+
+		n = runenorm(buffer1, test.src, nelem(buffer1), 1);
+		n2 = runenorm(buffer2, test.src, nelem(buffer2), 0);
+		fail = 0;
+
+		if(runestrcmp(buffer1, test.nfc) != 0)
+			fail |= 1<<0;
+		if(runestrcmp(buffer2, test.nfd) != 0)
+			fail |= 1<<1;
+		if(fail)
+			print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2, buffer1);
+		assert(n == runestrlen(test.nfc));
+		assert(n2 == runestrlen(test.nfd));
+
+		snprint(srctmp, sizeof srctmp, "%S", test.src);
+		snprint(tmp1, sizeof tmp1, "%S", test.nfc);
+		snprint(tmp2, sizeof tmp2, "%S", test.nfd);
+
+		n = utfnorm(utfbuff1, srctmp, nelem(utfbuff1), 1);
+		n2 = utfnorm(utfbuff2, srctmp, nelem(utfbuff2), 0);
+
+		if(strcmp(utfbuff1, tmp1) != 0)
+			fail |= 1<<2;
+		if(strcmp(utfbuff2, tmp2) != 0)
+			fail |= 1<<3;
+		if(fail)
+			print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2, utfbuff1);
+		assert(n == strlen(tmp1));
+		assert(n2 == strlen(tmp2));
+	}
+	exits(nil);
+}
Wed Mar 22 21:16:26 EDT 2023
diff b8ae7708fb3ef3acbb30ccf3181897f8157c18de uncommitted
--- /dev/null
+++ b//lib/ucd/mkfile
@@ -1,0 +1,77 @@
+</$objtype/mkfile
+
+VERSION='15.0.0'
+URL='https://www.unicode.org/Public/'$VERSION'/ucd/'
+
+TXT=\
+	ArabicShaping.txt\
+	BidiBrackets.txt\
+	BidiMirroring.txt\
+	BidiTest.txt\
+	Blocks.txt\
+	CJKRadicals.txt\
+	CaseFolding.txt\
+	CompositionExclusions.txt\
+	DerivedAge.txt\
+	DerivedCoreProperties.txt\
+	DerivedNormalizationProps.txt\
+	EastAsianWidth.txt\
+	EmojiSources.txt\
+	EquivalentUnifiedIdeograph.txt\
+	HangulSyllableType.txt\
+	Index.txt\
+	IndicPositionalCategory.txt\
+	IndicSyllabicCategory.txt\
+	Jamo.txt\
+	LineBreak.txt\
+	NameAliases.txt\
+	NamedSequences.txt\
+	NamedSequencesProv.txt\
+	NamesList.txt\
+	NormalizationCorrections.txt\
+	NushuSources.txt\
+	PropList.txt\
+	PropertyAliases.txt\
+	PropertyValueAliases.txt\
+	ScriptExtensions.txt\
+	Scripts.txt\
+	SpecialCasing.txt\
+	StandardizedVariants.txt\
+	TangutSources.txt\
+	USourceData.txt\
+	UnicodeData.txt\
+	VerticalOrientation.txt\
+
+TEST=\
+	NormalizationTest.txt\
+	BidiCharacterTest.txt\
+	GraphemeBreakTest.txt\
+	WordBreakTest.txt\
+
+PDF=\
+	USourceGlyphs.pdf\
+	USourceRSChart.pdf\
+
+# read by /sys/src/libc/port/mkrunetype.c
+AUX=\
+	WordBreakProperty.txt\
+	GraphemeBreakProperty.txt\
+	emoji-data.txt\
+
+ucd:V: UnicodeData.txt
+
+# try ucd/ first, then its auxiliary/ and emoji/ subdirectories
+%.txt:
+	hget $URL^$target > $target >[2]/dev/null || hget $URL^'auxiliary/'^$target > $target >[2]/dev/null || hget $URL^'emoji/'^$target > $target
+%.pdf:
+	hget $URL^$target > $target
+
+txt:V: $TXT
+
+pdf:V: $PDF
+
+test:V: $TEST
+
+aux:V: $AUX
+
+all:V: $TXT $PDF $TEST $AUX
--- a//sys/include/libc.h
+++ b//sys/include/libc.h
@@ -77,6 +77,14 @@
 extern long runestrlen(Rune*);
 extern Rune* runestrstr(Rune*, Rune*);
 
+extern int runenorm(Rune*, Rune*, int, int);
+extern int utfnorm(char*,char*,int,int);
+extern char* fullutfnorm(char*,int);
+extern Rune* fullrunenorm(Rune*,int);
+
+extern Rune* runewbreak(Rune*);
+extern Rune* runegbreak(Rune*);
+
 extern Rune tolowerrune(Rune);
 extern Rune totitlerune(Rune);
 extern Rune toupperrune(Rune);
@@ -404,7 +412,7 @@
 
 extern int enc16chr(int);
 extern int encodefmt(Fmt*);
-extern void exits(char*);
+extern _Noreturn void exits(char*);
 extern double frexp(double, int*);
 extern uintptr getcallerpc(void*);
 extern char* getenv(char*);
@@ -431,7 +439,7 @@
 extern ulong strtoul(char*, char**, int);
 extern vlong strtoll(char*, char**, int);
 extern uvlong strtoull(char*, char**, int);
-extern void sysfatal(char*, ...);
+extern _Noreturn void sysfatal(char*, ...);
 #pragma varargck argpos sysfatal 1
 extern void syslog(int, char*, char*, ...);
 #pragma varargck argpos syslog 3
@@ -677,7 +685,7 @@
 	ulong len;
 } IOchunk;
 
-extern void _exits(char*);
+extern _Noreturn void _exits(char*);
 
 extern void abort(void);
 extern int access(char*, int);
--- a//sys/src/cmd/tcs/hdr.h
+++ b//sys/src/cmd/tcs/hdr.h
@@ -23,6 +23,8 @@
 
 void utf_in(int, long *, struct convert *);
 void utf_out(Rune *, int, long *);
+void utfnfc_out(Rune *, int, long *);
+void utfnfd_out(Rune *, int, long *);
 
 void isoutf_in(int, long *, struct convert *);
 void isoutf_out(Rune *, int, long *);
--- a//sys/src/cmd/tcs/tcs.c
+++ b//sys/src/cmd/tcs/tcs.c
@@ -613,6 +613,10 @@
 	{ "utf-16be", "alias for unicode-be (MIME)", Func, 0, (Fnptr)unicode_out_be },
 	{ "utf-16le", "alias for unicode-le (MIME)", From|Func, 0, (Fnptr)unicode_in_le },
 	{ "utf-16le", "alias for unicode-le (MIME)", Func, 0, (Fnptr)unicode_out_le },
+	{ "nfc", "UTF Normalization Form C", From|Func, 0, (Fnptr)utf_in },
+	{ "nfc", "UTF Normalization Form C", Func, 0, (Fnptr)utfnfc_out },
+	{ "nfd", "UTF Normalization Form D", From|Func, 0, (Fnptr)utf_in },
+	{ "nfd", "UTF Normalization Form D", Func, 0, (Fnptr)utfnfd_out },
 	{ "viet1", "Vietnamese VSCII-1 (1993)", Table, (void *)tabviet1 },
 	{ "viet2", "Vietnamese VSCII-2 (1993)", Table, (void *)tabviet2 },
 	{ "vscii", "Vietnamese VISCII 1.1 (1992)", Table, (void *)tabviscii },
--- a//sys/src/cmd/tcs/utf.c
+++ b//sys/src/cmd/tcs/utf.c
@@ -19,38 +19,39 @@
 void
 utf_in(int fd, long *, struct convert *out)
 {
-	char buf[N];
-	int i, j, c, n, tot;
-	unsigned long l;
+	char buf[N + 1];
+	Rune r;
+	char *p;
+	int n, tot, j;
 
 	tot = 0;
+	j = 0;
 	while((n = read(fd, buf+tot, N-tot)) >= 0){
 		tot += n;
-		for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
-			c = our_mbtowc(&l, buf+i, tot-i);
-			if(c == -1){
-				if(squawk)
-					warn("bad UTF sequence near byte %ld in input", ninput+i);
-				if(clean){
-					i++;
-					continue;
-				}
-				nerrors++;
-				l = Runeerror;
-				c = 1;
-			}
-			runes[j++] = l;
-			i += c;
+		if(tot == 0)
+			break;		/* end of input, nothing pending */
+		/*
+		 * wait for a whole normalization context, but flush
+		 * anyway at end of input or when buf is full, where
+		 * another read could only return 0.
+		 */
+		if(n != 0 && tot < N && fullutfnorm(buf, tot) == buf)
+			continue;
+		/* terminate so chartorune stops at tot on a cut-off rune */
+		buf[tot] = 0;
+		/* NOTE(review): runes[j] = 0 can store at runes[N]; confirm runes has N+1 slots */
+		for(p = buf; p < buf + tot;){
+			p += chartorune(&r, p);
+			runes[j++] = r;
+			runes[j] = 0;
 		}
 		OUT(out, runes, j);
-		tot -= i;
-		ninput += i;
-		if(tot)
-			memmove(buf, buf+i, tot);
+		ninput += tot;
+		j = 0;
+		tot = 0;
 		if(n == 0)
 			break;
 	}
-	OUT(out, runes, 0);
 }
 
 void
@@ -66,6 +67,47 @@
 	noutput += p-obuf;
 	if(p > obuf)
 		write(1, obuf, p-obuf);
+}
+
+/*
+ * Write the normalized form of the n runes at base: composed
+ * if compose is set, decomposed otherwise.  runenorm takes no
+ * source length, so base is presumably read up to its zero; its
+ * return value can be more than it was allowed to store, so the
+ * count is clamped before buf is read back.  utf_out is given
+ * no more than n runes per call, as the plain utf path would.
+ * NOTE(review): decomposed text can be longer than its source;
+ * buf allows a factor of 4, the NFD bound quoted by UAX #15.
+ */
+static void
+normout(Rune *base, int n, int compose)
+{
+	static Rune buf[4*N + 1];
+	int i, m, w;
+
+	if(n <= 0)
+		return;
+	w = runenorm(buf, base, nelem(buf), compose);
+	if(w > nelem(buf)-1)
+		w = nelem(buf)-1;
+	for(i = 0; i < w; i += m){
+		m = w - i;
+		if(m > n)
+			m = n;
+		utf_out(buf+i, m, nil);
+	}
+}
+
+void
+utfnfc_out(Rune *base, int n, long *)
+{
+	normout(base, n, 1);
+}
+
+void
+utfnfd_out(Rune *base, int n, long *)
+{
+	normout(base, n, 0);
 }
 
 void
--- a//sys/src/libc/port/mkfile
+++ b//sys/src/libc/port/mkfile
@@ -62,6 +62,9 @@
 	rand.c\
 	readn.c\
 	rune.c\
+	runebreak.c\
+	runeistype.c\
+	runenorm.c\
 	runestrcat.c\
 	runestrchr.c\
 	runestrcmp.c\
@@ -74,7 +77,7 @@
 	runestrrchr.c\
 	runestrlen.c\
 	runestrstr.c\
-	runetype.c\
+	runetotype.c\
 	sin.c\
 	sinh.c\
 	sqrt.c\
@@ -127,3 +130,16 @@
 </sys/src/cmd/mksyslib
 
 profile.$O: /sys/include/tos.h
+
+runenorm.$O: runenormdata runenorm.c
+runetotype.$O: runetotypedata runetotype.c
+runeistype.$O: runeistypedata runeistype.c
+runebreak.$O: runebreakdata runebreak.c
+
+runenormdata runetotypedata runeistypedata runebreakdata: mkrunetype.c
+	@{
+	eval `{grep '^[A-Z]' /$cputype/mkfile}
+	$CC $CFLAGS -o mkrunetype.$O $prereq
+	$LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O
+	$O.mkrunetype
+	}
--- /dev/null
+++ b//sys/src/libc/port/mkrunetype.c
@@ -1,0 +1,761 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h> + +enum{ + NRUNES = 1<<21 +}; + +typedef struct Param Param; +typedef struct Lvl Lvl; +struct Lvl{ + int bits; + int max; + int mask; +}; +struct Param{ + Lvl idx1; + Lvl idx2; + Lvl data; + + int round1max; +}; + +static void +derive(Lvl *l) +{ + l->max = 1 << l->bits; + l->mask = l->max - 1; +} + +static void +param(Param *p, int idx1, int idx2) +{ + + assert(idx1 + idx2 < 21); + p->idx1.bits = idx1; + p->idx2.bits = idx2; + p->data.bits = 21 - idx1 - idx2; + derive(&p->idx1); + derive(&p->idx2); + derive(&p->data); + + p->round1max = NRUNES/p->data.max; +} + +static int +lkup(Param *p, int *idx1, int *idx2, int *data, int x) +{ + int y, z; + + y = (((x)>>(p->data.bits+p->idx2.bits))&p->idx1.mask); + z = (((x)>>p->data.bits)&p->idx2.mask); + return data[idx2[idx1[y] + z] + (x&p->data.mask)]; +} + +static int +mkarrvar(int fd, char *name, int *d, int len) +{ + int i, sz; + int max, min; + char *t; + + max = min = 0; + for(i = 0; i < len; i++){ + if(d[i] > max) + max = d[i]; + if(d[i] < min) + min = d[i]; + } + if(min == 0){ + if(max < (uchar)~0) + t = "uchar", sz = 1; + else if(max < 0xFFFF) + t = "ushort", sz = 2; + else + t = "uint", sz = 4; + } else { + if(max < 1<<7) + t = "char", sz = 1; + else if(max < 1<<15) + t = "short", sz = 2; + else + t = "int", sz = 4; + } + if(fd < 0) + return sz * len; + + fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len); + for(i = 0; i < len; i++){ + fprint(fd, "%d,", d[i]); + if((i+1) % 16 == 0) + fprint(fd, "\n\t"); + } + fprint(fd, "\n};\n"); + + return sz * len; +} + +static int +mkexceptarr(int fd, char *name, int *d, int n, int all) +{ + int i; + fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 
3 : 2); + for(i = 0; i < n*3; i += 3){ + if(all && d[i] != 0) + fprint(fd, "{0x%X, 0x%X, 0x%X},", d[i], d[i+1], d[i+2]); + else if(!all) + fprint(fd, "{0x%X, 0x%X},", d[i+1], d[i+2]); + if((i+3) % (8*3) == 0) + fprint(fd, "\n\t"); + } + fprint(fd, "\n};\n"); + return n * sizeof(Rune) * 2; +} + +static int +compact(int *data, int *idx, int nidx, int *src, int chunksize) +{ + int i, n, ndata, best; + int *dot, *lp, *rp; + + dot = src; + ndata = 0; + idx[0] = 0; + for(i = 1; i <= nidx; i++){ + rp = dot + chunksize; + lp = rp - 1; + + for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){ + if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0) + best = n+1; + } + memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]); + ndata += (chunksize - best); + idx[i] = idx[i - 1] + (chunksize - best); + dot = rp; + } + return ndata; +} + + +static int +mklkup(int fd, char *label, int *map, Param *p) +{ + static int data[NRUNES]; + static int idx2[NRUNES]; + static int idx2dest[NRUNES]; + static int idx1[NRUNES]; + int i, nidx2, ndata; + int size; + + ndata = compact(data, idx2, p->round1max, map, p->data.max); + nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max); + + if(fd >= 0){ + for(i = 0; i < NRUNES; i++) + if(map[i] != lkup(p, idx1, idx2dest, data, i)) + sysfatal("mismatch in %s at %d %d %d\n", label, i, map[i], lkup(p, idx1, idx2dest, data, i)); + } + + size = mkarrvar(fd, smprint("_%sdata", label), data, ndata); + size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2); + size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max); + if(fd >= 0){ + fprint(fd, "\n"); + fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask); + fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask); + fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask); + fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] 
)\n\n", + label, label, label, label, label, label, label); + } + return size; +} + +static void +mklkupmatrix(char *label, int *map, Param *p) +{ + int bestsize, size, bestx, besty; + int x, y; + + bestsize = bestx = besty = -1; + for(x = 4; x <= 12; x++) + for(y=4; y <= (19 - x); y++){ + param(p, x, y); + size = mklkup(-1, label, map, p); + if(bestsize == -1 || size < bestsize){ + bestx = x; + besty = y; + bestsize = size; + } + } + + assert(bestsize != -1); + fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize); + param(p, bestx, besty); +} + +static int myismerged[NRUNES]; +static int mytoupper[NRUNES]; +static int mytolower[NRUNES]; +static int mytotitle[NRUNES]; +static int mybreak[NRUNES]; + +enum{ DSTART = 0xEEEE }; +static int mydecomp[NRUNES]; +static int mydespecial[256*3]; +static int nspecial; +static int myccc[NRUNES]; + +typedef struct KV KV; +struct KV{ + uint key; + uint val; + ushort next; +}; + +static KV myrecomp[2000]; +static int nrecomp; + +static int recompext[256*3]; +static int nrecompext; + +static uint +hash(uint x) +{ + x ^= x >> 16; + x *= 0x21f0aaad; + x ^= x >> 15; + x *= 0xd35a2d97; + x ^= x >> 15; + return x; +} + +static void +mkrecomp(int fd) +{ + int i; + KV *p; + static KV vals[512]; + static KV coll[1000]; + int over; + int maxchain; + + for(i = 0; i < nelem(vals); i++) + vals[i] = (KV){0, 0, 0}; + for(i = 0; i < nelem(coll); i++) + coll[i] = (KV){0, 0, 0}; + over = 1; + for(i = 0; i < nrecomp; i++){ + p = vals + (hash(myrecomp[i].key) % nelem(vals)); + maxchain = 0; + while(p->key != 0){ + maxchain++; + if(p->next == 0){ + p->next = over; + p = coll + over - 1; + over++; + } else + p = coll + p->next - 1; + } + p->key = myrecomp[i].key; + p->val = myrecomp[i].val; + } + fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) + over-1) * (4+2+2)); + fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t"); + for(p = vals, i = 0;; i++){ + assert(p->val < 0xFFFF); + assert(p->next < 0xFFFF); + 
fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16)); + if((i+1) % 8 == 0) + fprint(fd, "\n\t"); + + if(p == vals+nelem(vals)-1) + p = coll; + else if(p == coll + over - 2) + break; + else + p++; + } + fprint(fd, "\n};\n"); + fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals)); + /* + fprint(fd, + " x ^= x >> 16;\n" + " x *= 0x21f0aaad;\n" + " x ^= x >> 15;\n" + " x *= 0xd35a2d97;\n" + " x ^= x >> 15;\n" + " p = _recompdata + (x%%%d)*2;\n" + "}\n", nelem(vals)); + */ +} + +static void +mktables(void) +{ + Param p; + int tofd, isfd, normfd, breakfd; + int size; + + tofd = create("runetotypedata", OWRITE, 0664); + if(tofd < 0) + sysfatal("could not create runetotypedata: %r"); + param(&p, 10, 7); + size = mklkup(tofd, "upper", mytoupper, &p); + fprint(2, "%s: %d\n", "upper", size); + + size = mklkup(tofd, "lower", mytolower, &p); + fprint(2, "%s: %d\n", "lower", size); + + size = mklkup(tofd, "title", mytotitle, &p); + fprint(2, "%s: %d\n", "title", size); + close(tofd); + + isfd = create("runeistypedata", OWRITE, 0664); + if(isfd < 0) + sysfatal("could not create runeistypedata: %r"); + param(&p, 11, 6); + size = mklkup(isfd, "merged", myismerged, &p); + fprint(2, "%s: %d\n", "merged", size); + fprint(isfd, "static\nenum {\n"); + fprint(isfd, "\tL%s = %s,\n", "space", "1<<0"); + fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1"); + fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2"); + fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3"); + fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4"); + fprint(isfd, "\tL%s = %s,\n", "title", "1<<5"); + fprint(isfd, "};\n"); + close(isfd); + + normfd = create("runenormdata", OWRITE, 0664); + if(normfd < 0) + sysfatal("could not create runenormdata: %r"); + param(&p, 10, 7); + size = mklkup(normfd, "decomp", mydecomp, &p); + fprint(2, "%s: %d\n", "decomp", size); + + param(&p, 9, 7); + size = mklkup(normfd, "ccc", myccc, &p); + fprint(2, "%s: %d\n", "ccc", size); + + mkexceptarr(normfd, "_decompexceptions", mydespecial, 
nspecial, 0); + mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1); + mkrecomp(normfd); + close(normfd); + + param(&p, 10, 6); + breakfd = create("runebreakdata", OWRITE, 0644); + if(breakfd < 0) + sysfatal("could not create runebreakdata: %r"); + size = mklkup(breakfd, "break", mybreak, &p); + fprint(2, "%s: %d\n", "break", size); +} + +enum { + FIELD_CODE, + FIELD_NAME, + FIELD_CATEGORY, + FIELD_COMBINING, + FIELD_BIDIR, + FIELD_DECOMP, + FIELD_DECIMAL_DIG, + FIELD_DIG, + FIELD_NUMERIC_VAL, + FIELD_MIRRORED, + FIELD_UNICODE_1_NAME, + FIELD_COMMENT, + FIELD_UPPER, + FIELD_LOWER, + FIELD_TITLE, + NFIELDS, +}; + +static int +getunicodeline(Biobuf *in, char **fields) +{ + char *p; + + if((p = Brdline(in, '\n')) == nil) + return 0; + + p[Blinelen(in)-1] = '\0'; + + if (getfields(p, fields, NFIELDS + 1, 0, ";") != NFIELDS) + sysfatal("bad number of fields"); + + return 1; +} + +static int +estrtoul(char *s, int base) +{ + char *epr; + Rune code; + + code = strtoul(s, &epr, base); + if(s == epr) + sysfatal("bad code point hex string"); + return code; +} + +enum { + OTHER, + Hebrew_Letter, Newline, Extend, Format, + Katakana, ALetter, MidLetter, MidNum, + MidNumLet, Numeric, ExtendNumLet, WSegSpace, + PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40, + L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0, + EMOJIEX = 0xB0, +}; + +static void +markbreak(void) +{ + Biobuf *b; + char *p, *dot; + int i, s, e; + uchar v; + + b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD); + if(b == nil) + sysfatal("could not load word breaks: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "..")) != nil){ + *dot = 0; + dot += 2; + s = estrtoul(p, 16); + e = estrtoul(dot, 16); + } else { + s = e = estrtoul(p, 16); + dot = p; + } + v = 0; + if(strstr(dot, "ExtendNumLet") != nil) + v = ExtendNumLet; + else if(strstr(dot, "Hebrew_Letter") != nil) + v = 
Hebrew_Letter; + else if(strstr(dot, "Newline") != nil) + v = Newline; + else if(strstr(dot, "Extend") != nil) + v = Extend; + else if(strstr(dot, "Format") != nil) + v = Format; + else if(strstr(dot, "Katakana") != nil) + v = Katakana; + else if(strstr(dot, "ALetter") != nil) + v = ALetter; + else if(strstr(dot, "MidLetter") != nil) + v = MidLetter; + else if(strstr(dot, "MidNum") != nil) + v = MidNum; + else if(strstr(dot, "Numeric") != nil) + v = Numeric; + else if(strstr(dot, "WSegSpace") != nil) + v = WSegSpace; + for(i = s; i <= e; i++) + mybreak[i] = v; + } + Bterm(b); + b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD); + if(b == nil) + sysfatal("could not load Grapheme breaks: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "..")) != nil){ + *dot = 0; + dot += 2; + s = estrtoul(p, 16); + e = estrtoul(dot, 16); + } else { + s = e = estrtoul(p, 16); + dot = p; + } + v = 0; + if(strstr(dot, "; Prepend #") != nil) + v = PREPEND; + else if(strstr(dot, "; Control #") != nil) + v = CONTROL; + else if(strstr(dot, "; Extend #") != nil) + v = EXTEND; + else if(strstr(dot, "; Regional_Indicator #") != nil) + v = REGION; + else if(strstr(dot, "; SpacingMark #") != nil) + v = SPACEMK; + else if(strstr(dot, "; L #") != nil) + v = L; + else if(strstr(dot, "; V #") != nil) + v = V; + else if(strstr(dot, "; T #") != nil) + v = T; + else if(strstr(dot, "; LV #") != nil) + v = LV; + else if(strstr(dot, "; LVT #") != nil) + v = LVT; + for(i = s; i <= e; i++) + mybreak[i] |= v; + } + Bterm(b); + + b = Bopen("/lib/ucd/emoji-data.txt", OREAD); + if(b == nil) + sysfatal("could not load emoji-data: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + if((dot = strstr(p, "..")) != nil){ + *dot = 0; + dot += 2; + s = estrtoul(p, 16); + e = estrtoul(dot, 16); + } else { + s = e = estrtoul(p, 16); + dot = p; + } + v = 0; + 
if(strstr(dot, "; Extended_Pictographic") != nil) + v = EMOJIEX; + for(i = s; i <= e; i++) + mybreak[i] |= v; + } + Bterm(b); +} + +/* + * Read /lib/ucd/CompositionExclusions.txt and zero the composed value of + * every listed code point: first in myrecomp[].val, and when it is not + * found there, in the first slot of the matching recompext triple. + */ +static void +markexclusions(void) +{ + Biobuf *b; + char *p; + int i; + uint x; + + b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD); + if(b == nil) + sysfatal("could not load composition exclusions: %r"); + + while((p = Brdline(b, '\n')) != nil){ + p[Blinelen(b)-1] = 0; + if(p[0] == 0 || p[0] == '#') + continue; + x = estrtoul(p, 16); + for(i = 0; i < nrecomp; i++){ + if(myrecomp[i].val == x){ + myrecomp[i].val = 0; + break; + } + } + /* i == nrecomp means the loop above found nothing */ + if(i == nrecomp){ + for(i = 0; i < nrecompext; i++){ + if(recompext[i*3] == x){ + recompext[i*3] = 0; + break; + } + } + } + } + Bterm(b); +} + +/* + * Build the rune tables from /lib/ucd/UnicodeData.txt. + * The arguments are unused; the second one is declared char** to match + * the conventional main signature. + */ +void +main(int, char**) +{ + static char myisspace[NRUNES]; + static char myisalpha[NRUNES]; + static char myisdigit[NRUNES]; + static char myisupper[NRUNES]; + static char myislower[NRUNES]; + static char myistitle[NRUNES]; + Biobuf *in; + char *fields[NFIELDS + 1], *fields2[NFIELDS + 1]; + char *p, *d; + int i, code, last; + int decomp[2], *ip; + + in = Bopen("/lib/ucd/UnicodeData.txt", OREAD); + if(in == nil) + sysfatal("can't open UnicodeData.txt: %r"); + + /* -1 marks a case mapping that the data file has not defined */ + for(i = 0; i < NRUNES; i++){ + mytoupper[i] = -1; + mytolower[i] = -1; + mytotitle[i] = -1; + mydecomp[i] = 0; + myccc[i] = 0; + mybreak[i] = 0; + } + + myisspace['\t'] = 1; + myisspace['\n'] = 1; + myisspace['\r'] = 1; + myisspace['\f'] = 1; + myisspace['\v'] = 1; + myisspace[0x85] = 1; /* control char, "next line" */ + myisspace[0xfeff] = 1; /* zero-width non-break space */ + + last = -1; + nspecial = nrecomp = nrecompext = 0; + while(getunicodeline(in, fields)){ + code = estrtoul(fields[FIELD_CODE], 16); + if (code >= NRUNES) + sysfatal("code-point value too big: %x", code); + /* code points must be strictly increasing */ + if(code <= last) + sysfatal("bad code sequence: %x then %x", last, code); + last = code; + + p = fields[FIELD_CATEGORY]; + /* a ", First>" name opens a range that the next line must close */ + if(strstr(fields[FIELD_NAME], ", First>") != nil){ + if(!getunicodeline(in, fields2)) + 
sysfatal("range start at eof"); + if (strstr(fields2[FIELD_NAME], ", Last>") == nil) + sysfatal("range start not followed by range end"); + last = estrtoul(fields2[FIELD_CODE], 16); /* last now holds the final code point of the range */ + if(last <= code) + sysfatal("range out of sequence: %x then %x", code, last); + if(strcmp(p, fields2[FIELD_CATEGORY]) != 0) + sysfatal("range with mismatched category"); + } + + d = fields[FIELD_DECOMP]; /* a decomposition is recorded only when the field is non-empty and contains no '<' */ + if(strlen(d) > 0 && strstr(d, "<") == nil){ + decomp[0] = estrtoul(d, 16); + d = strstr(d, " "); /* a space separates the first code point from an optional second one */ + if(d == nil){ + /* singleton recompositions are verboden */ + decomp[1] = 0; + if(decomp[0] > 0xFFFF){ /* too wide for a 16-bit half: keep the triple in mydespecial and store DSTART+index in the high half of mydecomp */ + //fprint(2, "case1 %X %X\n", code, decomp[0]); + ip = mydespecial + nspecial*3; + ip[0] = code; + ip[1] = decomp[0]; + ip[2] = 0; + mydecomp[code] = (DSTART+nspecial)<<16; + nspecial++; + } else + mydecomp[code] = decomp[0]<<16; + } else { + d++; + decomp[1] = estrtoul(d, 16); + if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){ /* wide pair: recorded both in mydespecial and in recompext */ + //fprint(2, "case2 %X %X %X\n", code, decomp[0], decomp[1]); + ip = mydespecial + nspecial*3; + ip[0] = code; + ip[1] = decomp[0]; + ip[2] = decomp[1]; + mydecomp[code] = (DSTART+nspecial)<<16; + nspecial++; + ip = recompext + nrecompext*3; + ip[0] = code; + ip[1] = decomp[0]; + ip[2] = decomp[1]; + nrecompext++; + } else { /* both halves fit in 16 bits: pack them into one word and add the reverse mapping to myrecomp */ + mydecomp[code] = decomp[0]<<16 | decomp[1]; + myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0}; + } + } + } + /* apply this line's category, case mappings and combining class to every code point from code through last */ + for (; code <= last; code++){ + if(p[0] == 'L') + myisalpha[code] = 1; + if(p[0] == 'Z') + myisspace[code] = 1; + + if(strcmp(p, "Lu") == 0) + myisupper[code] = 1; + if(strcmp(p, "Ll") == 0) + myislower[code] = 1; + + if(strcmp(p, "Lt") == 0) + myistitle[code] = 1; + + if(strcmp(p, "Nd") == 0) + myisdigit[code] = 1; + + if(fields[FIELD_UPPER][0] != '\0') + mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16); + + if(fields[FIELD_LOWER][0] != '\0') + mytolower[code] = estrtoul(fields[FIELD_LOWER], 16); + + if(fields[FIELD_TITLE][0] != '\0') + mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16); + + myccc[code] = 
estrtoul(fields[FIELD_COMBINING], 10); /* the combining class field is parsed as decimal */ + } + } + + Bterm(in); + + markexclusions(); + + /* + * according to standard, if totitle(x) is not defined in ucd + * but toupper(x) is, then totitle is defined to be toupper(x) + */ + for(i = 0; i < NRUNES; i++){ + if(mytotitle[i] == -1 + && mytoupper[i] != -1 + && !myistitle[i]) + mytotitle[i] = mytoupper[i]; + } + + /* + * A couple corrections: + * is*(to*(x)) should be true. + * restore undefined transformations. + * store offset instead of value, makes them sparse. + */ + for(i = 0; i < NRUNES; i++){ + if(mytoupper[i] != -1) + myisupper[mytoupper[i]] = 1; + else + mytoupper[i] = i; + + if(mytolower[i] != -1) + myislower[mytolower[i]] = 1; + else + mytolower[i] = i; + + if(mytotitle[i] != -1) + myistitle[mytotitle[i]] = 1; + else + mytotitle[i] = i; + + mytoupper[i] = mytoupper[i] - i; + mytolower[i] = mytolower[i] - i; + mytotitle[i] = mytotitle[i] - i; + } + + /* fold the six is* tables into one flag byte per rune: bit 0 space, 1 alpha, 2 digit, 3 upper, 4 lower, 5 title */ uchar b; + for(i = 0; i < NRUNES; i++){ + b = 0; + if(myisspace[i]) + b |= 1<<0; + if(myisalpha[i]) + b |= 1<<1; + if(myisdigit[i]) + b |= 1<<2; + if(myisupper[i]) + b |= 1<<3; + if(myislower[i]) + b |= 1<<4; + if(myistitle[i]) + b |= 1<<5; + + myismerged[i] = b; + } + + markbreak(); + mktables(); + exits(nil); +} --- /dev/null +++ b//sys/src/libc/port/runebreak.c @@ -1,0 +1,149 @@ +#include <u.h> +#include <libc.h> + +#include "/sys/src/libc/port/runebreakdata" + +enum { + OTHER, + Hebrew_Letter, Newline, Extend, Format, + Katakana, ALetter, MidLetter, MidNum, + MidNumLet, Numeric, ExtendNumLet, WSegSpace, + PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40, + L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0, + EMOJIEX = 0xB0, + + ZWJ = 0x200DU, + LINETAB = 0xB, +}; /* ZWJ and LINETAB are rune values; the other names are break class codes. IS compares the low nibble of x with y, ISG the high nibble. */ + +#define IS(x, y) ((x&0xf) == y) +#define ISG(x, y) ((x&0xf0) == y) + /* runegbreak inspects the first two runes of s (l and r) and their breaklkup classes; it returns s when either rune is 0. */ +Rune* +runegbreak(Rune *s) +{ + Rune l, r; + uchar lt, rt; + Rune *p; + + p = s; + if((l = *p++) == 0) + return s; + if((r = *p) == 0) + return s; + lt = breaklkup(l); + rt = breaklkup(r); + if(l == 
'\r' && r == '\n') + goto Done; + if(ISG(lt, CONTROL) || l == '\r' || l == '\n') + return p; + if(ISG(rt, CONTROL) || r == '\r' || r == '\n') + return p; + /* Hangul syllable sequences: L x (L|V|LV|LVT), (LV|V) x (V|T), (LVT|T) x T */ + if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT))) + goto Done; + if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T))) + goto Done; + if((ISG(lt, LVT) || ISG(lt, T)) && ISG(rt, T)) + goto Done; + if(ISG(rt, SPACEMK) || ISG(lt, PREPEND)) + goto Done; + /* emoji followed by any number of EXTEND runes, then ZWJ, then another emoji */ + if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){ + while(ISG(rt, EXTEND)){ + p++; + if((r = *p) == 0) + return s; + rt = breaklkup(r); + } + if(r != ZWJ) + return p; + p++; + if((r = *p) == 0) + return s; + rt = breaklkup(r); + if(ISG(rt, EMOJIEX)) + goto Done; + return p; + } + if(ISG(rt, EXTEND) || r == ZWJ) + goto Done; + if(ISG(lt, REGION) && ISG(rt, REGION)) + goto Done; + + return p; + + /* Done: no break before *p; answer p + 1, or s when the string ends right after *p */ +Done: + if(p[1] == 0) + return s; + return p + 1; +} + +/* AH: class t is ALetter or Hebrew_Letter */ +#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter)) +/* MNLQ: class t is MidNumLet, or the rune c itself is an apostrophe */ +#define MNLQ(t, c) (IS(t, MidNumLet) || (c) == '\'') + +/* + * runewbreak looks at the first two runes of s (l and r) and their + * breaklkup classes. It returns s when either rune is 0, p (one past l) + * when a word break is allowed between them, and otherwise the result of + * the Done label: p + 1, or s when the string ends right after *p. + */ +Rune* +runewbreak(Rune *s) +{ + Rune l, r; + uchar lt, rt; + Rune *p; + + p = s; + if((l = *p++) == 0) + return s; + if((r = *p) == 0) + return s; + lt = breaklkup(l); + rt = breaklkup(r); + if(l == '\r' && r == '\n') + goto Done; + /* break after a newline rune ... */ + if(l == '\r' || l == '\n' || l == LINETAB) + return p; + /* ... and before one: the right-hand rune is the one to test here */ + if(r == '\r' || r == '\n' || r == LINETAB) + return p; + if(IS(lt, WSegSpace) && IS(rt, WSegSpace)) + goto Done; + if(IS(rt, Format) || IS(rt, Extend)) + goto Done; + if(AH(lt)){ + if(AH(rt)) + goto Done; + /* letter, mid-word punctuation, letter: needs one rune of lookahead */ + if((IS(rt, MidLetter) || MNLQ(rt, r)) && p[1] != 0 && AH(breaklkup(p[1]))) + goto Done; + if(IS(lt, Hebrew_Letter) && r == '\'') + goto Done; + if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter)) + goto Done; + if(IS(rt, Numeric)) + goto Done; + } + if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric))) + goto Done; + /* digit, mid-number punctuation, digit */ + if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt, r)) && p[1] != 0 && IS(breaklkup(p[1]), Numeric)) + goto Done; + if(IS(lt, 
Katakana) && IS(rt, Katakana)) + goto Done; + if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet)) + if(IS(rt, ExtendNumLet)) + goto Done; + if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana))) + goto Done; + if(ISG(lt, REGION)){ /* REGION pair, either adjacent or with a ZWJ in between */ + if(ISG(rt, REGION)) + goto Done; + if(r != ZWJ) + return p; + p++; + if((r = *p) == 0) + return s; + rt = breaklkup(r); + if(ISG(rt, REGION)) + goto Done; + } + + return p; + +Done: /* no break before *p: answer p + 1, or s when the string ends right after *p */ + if(p[1] == 0) + return s; + return p + 1; +} --- /dev/null +++ b//sys/src/libc/port/runeistype.c @@ -1,0 +1,40 @@ +#include <u.h> +#include <libc.h> + +#include "/sys/src/libc/port/runeistypedata" + /* Each predicate below is true when all bits of its L* mask are set in mergedlkup(c). */ +int +isspacerune(Rune c) +{ + return (mergedlkup(c) & Lspace) == Lspace; +} + +int +isalpharune(Rune c) +{ + return (mergedlkup(c) & Lalpha) == Lalpha; +} + +int +isdigitrune(Rune c) +{ + return (mergedlkup(c) & Ldigit) == Ldigit; +} + +int +isupperrune(Rune c) +{ + return (mergedlkup(c) & Lupper) == Lupper; +} + +int +islowerrune(Rune c) +{ + return (mergedlkup(c) & Llower) == Llower; +} + +int +istitlerune(Rune c) +{ + return (mergedlkup(c) & Ltitle) == Ltitle; +} --- /dev/null +++ b//sys/src/libc/port/runenorm.c @@ -1,0 +1,328 @@ +#include <u.h> +#include <libc.h> + +#include "/sys/src/libc/port/runenormdata" + +//Unicode Standard: Section 3.12 Conjoining Jamo Behavior +enum { + SBase = 0xAC00, + LBase = 0x1100, + VBase = 0x1161, + TBase = 0x11A7, + + LCount = 19, + VCount = 21, + TCount = 28, + NCount = VCount * TCount, + SCount = LCount * NCount, + + LLast = LBase + LCount - 1, + SLast = SBase + SCount - 1, + VLast = VBase + VCount - 1, + TLast = TBase + TCount - 1, +}; + /* decomposerune writes one decomposition step of c into dst[0] and dst[1]. Runes in SBase..SLast are split arithmetically: into the syllable without its trailing part plus TBase + x when x = (c - SBase) % TCount is non-zero, otherwise into an LBase-relative and a VBase-relative rune. All other runes go through decomplkup. */ +void +decomposerune(Rune c, Rune dst[2]) +{ + uint x; + + if(c >= SBase && c <= SLast){ + c -= SBase; + x = c % TCount; + if(x){ + dst[0] = SBase + ((c / TCount) * TCount); + dst[1] = TBase + x; + return; + } + dst[0] = LBase + (c / NCount); + dst[1] = VBase + ((c % NCount) / TCount); + return; + } + x = decomplkup(c); + if((x & (ushort)~0) != 
0){ + dst[0] = x>>16; + dst[1] = x & (ushort)~0; + return; + } + x >>= 16; + /* high halves in this window are indexes into _decompexceptions, offset by 0xEEEE */ + if(x >= 0xEEEE && x <0xF8FF){ + memmove(dst, _decompexceptions[x - 0xEEEE], sizeof(Rune)*2); + return; + } + dst[0] = x; + dst[1] = 0; +} + +/* + * composerune returns the rune composed from the pair r[0], r[1], + * or 0 when the pair does not compose. Hangul pairs are computed + * arithmetically, pairs with a member above 0xFFFF are searched + * linearly in _recompexceptions, everything else is looked up in the + * _recompdata hash table with collisions chained through _recompcoll. + */ +Rune +composerune(Rune r[2]) +{ + uint x, y, *p, next; + + if(r[0] >= LBase && r[0] <= LLast){ + if(r[1] < VBase || r[1] > VLast) + return 0; + x = (r[0] - LBase) * NCount + (r[1] - VBase) * TCount; + return SBase + x; + } + /* a syllable without trailing part may take one; TBase itself is excluded */ + if(r[0] >= SBase && r[0] <= SLast && (r[0] - SBase) % TCount == 0){ + if(r[1] > TBase && r[1] <= TLast) + return r[0] + (r[1] - TBase); + return 0; + } + if(r[0] > (ushort)~0 || r[1] > (ushort)~0){ + for(x = 0; x < nelem(_recompexceptions); x++) + if(r[0] == _recompexceptions[x][1] && r[1] == _recompexceptions[x][2]) + return _recompexceptions[x][0]; + return 0; + } + /* y is the packed key; x is hashed down to one of 512 buckets */ + y = x = r[0]<<16 | r[1]; + x ^= x >> 16; + x *= 0x21f0aaad; + x ^= x >> 15; + x *= 0xd35a2d97; + x ^= x >> 15; + p = _recompdata + (x%512)*2; + while(p[0] != y){ + /* high half of p[1] is the 1-based index of the next chain entry, 0 ends the chain */ + next = p[1]>>16; + if(!next) + return 0; + p = _recompcoll + (next-1)*2; + } + return p[1] & 0xFFFF; +} + +/* runeccc returns the value of ccclkup for c */ +int +runeccc(Rune c) +{ + return ccclkup(c); +} + +/* + * runecccsort orders a[0..len-1] in place by combining class with a + * stable exchange sort. Two neighbours are exchanged only when + * ccc(a[i]) > ccc(a[i+1]) and ccc(a[i+1]) > 0, so a rune of class 0 + * is never moved in front of its predecessor. + */ +void +runecccsort(Rune *a, int len) +{ + Rune r; + int i; + int fail; + int ca, cb; + + do { + fail = 0; + for(i = 0; i < len - 1; i++){ + ca = runeccc(a[i]); + cb = runeccc(a[i+1]); + if(cb > 0 && ca > cb){ + r = a[i]; + a[i] = a[i+1]; + a[i + 1] = r; + fail = 1; + } + } + } while(fail); +} + +/* + * fullutfnorm scans at most n bytes of s. It returns s whenever + * fullrune reports that the next rune is incomplete or the bytes run out. + * NOTE(review): on success p has already been advanced past the rune + * that ended the run, whereas fullrunenorm returns a pointer to that + * rune - confirm which is intended. + */ +char* +fullutfnorm(char *s, int n) +{ + Rune r, peek; + char *p, *p2; + + p = s; + if(fullrune(p, n) == 0) + return s; + + p += chartorune(&r, p); + n -= (p - s); + + if((r >= LBase && r <= LLast) || (r >= SBase && r <= SLast)){ + do { + if(fullrune(p, n) == 0) + return s; + p2 = p + chartorune(&peek, p); + n -= (p2 - p); + p = p2; + /* the byte budget guards both range tests */ + } while(n > 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast))); + if(n <= 0) + return s; + return p; + } + + do { + if(fullrune(p, n) == 0) + return s; + p2 = p + chartorune(&peek, p); + n -= (p2 - p); + p = p2; + if(runeccc(peek) == 0) + return p; + 
} while(n > 0); + + return s; +} + +/* + * fullrunenorm examines at most n runes starting at r. + * When r starts with a rune in LBase..LLast or SBase..SLast it skips the + * following runes in VBase..VLast or TBase+1..TLast; otherwise it looks + * for the next rune whose runeccc is 0. It returns a pointer to the rune + * that ends the run, or r when no such rune lies inside the n runes. + * Nothing at or beyond r + n is read. + */ +Rune* +fullrunenorm(Rune *r, int n) +{ + Rune *e, *p; + + p = r; + e = p + n; + + /* an empty window holds nothing to look at */ + if(n <= 0) + return r; + + if((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast)){ + p++; + /* the bound check has to guard both range tests */ + while(p < e && ((*p >= VBase && *p <= VLast) || (*p > TBase && *p <= TLast))) + p++; + + if(p >= e) + return r; + return p; + } + + for(; p < e && p + 1 < e; p++) + if(runeccc(p[1]) == 0) + return p + 1; + + return r; +} + +/* + * _runenorm is the shared worker for the Rune and UTF-8 front ends. + * When src is non-nil it reads runes from src and writes to dst, + * otherwise it reads UTF-8 from ssrc and writes to sdst. max bounds the + * output and compose selects recomposition after the sort. + * Each round fills _stack around its middle: decomposition results grow + * downwards from stack - 1 through sp, following runes are appended + * upwards through tp. + */ +int +_runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose) +{ + Rune c, r[2], _stack[32]; + Rune *p, *stack, *sp, *tp; + char *strp, *strstop; + Rune *rp, *rrp; + Rune *stop; + Rune peek; + int w, w2, size; + int mode; + + /* the unused input is pointed at an empty string so the loop test works for both modes */ + if(src){ + mode = 1; + p = src; + stop = dst + (max - 1); + strp = ""; + strstop = nil; + } else { + mode = 0; + p = L""; + stop = nil; + strp = ssrc; + strstop = sdst + (max - 1); + } + + stack = _stack + nelem(_stack)/2; + size = 0; + w = w2 = 0; + while(*strp || *p){ + if(mode) + c = *p; + else + w = chartorune(&c, strp); + + sp = stack - 1; + tp = stack; + /* decompose repeatedly: second halves are pushed downwards, the last first half lands in *sp */ + decomposerune(c, r); + while(r[0] != 0){ + c = r[0]; + if(r[1] != 0){ + *sp-- = r[1]; + if(sp == _stack) + break; + } + decomposerune(c, r); + } + + *sp = c; + if(mode) + peek = p[1]; + else + w2 = chartorune(&peek, strp+w); + + if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <= SLast)){ + /* both range tests are meant to sit behind the end-of-string test */ + while(peek != 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast))){ + *tp++ = peek; + if(mode){ + p++; + peek = p[1]; + } else { + strp += w; + w = w2; + w2 = chartorune(&peek, strp+w); + } + if(tp == _stack + nelem(_stack)) + break; + } + } + /* append following runes with a non-zero combining class, decomposed one step */ + while(peek != 0 && runeccc(peek) != 0){ + decomposerune(peek, r); + if(r[1] != 0){ + if(tp+1 >= _stack + nelem(_stack)) + break; + *tp++ = r[0]; + *tp++ = r[1]; + } else if(r[0] != 0) + *tp++ = r[0]; + else + *tp++ = peek; + + if(mode){ + p++; + peek = p[1]; + } else { + strp += w; + w = w2; + w2 = chartorune(&peek, strp+w); + } + if(tp == _stack + nelem(_stack)) + break; + } + runecccsort(sp, tp - sp); + 
+ /* try to combine the leading rune with each later one; a successful pair shifts the runes in between up by one */ + if(compose && runeccc(*sp) == 0){ + for(rp = sp + 1; rp < tp; rp++){ + r[0] = *sp; + r[1] = *rp; + c = composerune(r); + if(c != 0){ + *sp = c; + for(rrp = rp; rrp > sp; rrp--) + *rrp = rrp[-1]; + sp++; + } else while(rp + 1 < tp && runeccc(*rp) == runeccc(*(rp+1))) + rp++; + } + } + + /* emit what fits; size counts every element whether or not it was written */ + for(; sp < tp; sp++){ + if(mode){ + if(dst < stop) + *dst++ = *sp; + size++; + } else { + w2 = runelen(*sp); + /* strstop is the slot kept for the NUL, so a rune fits when its bytes end at or before it */ + if(sdst+w2 <= strstop) + sdst += runetochar(sdst, sp); + size += w2; + } + } + if(mode) + p++; + else + strp += w; + } + if(mode) + *dst = 0; + else + *sdst = 0; + return size; +} + +/* runenorm: Rune front end of _runenorm */ +int +runenorm(Rune *dst, Rune *src, int max, int compose) +{ + return _runenorm(dst, src, nil, nil, max, compose); +} + +/* utfnorm: UTF-8 front end of _runenorm */ +int +utfnorm(char *dst, char *src, int max, int compose) +{ + return _runenorm(nil, nil, dst, src, max, compose); +} --- /dev/null +++ b//sys/src/libc/port/runetotype.c @@ -1,0 +1,22 @@ +#include <u.h> +#include <libc.h> + +#include "/sys/src/libc/port/runetotypedata" + +Rune +toupperrune(Rune c) +{ + return c + upperlkup(c); +} + +Rune +tolowerrune(Rune c) +{ + return c + lowerlkup(c); +} + +Rune +totitlerune(Rune c) +{ + return c + titlelkup(c); +} --- a//sys/src/libc/port/runetype.c +++ /dev/null @@ -1,1181 +1,0 @@ -#include <u.h> -#include <libc.h> - -/* - * alpha ranges - - * only covers ranges not in lower||upper - */ -static -Rune _alpha2[] = -{ - 0x00d8, 0x00f6, /* Ø - ö */ - 0x00f8, 0x01f5, /* ø - ǵ */ - 0x0250, 0x02a8, /* ɐ - ʨ */ - 0x038e, 0x03a1, /* Ύ - Ρ */ - 0x03a3, 0x03ce, /* Σ - ώ */ - 0x03d0, 0x03d6, /* ϐ - ϖ */ - 0x03e2, 0x03f3, /* Ϣ - ϳ */ - 0x0490, 0x04c4, /* Ґ - ӄ */ - 0x0561, 0x0587, /* ա - և */ - 0x05d0, 0x05ea, /* א - ת */ - 0x05f0, 0x05f2, /* װ - ײ */ - 0x0621, 0x063a, /* ء - غ */ - 0x0640, 0x064a, /* ـ - ي */ - 0x0671, 0x06b7, /* ٱ - ڷ */ - 0x06ba, 0x06be, /* ں - ھ */ - 0x06c0, 0x06ce, /* ۀ - ێ */ - 0x06d0, 0x06d3, /* ې - ۓ */ - 0x0905, 0x0939, /* अ - ह */ - 0x0958, 0x0961, /* क़ - ॡ */ - 0x0985, 0x098c, /* অ - ঌ */ - 0x098f, 0x0990, /* এ - ঐ */ - 
0x0993, 0x09a8, /* ও - ন */ - 0x09aa, 0x09b0, /* প - র */ - 0x09b6, 0x09b9, /* শ - হ */ - 0x09dc, 0x09dd, /* ড় - ঢ় */ - 0x09df, 0x09e1, /* য় - ৡ */ - 0x09f0, 0x09f1, /* ৰ - ৱ */ - 0x0a05, 0x0a0a, /* ਅ - ਊ */ - 0x0a0f, 0x0a10, /* ਏ - ਐ */ - 0x0a13, 0x0a28, /* ਓ - ਨ */ - 0x0a2a, 0x0a30, /* ਪ - ਰ */ - 0x0a32, 0x0a33, /* ਲ - ਲ਼ */ - 0x0a35, 0x0a36, /* ਵ - ਸ਼ */ - 0x0a38, 0x0a39, /* ਸ - ਹ */ - 0x0a59, 0x0a5c, /* ਖ਼ - ੜ */ - 0x0a85, 0x0a8b, /* અ - ઋ */ - 0x0a8f, 0x0a91, /* એ - ઑ */ - 0x0a93, 0x0aa8, /* ઓ - ન */ - 0x0aaa, 0x0ab0, /* પ - ર */ - 0x0ab2, 0x0ab3, /* લ - ળ */ - 0x0ab5, 0x0ab9, /* વ - હ */ - 0x0b05, 0x0b0c, /* ଅ - ଌ */ - 0x0b0f, 0x0b10, /* ଏ - ଐ */ - 0x0b13, 0x0b28, /* ଓ - ନ */ - 0x0b2a, 0x0b30, /* ପ - ର */ - 0x0b32, 0x0b33, /* ଲ - ଳ */ - 0x0b36, 0x0b39, /* ଶ - ହ */ - 0x0b5c, 0x0b5d, /* ଡ଼ - ଢ଼ */ - 0x0b5f, 0x0b61, /* ୟ - ୡ */ - 0x0b85, 0x0b8a, /* அ - ஊ */ - 0x0b8e, 0x0b90, /* எ - ஐ */ - 0x0b92, 0x0b95, /* ஒ - க */ - 0x0b99, 0x0b9a, /* ங - ச */ - 0x0b9e, 0x0b9f, /* ஞ - ட */ - 0x0ba3, 0x0ba4, /* ண - த */ - 0x0ba8, 0x0baa, /* ந - ப */ - 0x0bae, 0x0bb5, /* ம - வ */ - 0x0bb7, 0x0bb9, /* ஷ - ஹ */ - 0x0c05, 0x0c0c, /* అ - ఌ */ - 0x0c0e, 0x0c10, /* ఎ - ఐ */ - 0x0c12, 0x0c28, /* ఒ - న */ - 0x0c2a, 0x0c33, /* ప - ళ */ - 0x0c35, 0x0c39, /* వ - హ */ - 0x0c60, 0x0c61, /* ౠ - ౡ */ - 0x0c85, 0x0c8c, /* ಅ - ಌ */ - 0x0c8e, 0x0c90, /* ಎ - ಐ */ - 0x0c92, 0x0ca8, /* ಒ - ನ */ - 0x0caa, 0x0cb3, /* ಪ - ಳ */ - 0x0cb5, 0x0cb9, /* ವ - ಹ */ - 0x0ce0, 0x0ce1, /* ೠ - ೡ */ - 0x0d05, 0x0d0c, /* അ - ഌ */ - 0x0d0e, 0x0d10, /* എ - ഐ */ - 0x0d12, 0x0d28, /* ഒ - ന */ - 0x0d2a, 0x0d39, /* പ - ഹ */ - 0x0d60, 0x0d61, /* ൠ - ൡ */ - 0x0e01, 0x0e30, /* ก - ะ */ - 0x0e32, 0x0e33, /* า - ำ */ - 0x0e40, 0x0e46, /* เ - ๆ */ - 0x0e5a, 0x0e5b, /* ๚ - ๛ */ - 0x0e81, 0x0e82, /* ກ - ຂ */ - 0x0e87, 0x0e88, /* ງ - ຈ */ - 0x0e94, 0x0e97, /* ດ - ທ */ - 0x0e99, 0x0e9f, /* ນ - ຟ */ - 0x0ea1, 0x0ea3, /* ມ - ຣ */ - 0x0eaa, 0x0eab, /* ສ - ຫ */ - 0x0ead, 0x0eae, /* ອ - ຮ */ - 0x0eb2, 0x0eb3, /* າ - ຳ */ - 0x0ec0, 
0x0ec4, /* ເ - ໄ */ - 0x0edc, 0x0edd, /* ໜ - ໝ */ - 0x0f18, 0x0f19, /* ༘ - ༙ */ - 0x0f40, 0x0f47, /* ཀ - ཇ */ - 0x0f49, 0x0f69, /* ཉ - ཀྵ */ - 0x10d0, 0x10f6, /* ა - ჶ */ - 0x1100, 0x1159, /* ᄀ - ᅙ */ - 0x115f, 0x11a2, /* ᅟ - ᆢ */ - 0x11a8, 0x11f9, /* ᆨ - ᇹ */ - 0x1e00, 0x1e9b, /* Ḁ - ẛ */ - 0x1f50, 0x1f57, /* ὐ - ὗ */ - 0x1f80, 0x1fb4, /* ᾀ - ᾴ */ - 0x1fb6, 0x1fbc, /* ᾶ - ᾼ */ - 0x1fc2, 0x1fc4, /* ῂ - ῄ */ - 0x1fc6, 0x1fcc, /* ῆ - ῌ */ - 0x1fd0, 0x1fd3, /* ῐ - ΐ */ - 0x1fd6, 0x1fdb, /* ῖ - Ί */ - 0x1fe0, 0x1fec, /* ῠ - Ῥ */ - 0x1ff2, 0x1ff4, /* ῲ - ῴ */ - 0x1ff6, 0x1ffc, /* ῶ - ῼ */ - 0x210a, 0x2113, /* ℊ - ℓ */ - 0x2115, 0x211d, /* ℕ - ℝ */ - 0x2120, 0x2122, /* ℠ - ™ */ - 0x212a, 0x2131, /* K - ℱ */ - 0x2133, 0x2138, /* ℳ - ℸ */ - 0x3041, 0x3094, /* ぁ - ゔ */ - 0x30a1, 0x30fa, /* ァ - ヺ */ - 0x3105, 0x312c, /* ㄅ - ㄬ */ - 0x3131, 0x318e, /* ㄱ - ㆎ */ - 0x3192, 0x319f, /* ㆒ - ㆟ */ - 0x3260, 0x327b, /* ㉠ - ㉻ */ - 0x328a, 0x32b0, /* ㊊ - ㊰ */ - 0x32d0, 0x32fe, /* ㋐ - ㋾ */ - 0x3300, 0x3357, /* ㌀ - ㍗ */ - 0x3371, 0x3376, /* ㍱ - ㍶ */ - 0x337b, 0x3394, /* ㍻ - ㎔ */ - 0x3399, 0x339e, /* ㎙ - ㎞ */ - 0x33a9, 0x33ad, /* ㎩ - ㎭ */ - 0x33b0, 0x33c1, /* ㎰ - ㏁ */ - 0x33c3, 0x33c5, /* ㏃ - ㏅ */ - 0x33c7, 0x33d7, /* ㏇ - ㏗ */ - 0x33d9, 0x33dd, /* ㏙ - ㏝ */ - 0x4e00, 0x9fff, /* 一 - 鿿 */ - 0xac00, 0xd7a3, /* 가 - 힣 */ - 0xf900, 0xfb06, /* 豈 - st */ - 0xfb13, 0xfb17, /* ﬓ - ﬗ */ - 0xfb1f, 0xfb28, /* ײַ - ﬨ */ - 0xfb2a, 0xfb36, /* שׁ - זּ */ - 0xfb38, 0xfb3c, /* טּ - לּ */ - 0xfb40, 0xfb41, /* נּ - סּ */ - 0xfb43, 0xfb44, /* ףּ - פּ */ - 0xfb46, 0xfbb1, /* צּ - ﮱ */ - 0xfbd3, 0xfd3d, /* ﯓ - ﴽ */ - 0xfd50, 0xfd8f, /* ﵐ - ﶏ */ - 0xfd92, 0xfdc7, /* ﶒ - ﷇ */ - 0xfdf0, 0xfdf9, /* ﷰ - ﷹ */ - 0xfe70, 0xfe72, /* ﹰ - ﹲ */ - 0xfe76, 0xfefc, /* ﹶ - ﻼ */ - 0xff66, 0xff6f, /* ヲ - ッ */ - 0xff71, 0xff9d, /* ア - ン */ - 0xffa0, 0xffbe, /* ᅠ - ᄒ */ - 0xffc2, 0xffc7, /* ᅡ - ᅦ */ - 0xffca, 0xffcf, /* ᅧ - ᅬ */ - 0xffd2, 0xffd7, /* ᅭ - ᅲ */ - 0xffda, 0xffdc, /* ᅳ - ᅵ */ -}; - -/* - * alpha singlets - - * only covers 
ranges not in lower||upper - */ -static -Rune _alpha1[] = -{ - 0x00aa, /* ª */ - 0x00b5, /* µ */ - 0x00ba, /* º */ - 0x03da, /* Ϛ */ - 0x03dc, /* Ϝ */ - 0x03de, /* Ϟ */ - 0x03e0, /* Ϡ */ - 0x06d5, /* ە */ - 0x09b2, /* ল */ - 0x0a5e, /* ਫ਼ */ - 0x0a8d, /* ઍ */ - 0x0ae0, /* ૠ */ - 0x0b9c, /* ஜ */ - 0x0cde, /* ೞ */ - 0x0e4f, /* ๏ */ - 0x0e84, /* ຄ */ - 0x0e8a, /* ຊ */ - 0x0e8d, /* ຍ */ - 0x0ea5, /* ລ */ - 0x0ea7, /* ວ */ - 0x0eb0, /* ະ */ - 0x0ebd, /* ຽ */ - 0x1fbe, /* ι */ - 0x207f, /* ⁿ */ - 0x20a8, /* ₨ */ - 0x2102, /* ℂ */ - 0x2107, /* ℇ */ - 0x2124, /* ℤ */ - 0x2126, /* Ω */ - 0x2128, /* ℨ */ - 0xfb3e, /* מּ */ - 0xfe74, /* ﹴ */ -}; - -/* - * space ranges - */ -static -Rune _space2[] = -{ - 0x0009, 0x000a, /* tab and newline */ - 0x0020, 0x0020, /* space */ - 0x0085, 0x0085, - 0x00a0, 0x00a0, /* */ - 0x1680, 0x1680, - 0x180e, 0x180e, - 0x2000, 0x200b, /* - */ - 0x2028, 0x2029, /* - */ - 0x202f, 0x202f, - 0x205f, 0x205f, - 0x3000, 0x3000, /* */ - 0xfeff, 0xfeff, /* */ -}; - -/* - * lower case ranges - * 3rd col is conversion excess 500 - */ -static -Rune _toupper2[] = -{ - 0x0061, 0x007a, 468, /* a-z A-Z */ - 0x00e0, 0x00f6, 468, /* à-ö À-Ö */ - 0x00f8, 0x00fe, 468, /* ø-þ Ø-Þ */ - 0x0256, 0x0257, 295, /* ɖ-ɗ Ɖ-Ɗ */ - 0x0258, 0x0259, 298, /* ɘ-ə Ǝ-Ə */ - 0x028a, 0x028b, 283, /* ʊ-ʋ Ʊ-Ʋ */ - 0x03ad, 0x03af, 463, /* έ-ί Έ-Ί */ - 0x03b1, 0x03c1, 468, /* α-ρ Α-Ρ */ - 0x03c3, 0x03cb, 468, /* σ-ϋ Σ-Ϋ */ - 0x03cd, 0x03ce, 437, /* ύ-ώ Ύ-Ώ */ - 0x0430, 0x044f, 468, /* а-я А-Я */ - 0x0451, 0x045c, 420, /* ё-ќ Ё-Ќ */ - 0x045e, 0x045f, 420, /* ў-џ Ў-Џ */ - 0x0561, 0x0586, 452, /* ա-ֆ Ա-Ֆ */ - 0x1f00, 0x1f07, 508, /* ἀ-ἇ Ἀ-Ἇ */ - 0x1f10, 0x1f15, 508, /* ἐ-ἕ Ἐ-Ἕ */ - 0x1f20, 0x1f27, 508, /* ἠ-ἧ Ἠ-Ἧ */ - 0x1f30, 0x1f37, 508, /* ἰ-ἷ Ἰ-Ἷ */ - 0x1f40, 0x1f45, 508, /* ὀ-ὅ Ὀ-Ὅ */ - 0x1f60, 0x1f67, 508, /* ὠ-ὧ Ὠ-Ὧ */ - 0x1f70, 0x1f71, 574, /* ὰ-ά Ὰ-Ά */ - 0x1f72, 0x1f75, 586, /* ὲ-ή Ὲ-Ή */ - 0x1f76, 0x1f77, 600, /* ὶ-ί Ὶ-Ί */ - 0x1f78, 0x1f79, 628, /* ὸ-ό Ὸ-Ό */ - 0x1f7a, 0x1f7b, 612, 
/* ὺ-ύ Ὺ-Ύ */ - 0x1f7c, 0x1f7d, 626, /* ὼ-ώ Ὼ-Ώ */ - 0x1f80, 0x1f87, 508, /* ᾀ-ᾇ ᾈ-ᾏ */ - 0x1f90, 0x1f97, 508, /* ᾐ-ᾗ ᾘ-ᾟ */ - 0x1fa0, 0x1fa7, 508, /* ᾠ-ᾧ ᾨ-ᾯ */ - 0x1fb0, 0x1fb1, 508, /* ᾰ-ᾱ Ᾰ-Ᾱ */ - 0x1fd0, 0x1fd1, 508, /* ῐ-ῑ Ῐ-Ῑ */ - 0x1fe0, 0x1fe1, 508, /* ῠ-ῡ Ῠ-Ῡ */ - 0x2170, 0x217f, 484, /* ⅰ-ⅿ Ⅰ-Ⅿ */ - 0x24d0, 0x24e9, 474, /* ⓐ-ⓩ Ⓐ-Ⓩ */ - 0xff41, 0xff5a, 468, /* a-z A-Z */ -}; - -/* - * lower case singlets - * 2nd col is conversion excess 500 - */ -static -Rune _toupper1[] = -{ - 0x00ff, 621, /* ÿ Ÿ */ - 0x0101, 499, /* ā Ā */ - 0x0103, 499, /* ă Ă */ - 0x0105, 499, /* ą Ą */ - 0x0107, 499, /* ć Ć */ - 0x0109, 499, /* ĉ Ĉ */ - 0x010b, 499, /* ċ Ċ */ - 0x010d, 499, /* č Č */ - 0x010f, 499, /* ď Ď */ - 0x0111, 499, /* đ Đ */ - 0x0113, 499, /* ē Ē */ - 0x0115, 499, /* ĕ Ĕ */ - 0x0117, 499, /* ė Ė */ - 0x0119, 499, /* ę Ę */ - 0x011b, 499, /* ě Ě */ - 0x011d, 499, /* ĝ Ĝ */ - 0x011f, 499, /* ğ Ğ */ - 0x0121, 499, /* ġ Ġ */ - 0x0123, 499, /* ģ Ģ */ - 0x0125, 499, /* ĥ Ĥ */ - 0x0127, 499, /* ħ Ħ */ - 0x0129, 499, /* ĩ Ĩ */ - 0x012b, 499, /* ī Ī */ - 0x012d, 499, /* ĭ Ĭ */ - 0x012f, 499, /* į Į */ - 0x0131, 268, /* ı I */ - 0x0133, 499, /* ij IJ */ - 0x0135, 499, /* ĵ Ĵ */ - 0x0137, 499, /* ķ Ķ */ - 0x013a, 499, /* ĺ Ĺ */ - 0x013c, 499, /* ļ Ļ */ - 0x013e, 499, /* ľ Ľ */ - 0x0140, 499, /* ŀ Ŀ */ - 0x0142, 499, /* ł Ł */ - 0x0144, 499, /* ń Ń */ - 0x0146, 499, /* ņ Ņ */ - 0x0148, 499, /* ň Ň */ - 0x014b, 499, /* ŋ Ŋ */ - 0x014d, 499, /* ō Ō */ - 0x014f, 499, /* ŏ Ŏ */ - 0x0151, 499, /* ő Ő */ - 0x0153, 499, /* œ Œ */ - 0x0155, 499, /* ŕ Ŕ */ - 0x0157, 499, /* ŗ Ŗ */ - 0x0159, 499, /* ř Ř */ - 0x015b, 499, /* ś Ś */ - 0x015d, 499, /* ŝ Ŝ */ - 0x015f, 499, /* ş Ş */ - 0x0161, 499, /* š Š */ - 0x0163, 499, /* ţ Ţ */ - 0x0165, 499, /* ť Ť */ - 0x0167, 499, /* ŧ Ŧ */ - 0x0169, 499, /* ũ Ũ */ - 0x016b, 499, /* ū Ū */ - 0x016d, 499, /* ŭ Ŭ */ - 0x016f, 499, /* ů Ů */ - 0x0171, 499, /* ű Ű */ - 0x0173, 499, /* ų Ų */ - 0x0175, 499, /* ŵ Ŵ */ - 0x0177, 499, /* ŷ Ŷ */ - 
0x017a, 499, /* ź Ź */ - 0x017c, 499, /* ż Ż */ - 0x017e, 499, /* ž Ž */ - 0x017f, 200, /* ſ S */ - 0x0183, 499, /* ƃ Ƃ */ - 0x0185, 499, /* ƅ Ƅ */ - 0x0188, 499, /* ƈ Ƈ */ - 0x018c, 499, /* ƌ Ƌ */ - 0x0192, 499, /* ƒ Ƒ */ - 0x0199, 499, /* ƙ Ƙ */ - 0x01a1, 499, /* ơ Ơ */ - 0x01a3, 499, /* ƣ Ƣ */ - 0x01a5, 499, /* ƥ Ƥ */ - 0x01a8, 499, /* ƨ Ƨ */ - 0x01ad, 499, /* ƭ Ƭ */ - 0x01b0, 499, /* ư Ư */ - 0x01b4, 499, /* ƴ Ƴ */ - 0x01b6, 499, /* ƶ Ƶ */ - 0x01b9, 499, /* ƹ Ƹ */ - 0x01bd, 499, /* ƽ Ƽ */ - 0x01c5, 499, /* Dž DŽ */ - 0x01c6, 498, /* dž DŽ */ - 0x01c8, 499, /* Lj LJ */ - 0x01c9, 498, /* lj LJ */ - 0x01cb, 499, /* Nj NJ */ - 0x01cc, 498, /* nj NJ */ - 0x01ce, 499, /* ǎ Ǎ */ - 0x01d0, 499, /* ǐ Ǐ */ - 0x01d2, 499, /* ǒ Ǒ */ - 0x01d4, 499, /* ǔ Ǔ */ - 0x01d6, 499, /* ǖ Ǖ */ - 0x01d8, 499, /* ǘ Ǘ */ - 0x01da, 499, /* ǚ Ǚ */ - 0x01dc, 499, /* ǜ Ǜ */ - 0x01df, 499, /* ǟ Ǟ */ - 0x01e1, 499, /* ǡ Ǡ */ - 0x01e3, 499, /* ǣ Ǣ */ - 0x01e5, 499, /* ǥ Ǥ */ - 0x01e7, 499, /* ǧ Ǧ */ - 0x01e9, 499, /* ǩ Ǩ */ - 0x01eb, 499, /* ǫ Ǫ */ - 0x01ed, 499, /* ǭ Ǭ */ - 0x01ef, 499, /* ǯ Ǯ */ - 0x01f2, 499, /* Dz DZ */ - 0x01f3, 498, /* dz DZ */ - 0x01f5, 499, /* ǵ Ǵ */ - 0x01fb, 499, /* ǻ Ǻ */ - 0x01fd, 499, /* ǽ Ǽ */ - 0x01ff, 499, /* ǿ Ǿ */ - 0x0201, 499, /* ȁ Ȁ */ - 0x0203, 499, /* ȃ Ȃ */ - 0x0205, 499, /* ȅ Ȅ */ - 0x0207, 499, /* ȇ Ȇ */ - 0x0209, 499, /* ȉ Ȉ */ - 0x020b, 499, /* ȋ Ȋ */ - 0x020d, 499, /* ȍ Ȍ */ - 0x020f, 499, /* ȏ Ȏ */ - 0x0211, 499, /* ȑ Ȑ */ - 0x0213, 499, /* ȓ Ȓ */ - 0x0215, 499, /* ȕ Ȕ */ - 0x0217, 499, /* ȗ Ȗ */ - 0x0253, 290, /* ɓ Ɓ */ - 0x0254, 294, /* ɔ Ɔ */ - 0x025b, 297, /* ɛ Ɛ */ - 0x0260, 295, /* ɠ Ɠ */ - 0x0263, 293, /* ɣ Ɣ */ - 0x0268, 291, /* ɨ Ɨ */ - 0x0269, 289, /* ɩ Ɩ */ - 0x026f, 289, /* ɯ Ɯ */ - 0x0272, 287, /* ɲ Ɲ */ - 0x0283, 282, /* ʃ Ʃ */ - 0x0288, 282, /* ʈ Ʈ */ - 0x0292, 281, /* ʒ Ʒ */ - 0x03ac, 462, /* ά Ά */ - 0x03cc, 436, /* ό Ό */ - 0x03d0, 438, /* ϐ Β */ - 0x03d1, 443, /* ϑ Θ */ - 0x03d5, 453, /* ϕ Φ */ - 0x03d6, 446, /* ϖ Π */ - 0x03e3, 
499, /* ϣ Ϣ */ - 0x03e5, 499, /* ϥ Ϥ */ - 0x03e7, 499, /* ϧ Ϧ */ - 0x03e9, 499, /* ϩ Ϩ */ - 0x03eb, 499, /* ϫ Ϫ */ - 0x03ed, 499, /* ϭ Ϭ */ - 0x03ef, 499, /* ϯ Ϯ */ - 0x03f0, 414, /* ϰ Κ */ - 0x03f1, 420, /* ϱ Ρ */ - 0x0461, 499, /* ѡ Ѡ */ - 0x0463, 499, /* ѣ Ѣ */ - 0x0465, 499, /* ѥ Ѥ */ - 0x0467, 499, /* ѧ Ѧ */ - 0x0469, 499, /* ѩ Ѩ */ - 0x046b, 499, /* ѫ Ѫ */ - 0x046d, 499, /* ѭ Ѭ */ - 0x046f, 499, /* ѯ Ѯ */ - 0x0471, 499, /* ѱ Ѱ */ - 0x0473, 499, /* ѳ Ѳ */ - 0x0475, 499, /* ѵ Ѵ */ - 0x0477, 499, /* ѷ Ѷ */ - 0x0479, 499, /* ѹ Ѹ */ - 0x047b, 499, /* ѻ Ѻ */ - 0x047d, 499, /* ѽ Ѽ */ - 0x047f, 499, /* ѿ Ѿ */ - 0x0481, 499, /* ҁ Ҁ */ - 0x0491, 499, /* ґ Ґ */ - 0x0493, 499, /* ғ Ғ */ - 0x0495, 499, /* ҕ Ҕ */ - 0x0497, 499, /* җ Җ */ - 0x0499, 499, /* ҙ Ҙ */ - 0x049b, 499, /* қ Қ */ - 0x049d, 499, /* ҝ Ҝ */ - 0x049f, 499, /* ҟ Ҟ */ - 0x04a1, 499, /* ҡ Ҡ */ - 0x04a3, 499, /* ң Ң */ - 0x04a5, 499, /* ҥ Ҥ */ - 0x04a7, 499, /* ҧ Ҧ */ - 0x04a9, 499, /* ҩ Ҩ */ - 0x04ab, 499, /* ҫ Ҫ */ - 0x04ad, 499, /* ҭ Ҭ */ - 0x04af, 499, /* ү Ү */ - 0x04b1, 499, /* ұ Ұ */ - 0x04b3, 499, /* ҳ Ҳ */ - 0x04b5, 499, /* ҵ Ҵ */ - 0x04b7, 499, /* ҷ Ҷ */ - 0x04b9, 499, /* ҹ Ҹ */ - 0x04bb, 499, /* һ Һ */ - 0x04bd, 499, /* ҽ Ҽ */ - 0x04bf, 499, /* ҿ Ҿ */ - 0x04c2, 499, /* ӂ Ӂ */ - 0x04c4, 499, /* ӄ Ӄ */ - 0x04c8, 499, /* ӈ Ӈ */ - 0x04cc, 499, /* ӌ Ӌ */ - 0x04d1, 499, /* ӑ Ӑ */ - 0x04d3, 499, /* ӓ Ӓ */ - 0x04d5, 499, /* ӕ Ӕ */ - 0x04d7, 499, /* ӗ Ӗ */ - 0x04d9, 499, /* ә Ә */ - 0x04db, 499, /* ӛ Ӛ */ - 0x04dd, 499, /* ӝ Ӝ */ - 0x04df, 499, /* ӟ Ӟ */ - 0x04e1, 499, /* ӡ Ӡ */ - 0x04e3, 499, /* ӣ Ӣ */ - 0x04e5, 499, /* ӥ Ӥ */ - 0x04e7, 499, /* ӧ Ӧ */ - 0x04e9, 499, /* ө Ө */ - 0x04eb, 499, /* ӫ Ӫ */ - 0x04ef, 499, /* ӯ Ӯ */ - 0x04f1, 499, /* ӱ Ӱ */ - 0x04f3, 499, /* ӳ Ӳ */ - 0x04f5, 499, /* ӵ Ӵ */ - 0x04f9, 499, /* ӹ Ӹ */ - 0x1e01, 499, /* ḁ Ḁ */ - 0x1e03, 499, /* ḃ Ḃ */ - 0x1e05, 499, /* ḅ Ḅ */ - 0x1e07, 499, /* ḇ Ḇ */ - 0x1e09, 499, /* ḉ Ḉ */ - 0x1e0b, 499, /* ḋ Ḋ */ - 0x1e0d, 499, /* ḍ Ḍ */ - 0x1e0f, 
499, /* ḏ Ḏ */ - 0x1e11, 499, /* ḑ Ḑ */ - 0x1e13, 499, /* ḓ Ḓ */ - 0x1e15, 499, /* ḕ Ḕ */ - 0x1e17, 499, /* ḗ Ḗ */ - 0x1e19, 499, /* ḙ Ḙ */ - 0x1e1b, 499, /* ḛ Ḛ */ - 0x1e1d, 499, /* ḝ Ḝ */ - 0x1e1f, 499, /* ḟ Ḟ */ - 0x1e21, 499, /* ḡ Ḡ */ - 0x1e23, 499, /* ḣ Ḣ */ - 0x1e25, 499, /* ḥ Ḥ */ - 0x1e27, 499, /* ḧ Ḧ */ - 0x1e29, 499, /* ḩ Ḩ */ - 0x1e2b, 499, /* ḫ Ḫ */ - 0x1e2d, 499, /* ḭ Ḭ */ - 0x1e2f, 499, /* ḯ Ḯ */ - 0x1e31, 499, /* ḱ Ḱ */ - 0x1e33, 499, /* ḳ Ḳ */ - 0x1e35, 499, /* ḵ Ḵ */ - 0x1e37, 499, /* ḷ Ḷ */ - 0x1e39, 499, /* ḹ Ḹ */ - 0x1e3b, 499, /* ḻ Ḻ */ - 0x1e3d, 499, /* ḽ Ḽ */ - 0x1e3f, 499, /* ḿ Ḿ */ - 0x1e41, 499, /* ṁ Ṁ */ - 0x1e43, 499, /* ṃ Ṃ */ - 0x1e45, 499, /* ṅ Ṅ */ - 0x1e47, 499, /* ṇ Ṇ */ - 0x1e49, 499, /* ṉ Ṉ */ - 0x1e4b, 499, /* ṋ Ṋ */ - 0x1e4d, 499, /* ṍ Ṍ */ - 0x1e4f, 499, /* ṏ Ṏ */ - 0x1e51, 499, /* ṑ Ṑ */ - 0x1e53, 499, /* ṓ Ṓ */ - 0x1e55, 499, /* ṕ Ṕ */ - 0x1e57, 499, /* ṗ Ṗ */ - 0x1e59, 499, /* ṙ Ṙ */ - 0x1e5b, 499, /* ṛ Ṛ */ - 0x1e5d, 499, /* ṝ Ṝ */ - 0x1e5f, 499, /* ṟ Ṟ */ - 0x1e61, 499, /* ṡ Ṡ */ - 0x1e63, 499, /* ṣ Ṣ */ - 0x1e65, 499, /* ṥ Ṥ */ - 0x1e67, 499, /* ṧ Ṧ */ - 0x1e69, 499, /* ṩ Ṩ */ - 0x1e6b, 499, /* ṫ Ṫ */ - 0x1e6d, 499, /* ṭ Ṭ */ - 0x1e6f, 499, /* ṯ Ṯ */ - 0x1e71, 499, /* ṱ Ṱ */ - 0x1e73, 499, /* ṳ Ṳ */ - 0x1e75, 499, /* ṵ Ṵ */ - 0x1e77, 499, /* ṷ Ṷ */ - 0x1e79, 499, /* ṹ Ṹ */ - 0x1e7b, 499, /* ṻ Ṻ */ - 0x1e7d, 499, /* ṽ Ṽ */ - 0x1e7f, 499, /* ṿ Ṿ */ - 0x1e81, 499, /* ẁ Ẁ */ - 0x1e83, 499, /* ẃ Ẃ */ - 0x1e85, 499, /* ẅ Ẅ */ - 0x1e87, 499, /* ẇ Ẇ */ - 0x1e89, 499, /* ẉ Ẉ */ - 0x1e8b, 499, /* ẋ Ẋ */ - 0x1e8d, 499, /* ẍ Ẍ */ - 0x1e8f, 499, /* ẏ Ẏ */ - 0x1e91, 499, /* ẑ Ẑ */ - 0x1e93, 499, /* ẓ Ẓ */ - 0x1e95, 499, /* ẕ Ẕ */ - 0x1ea1, 499, /* ạ Ạ */ - 0x1ea3, 499, /* ả Ả */ - 0x1ea5, 499, /* ấ Ấ */ - 0x1ea7, 499, /* ầ Ầ */ - 0x1ea9, 499, /* ẩ Ẩ */ - 0x1eab, 499, /* ẫ Ẫ */ - 0x1ead, 499, /* ậ Ậ */ - 0x1eaf, 499, /* ắ Ắ */ - 0x1eb1, 499, /* ằ Ằ */ - 0x1eb3, 499, /* ẳ Ẳ */ - 0x1eb5, 499, /* ẵ Ẵ */ - 0x1eb7, 499, /* ặ Ặ */ - 0x1eb9, 
499, /* ẹ Ẹ */ - 0x1ebb, 499, /* ẻ Ẻ */ - 0x1ebd, 499, /* ẽ Ẽ */ - 0x1ebf, 499, /* ế Ế */ - 0x1ec1, 499, /* ề Ề */ - 0x1ec3, 499, /* ể Ể */ - 0x1ec5, 499, /* ễ Ễ */ - 0x1ec7, 499, /* ệ Ệ */ - 0x1ec9, 499, /* ỉ Ỉ */ - 0x1ecb, 499, /* ị Ị */ - 0x1ecd, 499, /* ọ Ọ */ - 0x1ecf, 499, /* ỏ Ỏ */ - 0x1ed1, 499, /* ố Ố */ - 0x1ed3, 499, /* ồ Ồ */ - 0x1ed5, 499, /* ổ Ổ */ - 0x1ed7, 499, /* ỗ Ỗ */ - 0x1ed9, 499, /* ộ Ộ */ - 0x1edb, 499, /* ớ Ớ */ - 0x1edd, 499, /* ờ Ờ */ - 0x1edf, 499, /* ở Ở */ - 0x1ee1, 499, /* ỡ Ỡ */ - 0x1ee3, 499, /* ợ Ợ */ - 0x1ee5, 499, /* ụ Ụ */ - 0x1ee7, 499, /* ủ Ủ */ - 0x1ee9, 499, /* ứ Ứ */ - 0x1eeb, 499, /* ừ Ừ */ - 0x1eed, 499, /* ử Ử */ - 0x1eef, 499, /* ữ Ữ */ - 0x1ef1, 499, /* ự Ự */ - 0x1ef3, 499, /* ỳ Ỳ */ - 0x1ef5, 499, /* ỵ Ỵ */ - 0x1ef7, 499, /* ỷ Ỷ */ - 0x1ef9, 499, /* ỹ Ỹ */ - 0x1f51, 508, /* ὑ Ὑ */ - 0x1f53, 508, /* ὓ Ὓ */ - 0x1f55, 508, /* ὕ Ὕ */ - 0x1f57, 508, /* ὗ Ὗ */ - 0x1fb3, 509, /* ᾳ ᾼ */ - 0x1fc3, 509, /* ῃ ῌ */ - 0x1fe5, 507, /* ῥ Ῥ */ - 0x1ff3, 509, /* ῳ ῼ */ -}; - -static Rune __isdigitr[] = { - 0x0030, 0x0039, - 0x0660, 0x0669, - 0x06f0, 0x06f9, - 0x07c0, 0x07c9, - 0x0966, 0x096f, - 0x09e6, 0x09ef, - 0x0a66, 0x0a6f, - 0x0ae6, 0x0aef, - 0x0b66, 0x0b6f, - 0x0be6, 0x0bef, - 0x0c66, 0x0c6f, - 0x0ce6, 0x0cef, - 0x0d66, 0x0d6f, - 0x0e50, 0x0e59, - 0x0ed0, 0x0ed9, - 0x0f20, 0x0f29, - 0x1040, 0x1049, - 0x17e0, 0x17e9, - 0x1810, 0x1819, - 0x1946, 0x194f, - 0x19d0, 0x19d9, - 0x1b50, 0x1b59, - 0xff10, 0xff19, - 0x104a0, 0x104a9, - 0x1d7ce, 0x1d7ff, -}; - -/* - * upper case ranges - * 3rd col is conversion excess 500 - */ -static -Rune _tolower2[] = -{ - 0x0041, 0x005a, 532, /* A-Z a-z */ - 0x00c0, 0x00d6, 532, /* À-Ö à-ö */ - 0x00d8, 0x00de, 532, /* Ø-Þ ø-þ */ - 0x0189, 0x018a, 705, /* Ɖ-Ɗ ɖ-ɗ */ - 0x018e, 0x018f, 702, /* Ǝ-Ə ɘ-ə */ - 0x01b1, 0x01b2, 717, /* Ʊ-Ʋ ʊ-ʋ */ - 0x0388, 0x038a, 537, /* Έ-Ί έ-ί */ - 0x038e, 0x038f, 563, /* Ύ-Ώ ύ-ώ */ - 0x0391, 0x03a1, 532, /* Α-Ρ α-ρ */ - 0x03a3, 0x03ab, 532, /* Σ-Ϋ σ-ϋ */ - 0x0401, 0x040c, 
580, /* Ё-Ќ ё-ќ */ - 0x040e, 0x040f, 580, /* Ў-Џ ў-џ */ - 0x0410, 0x042f, 532, /* А-Я а-я */ - 0x0531, 0x0556, 548, /* Ա-Ֆ ա-ֆ */ - 0x10a0, 0x10c5, 548, /* Ⴀ-Ⴥ ა-ჵ */ - 0x1f08, 0x1f0f, 492, /* Ἀ-Ἇ ἀ-ἇ */ - 0x1f18, 0x1f1d, 492, /* Ἐ-Ἕ ἐ-ἕ */ - 0x1f28, 0x1f2f, 492, /* Ἠ-Ἧ ἠ-ἧ */ - 0x1f38, 0x1f3f, 492, /* Ἰ-Ἷ ἰ-ἷ */ - 0x1f48, 0x1f4d, 492, /* Ὀ-Ὅ ὀ-ὅ */ - 0x1f68, 0x1f6f, 492, /* Ὠ-Ὧ ὠ-ὧ */ - 0x1f88, 0x1f8f, 492, /* ᾈ-ᾏ ᾀ-ᾇ */ - 0x1f98, 0x1f9f, 492, /* ᾘ-ᾟ ᾐ-ᾗ */ - 0x1fa8, 0x1faf, 492, /* ᾨ-ᾯ ᾠ-ᾧ */ - 0x1fb8, 0x1fb9, 492, /* Ᾰ-Ᾱ ᾰ-ᾱ */ - 0x1fba, 0x1fbb, 426, /* Ὰ-Ά ὰ-ά */ - 0x1fc8, 0x1fcb, 414, /* Ὲ-Ή ὲ-ή */ - 0x1fd8, 0x1fd9, 492, /* Ῐ-Ῑ ῐ-ῑ */ - 0x1fda, 0x1fdb, 400, /* Ὶ-Ί ὶ-ί */ - 0x1fe8, 0x1fe9, 492, /* Ῠ-Ῡ ῠ-ῡ */ - 0x1fea, 0x1feb, 388, /* Ὺ-Ύ ὺ-ύ */ - 0x1ff8, 0x1ff9, 372, /* Ὸ-Ό ὸ-ό */ - 0x1ffa, 0x1ffb, 374, /* Ὼ-Ώ ὼ-ώ */ - 0x2160, 0x216f, 516, /* Ⅰ-Ⅿ ⅰ-ⅿ */ - 0x24b6, 0x24cf, 526, /* Ⓐ-Ⓩ ⓐ-ⓩ */ - 0xff21, 0xff3a, 532, /* A-Z a-z */ -}; - -/* - * upper case singlets - * 2nd col is conversion excess 500 - */ -static -Rune _tolower1[] = -{ - 0x0100, 501, /* Ā ā */ - 0x0102, 501, /* Ă ă */ - 0x0104, 501, /* Ą ą */ - 0x0106, 501, /* Ć ć */ - 0x0108, 501, /* Ĉ ĉ */ - 0x010a, 501, /* Ċ ċ */ - 0x010c, 501, /* Č č */ - 0x010e, 501, /* Ď ď */ - 0x0110, 501, /* Đ đ */ - 0x0112, 501, /* Ē ē */ - 0x0114, 501, /* Ĕ ĕ */ - 0x0116, 501, /* Ė ė */ - 0x0118, 501, /* Ę ę */ - 0x011a, 501, /* Ě ě */ - 0x011c, 501, /* Ĝ ĝ */ - 0x011e, 501, /* Ğ ğ */ - 0x0120, 501, /* Ġ ġ */ - 0x0122, 501, /* Ģ ģ */ - 0x0124, 501, /* Ĥ ĥ */ - 0x0126, 501, /* Ħ ħ */ - 0x0128, 501, /* Ĩ ĩ */ - 0x012a, 501, /* Ī ī */ - 0x012c, 501, /* Ĭ ĭ */ - 0x012e, 501, /* Į į */ - 0x0130, 301, /* İ i */ - 0x0132, 501, /* IJ ij */ - 0x0134, 501, /* Ĵ ĵ */ - 0x0136, 501, /* Ķ ķ */ - 0x0139, 501, /* Ĺ ĺ */ - 0x013b, 501, /* Ļ ļ */ - 0x013d, 501, /* Ľ ľ */ - 0x013f, 501, /* Ŀ ŀ */ - 0x0141, 501, /* Ł ł */ - 0x0143, 501, /* Ń ń */ - 0x0145, 501, /* Ņ ņ */ - 0x0147, 501, /* Ň ň */ - 0x014a, 501, /* Ŋ ŋ */ - 0x014c, 501, /* Ō ō 
*/ - 0x014e, 501, /* Ŏ ŏ */ - 0x0150, 501, /* Ő ő */ - 0x0152, 501, /* Œ œ */ - 0x0154, 501, /* Ŕ ŕ */ - 0x0156, 501, /* Ŗ ŗ */ - 0x0158, 501, /* Ř ř */ - 0x015a, 501, /* Ś ś */ - 0x015c, 501, /* Ŝ ŝ */ - 0x015e, 501, /* Ş ş */ - 0x0160, 501, /* Š š */ - 0x0162, 501, /* Ţ ţ */ - 0x0164, 501, /* Ť ť */ - 0x0166, 501, /* Ŧ ŧ */ - 0x0168, 501, /* Ũ ũ */ - 0x016a, 501, /* Ū ū */ - 0x016c, 501, /* Ŭ ŭ */ - 0x016e, 501, /* Ů ů */ - 0x0170, 501, /* Ű ű */ - 0x0172, 501, /* Ų ų */ - 0x0174, 501, /* Ŵ ŵ */ - 0x0176, 501, /* Ŷ ŷ */ - 0x0178, 379, /* Ÿ ÿ */ - 0x0179, 501, /* Ź ź */ - 0x017b, 501, /* Ż ż */ - 0x017d, 501, /* Ž ž */ - 0x0181, 710, /* Ɓ ɓ */ - 0x0182, 501, /* Ƃ ƃ */ - 0x0184, 501, /* Ƅ ƅ */ - 0x0186, 706, /* Ɔ ɔ */ - 0x0187, 501, /* Ƈ ƈ */ - 0x018b, 501, /* Ƌ ƌ */ - 0x0190, 703, /* Ɛ ɛ */ - 0x0191, 501, /* Ƒ ƒ */ - 0x0193, 705, /* Ɠ ɠ */ - 0x0194, 707, /* Ɣ ɣ */ - 0x0196, 711, /* Ɩ ɩ */ - 0x0197, 709, /* Ɨ ɨ */ - 0x0198, 501, /* Ƙ ƙ */ - 0x019c, 711, /* Ɯ ɯ */ - 0x019d, 713, /* Ɲ ɲ */ - 0x01a0, 501, /* Ơ ơ */ - 0x01a2, 501, /* Ƣ ƣ */ - 0x01a4, 501, /* Ƥ ƥ */ - 0x01a7, 501, /* Ƨ ƨ */ - 0x01a9, 718, /* Ʃ ʃ */ - 0x01ac, 501, /* Ƭ ƭ */ - 0x01ae, 718, /* Ʈ ʈ */ - 0x01af, 501, /* Ư ư */ - 0x01b3, 501, /* Ƴ ƴ */ - 0x01b5, 501, /* Ƶ ƶ */ - 0x01b7, 719, /* Ʒ ʒ */ - 0x01b8, 501, /* Ƹ ƹ */ - 0x01bc, 501, /* Ƽ ƽ */ - 0x01c4, 502, /* DŽ dž */ - 0x01c5, 501, /* Dž dž */ - 0x01c7, 502, /* LJ lj */ - 0x01c8, 501, /* Lj lj */ - 0x01ca, 502, /* NJ nj */ - 0x01cb, 501, /* Nj nj */ - 0x01cd, 501, /* Ǎ ǎ */ - 0x01cf, 501, /* Ǐ ǐ */ - 0x01d1, 501, /* Ǒ ǒ */ - 0x01d3, 501, /* Ǔ ǔ */ - 0x01d5, 501, /* Ǖ ǖ */ - 0x01d7, 501, /* Ǘ ǘ */ - 0x01d9, 501, /* Ǚ ǚ */ - 0x01db, 501, /* Ǜ ǜ */ - 0x01de, 501, /* Ǟ ǟ */ - 0x01e0, 501, /* Ǡ ǡ */ - 0x01e2, 501, /* Ǣ ǣ */ - 0x01e4, 501, /* Ǥ ǥ */ - 0x01e6, 501, /* Ǧ ǧ */ - 0x01e8, 501, /* Ǩ ǩ */ - 0x01ea, 501, /* Ǫ ǫ */ - 0x01ec, 501, /* Ǭ ǭ */ - 0x01ee, 501, /* Ǯ ǯ */ - 0x01f1, 502, /* DZ dz */ - 0x01f2, 501, /* Dz dz */ - 0x01f4, 501, /* Ǵ ǵ */ - 
0x01fa, 501, /* Ǻ ǻ */ - 0x01fc, 501, /* Ǽ ǽ */ - 0x01fe, 501, /* Ǿ ǿ */ - 0x0200, 501, /* Ȁ ȁ */ - 0x0202, 501, /* Ȃ ȃ */ - 0x0204, 501, /* Ȅ ȅ */ - 0x0206, 501, /* Ȇ ȇ */ - 0x0208, 501, /* Ȉ ȉ */ - 0x020a, 501, /* Ȋ ȋ */ - 0x020c, 501, /* Ȍ ȍ */ - 0x020e, 501, /* Ȏ ȏ */ - 0x0210, 501, /* Ȑ ȑ */ - 0x0212, 501, /* Ȓ ȓ */ - 0x0214, 501, /* Ȕ ȕ */ - 0x0216, 501, /* Ȗ ȗ */ - 0x0386, 538, /* Ά ά */ - 0x038c, 564, /* Ό ό */ - 0x03e2, 501, /* Ϣ ϣ */ - 0x03e4, 501, /* Ϥ ϥ */ - 0x03e6, 501, /* Ϧ ϧ */ - 0x03e8, 501, /* Ϩ ϩ */ - 0x03ea, 501, /* Ϫ ϫ */ - 0x03ec, 501, /* Ϭ ϭ */ - 0x03ee, 501, /* Ϯ ϯ */ - 0x0460, 501, /* Ѡ ѡ */ - 0x0462, 501, /* Ѣ ѣ */ - 0x0464, 501, /* Ѥ ѥ */ - 0x0466, 501, /* Ѧ ѧ */ - 0x0468, 501, /* Ѩ ѩ */ - 0x046a, 501, /* Ѫ ѫ */ - 0x046c, 501, /* Ѭ ѭ */ - 0x046e, 501, /* Ѯ ѯ */ - 0x0470, 501, /* Ѱ ѱ */ - 0x0472, 501, /* Ѳ ѳ */ - 0x0474, 501, /* Ѵ ѵ */ - 0x0476, 501, /* Ѷ ѷ */ - 0x0478, 501, /* Ѹ ѹ */ - 0x047a, 501, /* Ѻ ѻ */ - 0x047c, 501, /* Ѽ ѽ */ - 0x047e, 501, /* Ѿ ѿ */ - 0x0480, 501, /* Ҁ ҁ */ - 0x0490, 501, /* Ґ ґ */ - 0x0492, 501, /* Ғ ғ */ - 0x0494, 501, /* Ҕ ҕ */ - 0x0496, 501, /* Җ җ */ - 0x0498, 501, /* Ҙ ҙ */ - 0x049a, 501, /* Қ қ */ - 0x049c, 501, /* Ҝ ҝ */ - 0x049e, 501, /* Ҟ ҟ */ - 0x04a0, 501, /* Ҡ ҡ */ - 0x04a2, 501, /* Ң ң */ - 0x04a4, 501, /* Ҥ ҥ */ - 0x04a6, 501, /* Ҧ ҧ */ - 0x04a8, 501, /* Ҩ ҩ */ - 0x04aa, 501, /* Ҫ ҫ */ - 0x04ac, 501, /* Ҭ ҭ */ - 0x04ae, 501, /* Ү ү */ - 0x04b0, 501, /* Ұ ұ */ - 0x04b2, 501, /* Ҳ ҳ */ - 0x04b4, 501, /* Ҵ ҵ */ - 0x04b6, 501, /* Ҷ ҷ */ - 0x04b8, 501, /* Ҹ ҹ */ - 0x04ba, 501, /* Һ һ */ - 0x04bc, 501, /* Ҽ ҽ */ - 0x04be, 501, /* Ҿ ҿ */ - 0x04c1, 501, /* Ӂ ӂ */ - 0x04c3, 501, /* Ӄ ӄ */ - 0x04c7, 501, /* Ӈ ӈ */ - 0x04cb, 501, /* Ӌ ӌ */ - 0x04d0, 501, /* Ӑ ӑ */ - 0x04d2, 501, /* Ӓ ӓ */ - 0x04d4, 501, /* Ӕ ӕ */ - 0x04d6, 501, /* Ӗ ӗ */ - 0x04d8, 501, /* Ә ә */ - 0x04da, 501, /* Ӛ ӛ */ - 0x04dc, 501, /* Ӝ ӝ */ - 0x04de, 501, /* Ӟ ӟ */ - 0x04e0, 501, /* Ӡ ӡ */ - 0x04e2, 501, /* Ӣ ӣ */ - 0x04e4, 501, /* Ӥ ӥ */ - 
0x04e6, 501, /* Ӧ ӧ */ - 0x04e8, 501, /* Ө ө */ - 0x04ea, 501, /* Ӫ ӫ */ - 0x04ee, 501, /* Ӯ ӯ */ - 0x04f0, 501, /* Ӱ ӱ */ - 0x04f2, 501, /* Ӳ ӳ */ - 0x04f4, 501, /* Ӵ ӵ */ - 0x04f8, 501, /* Ӹ ӹ */ - 0x1e00, 501, /* Ḁ ḁ */ - 0x1e02, 501, /* Ḃ ḃ */ - 0x1e04, 501, /* Ḅ ḅ */ - 0x1e06, 501, /* Ḇ ḇ */ - 0x1e08, 501, /* Ḉ ḉ */ - 0x1e0a, 501, /* Ḋ ḋ */ - 0x1e0c, 501, /* Ḍ ḍ */ - 0x1e0e, 501, /* Ḏ ḏ */ - 0x1e10, 501, /* Ḑ ḑ */ - 0x1e12, 501, /* Ḓ ḓ */ - 0x1e14, 501, /* Ḕ ḕ */ - 0x1e16, 501, /* Ḗ ḗ */ - 0x1e18, 501, /* Ḙ ḙ */ - 0x1e1a, 501, /* Ḛ ḛ */ - 0x1e1c, 501, /* Ḝ ḝ */ - 0x1e1e, 501, /* Ḟ ḟ */ - 0x1e20, 501, /* Ḡ ḡ */ - 0x1e22, 501, /* Ḣ ḣ */ - 0x1e24, 501, /* Ḥ ḥ */ - 0x1e26, 501, /* Ḧ ḧ */ - 0x1e28, 501, /* Ḩ ḩ */ - 0x1e2a, 501, /* Ḫ ḫ */ - 0x1e2c, 501, /* Ḭ ḭ */ - 0x1e2e, 501, /* Ḯ ḯ */ - 0x1e30, 501, /* Ḱ ḱ */ - 0x1e32, 501, /* Ḳ ḳ */ - 0x1e34, 501, /* Ḵ ḵ */ - 0x1e36, 501, /* Ḷ ḷ */ - 0x1e38, 501, /* Ḹ ḹ */ - 0x1e3a, 501, /* Ḻ ḻ */ - 0x1e3c, 501, /* Ḽ ḽ */ - 0x1e3e, 501, /* Ḿ ḿ */ - 0x1e40, 501, /* Ṁ ṁ */ - 0x1e42, 501, /* Ṃ ṃ */ - 0x1e44, 501, /* Ṅ ṅ */ - 0x1e46, 501, /* Ṇ ṇ */ - 0x1e48, 501, /* Ṉ ṉ */ - 0x1e4a, 501, /* Ṋ ṋ */ - 0x1e4c, 501, /* Ṍ ṍ */ - 0x1e4e, 501, /* Ṏ ṏ */ - 0x1e50, 501, /* Ṑ ṑ */ - 0x1e52, 501, /* Ṓ ṓ */ - 0x1e54, 501, /* Ṕ ṕ */ - 0x1e56, 501, /* Ṗ ṗ */ - 0x1e58, 501, /* Ṙ ṙ */ - 0x1e5a, 501, /* Ṛ ṛ */ - 0x1e5c, 501, /* Ṝ ṝ */ - 0x1e5e, 501, /* Ṟ ṟ */ - 0x1e60, 501, /* Ṡ ṡ */ - 0x1e62, 501, /* Ṣ ṣ */ - 0x1e64, 501, /* Ṥ ṥ */ - 0x1e66, 501, /* Ṧ ṧ */ - 0x1e68, 501, /* Ṩ ṩ */ - 0x1e6a, 501, /* Ṫ ṫ */ - 0x1e6c, 501, /* Ṭ ṭ */ - 0x1e6e, 501, /* Ṯ ṯ */ - 0x1e70, 501, /* Ṱ ṱ */ - 0x1e72, 501, /* Ṳ ṳ */ - 0x1e74, 501, /* Ṵ ṵ */ - 0x1e76, 501, /* Ṷ ṷ */ - 0x1e78, 501, /* Ṹ ṹ */ - 0x1e7a, 501, /* Ṻ ṻ */ - 0x1e7c, 501, /* Ṽ ṽ */ - 0x1e7e, 501, /* Ṿ ṿ */ - 0x1e80, 501, /* Ẁ ẁ */ - 0x1e82, 501, /* Ẃ ẃ */ - 0x1e84, 501, /* Ẅ ẅ */ - 0x1e86, 501, /* Ẇ ẇ */ - 0x1e88, 501, /* Ẉ ẉ */ - 0x1e8a, 501, /* Ẋ ẋ */ - 0x1e8c, 501, /* Ẍ ẍ */ - 0x1e8e, 501, /* Ẏ ẏ */ - 
0x1e90, 501, /* Ẑ ẑ */ - 0x1e92, 501, /* Ẓ ẓ */ - 0x1e94, 501, /* Ẕ ẕ */ - 0x1ea0, 501, /* Ạ ạ */ - 0x1ea2, 501, /* Ả ả */ - 0x1ea4, 501, /* Ấ ấ */ - 0x1ea6, 501, /* Ầ ầ */ - 0x1ea8, 501, /* Ẩ ẩ */ - 0x1eaa, 501, /* Ẫ ẫ */ - 0x1eac, 501, /* Ậ ậ */ - 0x1eae, 501, /* Ắ ắ */ - 0x1eb0, 501, /* Ằ ằ */ - 0x1eb2, 501, /* Ẳ ẳ */ - 0x1eb4, 501, /* Ẵ ẵ */ - 0x1eb6, 501, /* Ặ ặ */ - 0x1eb8, 501, /* Ẹ ẹ */ - 0x1eba, 501, /* Ẻ ẻ */ - 0x1ebc, 501, /* Ẽ ẽ */ - 0x1ebe, 501, /* Ế ế */ - 0x1ec0, 501, /* Ề ề */ - 0x1ec2, 501, /* Ể ể */ - 0x1ec4, 501, /* Ễ ễ */ - 0x1ec6, 501, /* Ệ ệ */ - 0x1ec8, 501, /* Ỉ ỉ */ - 0x1eca, 501, /* Ị ị */ - 0x1ecc, 501, /* Ọ ọ */ - 0x1ece, 501, /* Ỏ ỏ */ - 0x1ed0, 501, /* Ố ố */ - 0x1ed2, 501, /* Ồ ồ */ - 0x1ed4, 501, /* Ổ ổ */ - 0x1ed6, 501, /* Ỗ ỗ */ - 0x1ed8, 501, /* Ộ ộ */ - 0x1eda, 501, /* Ớ ớ */ - 0x1edc, 501, /* Ờ ờ */ - 0x1ede, 501, /* Ở ở */ - 0x1ee0, 501, /* Ỡ ỡ */ - 0x1ee2, 501, /* Ợ ợ */ - 0x1ee4, 501, /* Ụ ụ */ - 0x1ee6, 501, /* Ủ ủ */ - 0x1ee8, 501, /* Ứ ứ */ - 0x1eea, 501, /* Ừ ừ */ - 0x1eec, 501, /* Ử ử */ - 0x1eee, 501, /* Ữ ữ */ - 0x1ef0, 501, /* Ự ự */ - 0x1ef2, 501, /* Ỳ ỳ */ - 0x1ef4, 501, /* Ỵ ỵ */ - 0x1ef6, 501, /* Ỷ ỷ */ - 0x1ef8, 501, /* Ỹ ỹ */ - 0x1f59, 492, /* Ὑ ὑ */ - 0x1f5b, 492, /* Ὓ ὓ */ - 0x1f5d, 492, /* Ὕ ὕ */ - 0x1f5f, 492, /* Ὗ ὗ */ - 0x1fbc, 491, /* ᾼ ᾳ */ - 0x1fcc, 491, /* ῌ ῃ */ - 0x1fec, 493, /* Ῥ ῥ */ - 0x1ffc, 491, /* ῼ ῳ */ -}; - -/* - * title characters are those between - * upper and lower case. 
ie DZ Dz dz
- */
-static
-Rune _totitle1[] =
-{
-	0x01c4, 501, /* DŽ Dž */
-	0x01c6, 499, /* dž Dž */
-	0x01c7, 501, /* LJ Lj */
-	0x01c9, 499, /* lj Lj */
-	0x01ca, 501, /* NJ Nj */
-	0x01cc, 499, /* nj Nj */
-	0x01f1, 501, /* DZ Dz */
-	0x01f3, 499, /* dz Dz */
-};
-
-static
-Rune*
-bsearch(Rune c, Rune *t, int n, int ne)
-{
-	Rune *p;
-	int m;
-
-	while(n > 1) {
-		m = n/2;
-		p = t + m*ne;
-		if(c >= p[0]) {
-			t = p;
-			n = n-m;
-		} else
-			n = m;
-	}
-	if(n && c >= t[0])
-		return t;
-	return 0;
-}
-
-Rune
-tolowerrune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _tolower2, nelem(_tolower2)/3, 3);
-	if(p && c >= p[0] && c <= p[1])
-		return c + p[2] - 500;
-	p = bsearch(c, _tolower1, nelem(_tolower1)/2, 2);
-	if(p && c == p[0])
-		return c + p[1] - 500;
-	return c;
-}
-
-Rune
-toupperrune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _toupper2, nelem(_toupper2)/3, 3);
-	if(p && c >= p[0] && c <= p[1])
-		return c + p[2] - 500;
-	p = bsearch(c, _toupper1, nelem(_toupper1)/2, 2);
-	if(p && c == p[0])
-		return c + p[1] - 500;
-	return c;
-}
-
-Rune
-totitlerune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _totitle1, nelem(_totitle1)/2, 2);
-	if(p && c == p[0])
-		return c + p[1] - 500;
-	return c;
-}
-
-int
-islowerrune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _toupper2, nelem(_toupper2)/3, 3);
-	if(p && c >= p[0] && c <= p[1])
-		return 1;
-	p = bsearch(c, _toupper1, nelem(_toupper1)/2, 2);
-	if(p && c == p[0])
-		return 1;
-	return 0;
-}
-
-int
-isupperrune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _tolower2, nelem(_tolower2)/3, 3);
-	if(p && c >= p[0] && c <= p[1])
-		return 1;
-	p = bsearch(c, _tolower1, nelem(_tolower1)/2, 2);
-	if(p && c == p[0])
-		return 1;
-	return 0;
-}
-
-int
-isalpharune(Rune c)
-{
-	Rune *p;
-
-	if(isupperrune(c) || islowerrune(c))
-		return 1;
-	p = bsearch(c, _alpha2, nelem(_alpha2)/2, 2);
-	if(p && c >= p[0] && c <= p[1])
-		return 1;
-	p = bsearch(c, _alpha1, nelem(_alpha1), 1);
-	if(p && c == p[0])
-		return 1;
-	return 0;
-}
-
-int
-istitlerune(Rune c)
-{
-	return isupperrune(c) && islowerrune(c);
-}
-
-int
-isspacerune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, _space2, nelem(_space2)/2, 2);
-	if(p && c >= p[0] && c <= p[1])
-		return 1;
-	return 0;
-}
-
-int
-isdigitrune(Rune c)
-{
-	Rune *p;
-
-	p = bsearch(c, __isdigitr, nelem(__isdigitr)/2, 2);
-	if(p && c >= p[0] && c <= p[1])
-		return 1;
-	return 0;
-}
--- a//sys/src/libc/test/mkfile
+++ b//sys/src/libc/test/mkfile
@@ -3,6 +3,8 @@
 TEST=\
 	date\
 	pow\
+	runebreak\
+	runenorm\
 	strchr\
 
 </sys/src/cmd/mktest
--- /dev/null
+++ b//sys/src/libc/test/runebreak.c
@@ -1,0 +1,108 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+/*
+ * parse s as a hexadecimal code point;
+ * dies if s does not begin with a hex number.
+ */
+static Rune
+estrtoul(char *s)
+{
+	char *epr;
+	Rune code;
+
+	code = strtoul(s, &epr, 16);
+	if(s == epr)
+		sysfatal("bad code point hex string");
+	return code;
+}
+
+/*
+ * check fn against every test line of file.
+ * a line is a list of hex code points interleaved with
+ * ÷ (break expected) or × (no break expected); anything
+ * after '#' is a comment.  mismatches are printed and the
+ * rest of the offending line is skipped.
+ */
+void
+run(char *file, Rune* (*fn)(Rune*))
+{
+	Biobuf *b;
+	char *p, *dot;
+	char *pieces[16];
+	int i, j, n;
+	Rune stack[16], ops[16];
+	int nstack, nops;
+	Rune r, *rp, *rp2;
+	char *line;
+
+	b = Bopen(file, OREAD);
+	if(b == nil)
+		sysfatal("could not open %s: %r", file);
+
+	for(;(p = Brdline(b, '\n')) != nil; free(line)){
+		p[Blinelen(b)-1] = 0;
+		/* keep an unmodified copy for the diagnostics below */
+		line = strdup(p);
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		if((dot = strstr(p, "#")) != nil)
+			*dot = 0;
+		n = getfields(p, pieces, nelem(pieces), 0, " ");
+		/* split the fields into code points (stack) and operators (ops) */
+		nstack = nops = 0;
+		for(i = 0; i < n; i++){
+			chartorune(&r, pieces[i]);
+			if(r != L'÷' && r != L'×'){
+				r = estrtoul(pieces[i]);
+				stack[nstack++] = r;
+				stack[nstack] = 0;
+			} else {
+				ops[nops++] = r;
+				ops[nops] = 0;
+			}
+		}
+
+		/* the first and last operators are not checked */
+		rp = stack;
+		for(i = 1; i < nops-1;){
+			rp2 = fn(rp);
+			switch(ops[i]){
+			case L'÷':
+				if(rp2 != rp+1){
+					print("break fail %X %X || %s\n", rp[0], rp[1], line);
+					goto Break;
+				}
+				rp++;
+				i++;
+				break;
+			case L'×':
+				if(rp2 - rp == 0){
+					for(j = i; j < nops - 1; j++)
+						if(ops[j] != L'×')
+							print("skipped %d %d %s\n", i, nops, line);
+					goto Break;
+				}
+				for(; rp < (rp2-1); rp++, i++){
+					if(ops[i] != L'×')
+						print("skipped %d %d %s\n", i, nops, line);
+				}
+				rp = rp2;
+				i++;
+				break;
+			}
+		}
+Break:
+		;
+	}
+	Bterm(b);
+}
+
+void
+main(int, char**)
+{
+	run("/lib/ucd/GraphemeBreakTest.txt", runegbreak);
+	run("/lib/ucd/WordBreakTest.txt", runewbreak);
+	exits(nil);
+}
--- /dev/null
+++ b//sys/src/libc/test/runenorm.c
@@ -1,0 +1,106 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+/*
+ * parse s as a hexadecimal code point;
+ * dies if s does not begin with a hex number.
+ */
+static Rune
+estrtoul(char *s)
+{
+	char *epr;
+	Rune code;
+
+	code = strtoul(s, &epr, 16);
+	if(s == epr)
+		sysfatal("bad code point hex string");
+	return code;
+}
+
+/*
+ * normalize the first column of every data line of
+ * NormalizationTest.txt with runenorm and utfnorm and
+ * compare against the second (nfc) and third (nfd) columns.
+ * mismatched text is printed; mismatched lengths assert.
+ */
+void
+main(int, char**)
+{
+	Rune buffer1[64];
+	Rune buffer2[64];
+	char utfbuff1[128];
+	char utfbuff2[128];
+	char srctmp[128], tmp1[128], tmp2[128];
+	char *fields[10];
+	char *runes[32];
+	char *p;
+	int n, n2;
+	int i;
+	uint fail;
+	Biobuf *b;
+
+	b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not open /lib/ucd/NormalizationTest.txt: %r");
+
+	struct {
+		Rune src[32];
+		Rune nfc[32];
+		Rune nfd[32];
+	} test;
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#' || p[0] == '@')
+			continue;
+		if(getfields(p, fields, 6 + 1, 0, ";") < 3)
+			sysfatal("malformed test line");
+		/* at most nelem-1 code points so the terminating 0 fits */
+		n = getfields(fields[0], runes, nelem(test.src)-1, 0, " ");
+		for(i = 0; i < n; i++)
+			test.src[i] = estrtoul(runes[i]);
+		test.src[i] = 0;
+
+		n = getfields(fields[1], runes, nelem(test.nfc)-1, 0, " ");
+		for(i = 0; i < n; i++)
+			test.nfc[i] = estrtoul(runes[i]);
+		test.nfc[i] = 0;
+
+		n = getfields(fields[2], runes, nelem(test.nfd)-1, 0, " ");
+		for(i = 0; i < n; i++)
+			test.nfd[i] = estrtoul(runes[i]);
+		test.nfd[i] = 0;
+
+		n = runenorm(buffer1, test.src, nelem(buffer1), 1);
+		n2 = runenorm(buffer2, test.src, nelem(buffer2), 0);
+		fail = 0;
+
+		if(runestrcmp(buffer1, test.nfc) != 0)
+			fail |= 1<<0;
+		if(runestrcmp(buffer2, test.nfd) != 0)
+			fail |= 1<<1;
+		if(fail)
+			print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2, buffer1);
+		assert(n == runestrlen(test.nfc));
+		assert(n2 == runestrlen(test.nfd));
+
+		/* repeat the checks through the UTF-8 interface */
+		snprint(srctmp, sizeof srctmp, "%S", test.src);
+		snprint(tmp1, sizeof tmp1, "%S", test.nfc);
+		snprint(tmp2, sizeof tmp2, "%S", test.nfd);
+
+		n = utfnorm(utfbuff1, srctmp, nelem(utfbuff1), 1);
+		n2 = utfnorm(utfbuff2, srctmp, nelem(utfbuff2), 0);
+
+		if(strcmp(utfbuff1, tmp1) != 0)
+			fail |= 1<<2;
+		if(strcmp(utfbuff2, tmp2) != 0)
+			fail |= 1<<3;
+		if(fail)
+			print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2, utfbuff1);
+		assert(n == strlen(tmp1));
+		assert(n2 == strlen(tmp2));
+	}
+	Bterm(b);
+	exits(nil);
+}