OK, turing.

<- leave blank

Fri Mar 24 11:15:22 EDT 2023


     RUNECOMP(2) RUNECOMP(2)

     NAME
	  runecomp, runedecomp, fullrunenorm, runegbreak, runewbreak,
	  utfcomp, utfdecomp, fullutfnorm, utfgbreak, utfwbreak -
	  multi-rune graphemes

     SYNOPSIS
	  #include <u.h>
	  #include <libc.h>

	  int runecomp(Rune *dst, Rune *src, int max)

	  int runedecomp(Rune *dst, Rune *src, int max)

	  Rune* fullrunenorm(Rune *s, int n)

	  Rune* runegbreak(Rune *s)

	  Rune* runewbreak(Rune *s)

	  int utfcomp(char *dst, char *src, int max)

	  int utfdecomp(char *dst, char *src, int max)

	  char* fullutfnorm(char *s, int n)

	  char* utfgbreak(char *s)

	  char* utfwbreak(char *s)

     DESCRIPTION
	  These routines help in handling graphemes that may span mul-
	  tiple runes.

	  Runecomp, runedecomp, utfcomp, and utfdecomp perform Uni-
	  code® normalization on src, storing the result in dst.  No
	  more then max elements will be written, and the resulting
	  string will always be null terminated.  The return value is
	  always the total number of elements required to store the
	  transformation.  If this value is larger then the supplied
	  max the caller can assume the result has been truncated.
	  Runecomp and utfcomp perform NFC normalization while
	  runedecomp and utfdecomp perform NFD normalization.

	  Fullrunenorm, and fullutfnorm determine if enough elements
	  are present in s to perform normalization.  If enough are
	  present, a pointer is returned to the first element that
	  begins the next context.  Otherwise s is returned.  No more
	  then n elements will be read.

	  Runegbreak, and utfgbreak search s for the next grapheme
	  break opportunity.  If none are found before the end of the
	  string, s is returned.

	  Runewbreak, and utfwbreak search s for the next word break
	  opportunity.  If none are found before the end of the
	  string, s is returned.

     SOURCE
	  /sys/src/libc/port/mkrunetype.c
	  /sys/src/libc/port/runenorm.c
	  /sys/src/libc/port/runebreak.c

     SEE ALSO
	  Unicode® Standard Annex #15
	  Unicode® Standard Annex #29
	  rune(2), utf(6), tcs(1)



Fri Mar 24 09:32:37 EDT 2023
JOiDoeivtOWFq+mBkyTvvIjnrKzkuIDniYjvvIlieSDkuIvljYjkuInngrnljYoKCjIwMjAuOS4yOO+8jOS4pOS6uuato+W8j+W8gOWni+WQjOahjOOAguS4pOS6uuWQjOahjOebtOWIsDIwMjAuMTAuMTnjgIIKCuS6i+WunuS4iuS4pOS6uuiupOivhuaYr+eUseS6juWcqOS/oeaBr+mYn+mbhuiureOAguesrOS4gOWkqTE144CBMzDjgIE0NuS4ieS6uuWOu+aJvum7hOafkOmcnuS4reWNiOivt+WBh+WOu+acuuaIv+WBmumimO+8jOivt+WBh+adoeWunumZheS4iuaYrzMw5YaZ55qE44CC5L2G5ZCO5p2lMTXnlLHkuo7lpKrllpzmrKLkuK3ljYjnnaHop4nkuobvvIzmiYDku6XlsLHmsqHmnInlho3ljrvkuobjgIIxNemCo+S4gOWkqei/mOmXruS6hjMw77yM6ZeuNDbnmoRpZOaYr+S7gOS5iOOAguS9huaYrzMw5oeS5b6X55CGMTXvvIzmiYDku6XlsLHlj6sxNeiHquW3seWOu+mXrjQ244CC77yI5aW977yM5beu54K55b+Y5LqG6L+Z5pys5Lic6KW/5Y+r6IOh6K+05YWr6YGT77yM5YGP6aKY5LqG77yJCgrlkIzmoYzpgqPkvJrlhL/vvIzmgbDpgKJDU1DliJ3otZvvvIwxNeW3ruWNiuWIhui/h+S9huaYrzMw6L+H57q/5LqG44CCMTXor7Toh6rlt7E2N+WIhu+8jOeEtuWQjjMw6ams5LiK6Lez6LW35p2l6K+05ZOH5aGe5L2g5aW95by65ZWK5q+UNDbpq5jkuoY45YiG77yB54S25ZCO5bCx56qc6L+H5Y675ZGK6K+JNDbkuobvvIHvvIjmnInmhI/mgJ3lkJfvvIzosIHmg7PlkYror4k0NuWViu+8iQoK5pyJ5LiA6IqC57u85ZCI6K++77yM6ICB5biI6YCa6L+H5oq95omR5YWL54mM55qE5pa55byP5Yaz5a6a5YiG57uE44CC6YKj5aSpMzDmir3liLDkuobkuIDlvKDpu5HmoYM544CC6ICB5biI6K+05LiN6IO95pKV6L+Z5Liq5aSn5Z6L55qE5omR5YWL54mM77yM5LiN6IO95Zyo5LiK6Z2i5YaZ5YaZ55S755S777yM5LiN6IO95oqY5Y+g5a6D44CC5LqO5pivMzDmiormiZHlhYvniYzlkIPkuobvvIznhLblkI7lkYror4nkuoYxNe+8jOi/mOivtOi/meagt+W5tuS4jei/neazleOAguWboOS4uuS7luayoeacieaKmOWPoOOAgeaSleavgeaIluiAheWcqOS4iumdouWGmeWGmeeUu+eUu+OAguS7luWPquaYr+aKiuaJkeWFi+eJjOWQg+S6huOAgu+8iOWGmei/meS4queahOaXtuWAmeecn+eahOeskeW+l+aIkeacieeCueW0qea6g++8jOi/meWwseaYr3d4ZuivtOeahOKAnOWGmeW+l+iuqeS6uuS4gOeci+WwseefpemBk+aYr3Rh4oCd55qE5paH56ug5ZCX77yM5oeC5LqG77yJCgrmnInkuIDlpKnliY3lkI7lt6blj7Plh6DkurrorqjorrrnlJ/ml6XvvIwzMOivtOiHquW3seaYrzA45bm0OeaciDTml6XnlJ/ml6XnmoTvvIwxNeWQrOWujOaDiuWPueS6huS4gOWjsO+8jOeEtuWQjuivtOS6huS4gOWPpeKAlOKAlArigJzkvaDnlJ/ml6Xlkoznjovlv4Plh4zlsLHlt67kuIDlpKnor7bvvIHigJ0KMzDvvJrvvJ8K77yIb2ggYmFieeaDheivneWkmuivtOS4gOeCue+8ie+8iOS4jeaYr++8iQrlhbblrp7lkI7mnaUxNeiAgeaYr+iusOmUmTMw55Sf5pel5bCx5piv5Zug5Li6546L5b+D5YeM77yI5LiN5piv77yJ44CCCuWFtuWunuS5i+aJgOS7pTE15Lya6L+Z5LmI6K+05piv5Zug5Li65Yia5Yia5p+l5a6M546L5b+D5YeM55Sf5pel44CCCgrpgZPlvrfkuI7ms5Xmsrvor77vvIzogIHluIjorqnlkIzmoYzorqjorrrmnIDnvqHmhZXmlofkuK3kuKTkvY3kurrnianvvIhta3MmZWdz77yJ55qE5ZOq5pa56Z2i55qE5Y+L6LCK77yM54S25ZCOMzDot58xNeivtO+8jOacgOe+oeaFleKAnGVnc+e7meS6iG1rc+W+iOWkmumSseKAneOAgu+8iOWlveWQp++8jOS7luWcqOmihOiogDE1NDblhavkuIvliafmg4XlkJfvvJ/vvIkKCuWOhuWPsuivvu+8jOiusueZvuWutuS6iem4o++8jOivvuWJjeivu+S5pu+8jDMw5LiOMTXnibnliKvlnLDlpKflo7DvvIzlvojpu5jlpZHlnLDmiormiYDmnInlkI3or43pg73lv7XmiJDigJzlpKflpYbigJ3vvIznhLblkI7ooqszOOiusOWQjeS6hu+8jOWXr++8jOS4gOi1t+iusOeahOOAggoK5pWw5a2m6K++77yM6buE5p+Q6Zye6KeB5LqM5Lq65bm25LiN6K6k55yf5ZCs6K++5Y+q5Zyo5Y+R5ZGG77yM6K+077ya4oCc6K+25L2g5Lus5a2m5L+h5oGv55qE5piv5LiN5piv5bCx5piv5Zac5qyi5oqK5LiA5aCG56yU5pGK5Zyo5qGM6Z2i54S25ZCO5ZWl5Lmf5LiN5bmy5ZWK77yf4oCd54S25ZCO5Lik5Lq66Lqr5ZCO55qENDflub3lub3lnLDmnaXkuobkuIDlj6XvvJrigJx6YXPourrmnqrkuobjgILigJ0KCuacieS4gOWkqeWNiOS8ke+8jDQ35pCeMzDnmoTmpIXlrZDvvIzlm6DkuLrljYjkvJHnmoTml7blgJkzMOWOu+S6huacuuaIv++8jOS4jeWcqOOAgueEtuWQjjQ35bCx5oqK5ZyG6KeE5o+S5YiwMzDluqfkvY3nmoTkuIvovrnvvIjorrDlvpfliJ3kuIDml7blgJnnmoTmpIXlrZDlkJfvvJ/mnInnvJ3nmoTvvIzliJrlpb3lj6/ku6XloZ7lvpfkuIvlnIbop4TvvInvvIznhLblkI7lgbflgbfot58xNeivtOi/meagt+WPr+S7peW6n+aOiTMw5ZCO5Y2K55Sf44CC44CC44CC5b2T54S2MTXlvojlv6vlsLHor7Tlh7rljrvkuobvvIHvvIjlpb3lkKfvvIzmiJHlv5jkuobmmK8zMOaQnjQ355qE6L+Y5pivNDfmkJ4zMOeahO+8jOWPjeato+W3ruS4jeWkmu+8ge+8iQoK5ZCM5qGM5pyA5ZCO5LiA5aSp77yMMTXnnIvnnYDmlrDnmoTluqfkvY3ooajlj5HlkYbvvIzpg4Hpl7flnLDnlKjpkqLlsLrliK7msLTlo7bvvIzliK7lh7rkuIDloIbloZHmlpnlsZHmnaXvvIjpmaTmraTku6XlpJbvvIzov5jliK7kuobmoYzlrZDjgIHmpIXlrZDvvInjgILnhLblkI4zMOW+iOW0qea6g++8jOaIkeS7rOS5n+S4jeefpemBk+S7luWIsOW6leWcqOW0qea6g+S7gOS5iO+8jOS4gOebtOi3nzE16K+077yM5L2g6IO95LiN6IO95LiN6KaB5Yiu5LqG77yM5L2g6IO95LiN6IO95LiN6KaB5Yiu5LqG44CCMTXnv7vkuobkuKrnmb3nnLzor7TkuI3og73vvIzkvYbmmK/lhbblrp7lgZzmiYvkuobjgILvvIjku47mraTku6XlkI4xNeaDszMw5LqG5bCx5Lya5Yiu5rC05aO244CC77yJCgo55pyI6IezMTHmnIjvvIwzMOWSjDQ25LiN5YGc5Zyw5py65oOoMTXvvIznhLblkI4zMOi/mOi3keWIsOeUqOaIt+S4vuaKpeS4k+WMuuS4vuaKpTE155qE6L+d6KeE6KGM5Li677yI5YW25a6e6L+d6KeE6KGM5Li65bCx5pivMzDmi78xNeeahOWPt+aQnueahO+8ieOAgjE15ZCO5p2l6LSo6Zeu5piv6LCB5bmy55qE77yMMzDor7TmmK80NuW5sueahO+8jDQ26K+05pivMzDlubLnmoTvvIzkuozkurrlnYfljYHliIbnnJ/mjJrjgILvvIjlhbblrp7ov5nmmK/ku5bku6zkv6nlr7kxNeacgOecn+aMmueahOS4gOasoe+8iOS4jeaYr++8ie+8ieWboOS4uuW9k+aXtjE155u45L+hMzDvvIzmiYDku6XorqTkuLrov5nku7bkuovmg4XkuIDlrprmmK80NuW5sueahOOAgu+8iOi/meaYr+S4gOacrOato+e7j+adkOaWmeWvueWQp+OAgu+8ie+8iOWFtuWunuacieWlveWHoOasoeexu+S8vOeahO+8jOWPquS4jei/h+W5tOS7o+S5hei/nDE15Lmf5b+Y5LqG77yJCgrlkI7mnaXvvIwxNeWWnOasojQ277yI5L2g56Gu5a6a6L+Z5piv6IOh6K+05YWr6YGT77yf5LiN5piv5LiA5pys5q2j57uP77yf77yJ77yM5b6I5b+r5bCx5ZGK6K+J5LqGMzDjgILmiJHor7TkvaDku6zlj6/og73kuI3kv6HigKbigKYxNei3nzMw5ZCM5qGM55qE5pe25YCZ6L+Y5b6I5oOz6LCD6LWw5pyJ5LiA5Liq5aWz5ZCM5qGM77yM5L2G5piv5o2i5bqn5L2N5LqG5Lul5ZCO5Y+I5aW95YOP5pyJ54K55oOz5b+1MzDvvIzmiYDku6XlsLHljrvmib4zMOOAgueEtuWQjjMw5aiB6IOBMTXkuIDlrpropoHor7TvvIwxNeivtHbmiJE1MOaIkeWwseivtO+8iOWOn+ivneW9k+eEtuS4jeaYr+i/meagt+eahOWVpu+8jHbmiJE1MOi/meaYrzIwMjLmiY3mnInnmoTor43or63llabjgILlj43mraMxNeWwsei/meS4quaEj+aAne+8ieOAgjMw5b2T5pe25bCx5oqKNDbnmoTppa3ljaHpgJLov4fmnaXor7TkvaDmi7/ov5nkuKrljaHljrvliLc1MHLigKbigKYxNeiCr+WumuWPjeW6lOi/h+adpeS6hui/meS4jeaYrzMw55qE6aWt5Y2h77yM54S25ZCO5LuU57uG5LiA55yL6L+Z5aW95YOP5pivNDbnmoTigKbigKbnhLblkI7lkI7mnaXlkYror4nkuoYzMOOAgui/meS4quWRiuivieeahOi/h+eoi+S5n+ibruacieaEj+aAne+8mjE155So55y856We56S65oSPMzDlpbnor7TnmoTmmK80Nu+8jOeEtuiAjDMw56ys5LiA5qyh5rKh5pyJZ2V05Yiw77yMMTXor7TkvaDnn6XpgZPkuoblkKfvvIwzMOivtOaIkeS4jeefpemBk+OAgueEtuWQjjE15Y+I5oyH5LqG5LiA5qyh77yMMzDnn6XpgZPkuobvvJsxNeaAlTMw6K6k6ZSZ54S25ZCO6KaBMzDmjIfkuIDkuIvigJzkvaDmjIfkuIDkuIvmiJHnnIvkvaDmnInmsqHmnInorqTplJnjgILigJ3lvZPml7YzMOWSjDQ2562J5Lq65Zyo6K6y5Y+w5LiK6Z2i546p6K6y5Y+w55qE55S16ISR77yM54S25ZCOMzDlsLHpobrlir/mjIfkuobmjIc0NuOAgjE16K+05piv5LuW5ZOI5ZOI5ZOI5ZOI5ZOI5ZOI5LiN6KaB5ZGK6K+J5LuW77yM54S25ZCO6LeR5byA5LqG44CC77yI5oiR5aOw5piO77yMMTXkuI3llpzmrKI0NuOAguS9huaYr+mCo+S4quaXtuWAmeWPr+iDveaYr+WWnOasoueahOOAgu+8iQoK54S25ZCO6YKj5aSp5L2T6ZS76K++77yM55Sx5LqO5a2m5Y+355qE5Y6f5Zug4oCm4oCmMTXlj4jlnZAzMOaXgei+ueOAgueEtuWQjjE15b2T5pe25Y+q5piv5oOz6ZqP5L6/5om+MzDorrLor53orrLngrnku4DkuYjvvIjlm6DkuLrpgqPkuKrml7blgJkxNeeci+edgDMw5bCx5oOz6Lef5LuW6K6y6K+d77yM5oiR5Y+R6KqT5piv6L+Z5qC355qE77yM6LefNDbmsqHmnInlhbPns7vvvIzll6/jgILvvInvvIzkuo7mmK/lpbnljrvpmo/lj6Ppl67kuobkuIDlj6XvvJrigJzkvaDmsqHmnInlkYror4nku5blkKfvvJ/igJ3nu5PmnpwzMOaWqemSieaIqumTgeWcsOivtO+8muKAnOaIkeWRiuivieS6huWViu+8geKAnTE177ya77yf77yf77yf6YKj5LuW5oCO5LmI6K+077yfMzDlvZPml7blsLHkuIDohLjlkIPnk5zvvIjvvJ/vvInnmoTmoLflrZDot58xNeivtO+8muKAnOS7luivtOS4jeWPr+iDve+8geKAneeEtuWQjuingTE155aR5oOR77yMMzDlj4jmnaXkuobkuIDlj6XvvJrigJznnJ/nmoTvvIHmiJHor7TkuobkuKTpgY3ku5bpg73or7TkuI3lj6/og73vvIHigJ0xNe+8mu+8n++8iOeUseatpOWPr+ingTMw5Lmf5piv5Liq5Zi05ryP55qE77yM5YW25a6e5pys6LSo5LiK6LefMDjmsqHku4DkuYjljLrliKvjgILvvIkKCjIwMjHlubQx5pyINeaXpe+8jOWPiOWQjuadpe+8jDE15Z2Q5ZyoMzDnmoTlkI7pnaLvvIwxNemCo+S4quaXtuWAmQ==

Fri Mar 24 03:11:11 EDT 2023

     RUNECOMP(2) RUNECOMP(2)

     NAME
	  runecomp, runedecomp, fullrunenorm, runegbreak, runewbreak,
	  utfcomp, utfdecomp, fullutfnorm, utfgbreak, utfwbreak -
	  multi-rune graphemes

     SYNOPSIS
	  #include <u.h>
	  #include <libc.h>

	  int runecomp(Rune *dst, Rune *src, int max)

	  int runedecomp(Rune *dst, Rune *src, int max)

	  Rune* fullrunenorm(Rune *s, int n)

	  Rune* runegbreak(Rune *s)

	  Rune* runewbreak(Rune *s)

	  int utfcomp(char *dst, char *src, int max)

	  int utfdecomp(char *dst, char *src, int max)

	  char* fullutfnorm(char *s, int n)

	  char* utfgbreak(char *s)

	  char* utfwbreak(char *s)

     DESCRIPTION
	  These routines help in handling graphemes that may span mul-
	  tiple runes.

	  Runecomp, runedecomp, utfcomp, and utfdecomp perform Uni-
	  code® normalization on src, storing the result in dst.  No
	  more then max elements will be written, and the resulting
	  string will always be null terminated.  The return value is
	  always the total number of elements required to store the
	  transformation.  If this value is larger then the supplied
	  max the caller can assume the result has been truncated.
	  Runecomp and utfcomp perform NFC normalization while
	  runedecomp and utfdecomp perform NFD normalization.

	  Fullrunenorm, and fullutfnorm determine if enough elements
	  are present in s to perform normalization.  If enough are
	  present, a pointer is returned to the first element that
	  begins the next context.  Otherwise s is returned.  No more
	  then n elements will be read.

	  Runegbreak, and utfgbreak search s for the next grapheme
	  break opportunity.  If none are found before the end of the
	  string s is returned.

	  Runewbreak, and utfwbreak search s for the next word break
	  opportunity.  If none are found before the end of the string
	  s is returned.

     SOURCE
	  /sys/src/libc/port/mkrunetype.c
	  /sys/src/libc/port/runenorm.c
	  /sys/src/libc/port/runebreak.c

     SEE ALSO
	  Unicode® Standard Annex #15
	  Unicode® Standard Annex #29
	  rune(2), utf(6), tcs(1)



Thu Mar 23 21:12:35 EDT 2023

     RUNENORM(2) RUNENORM(2)

     NAME
	  runedecomp, runerecomp, runenorm, utfnorm, fullrunenorm,
	  fullutfnorm, runegbreak, runewbreak - multi-rune graphemes

     SYNOPSIS
	  #include <u.h>
	  #include <libc.h>

	  void runedecomp(Rune dst[2], Rune src)

	  Rune runerecomp(Rune r[2])

	  int runenorm(Rune *dst, Rune *src, int max, int compose)

	  int utfnorm(char *dst, char *src, int max, int compose)

	  Rune* fullrunenorm(Rune *s, int n)

	  char* fullutfnorm(char *s, int n)

	  Rune* runegbreak(Rune *r)

	  Rune* runewbreak(Rune *r)

     DESCRIPTION
	  These routines help in handling graphemes that may span mul-
	  tiple runes.

	  Runedecomp decomposes the rune src and places the two decom-
	  posed runes into dst.  If no decomposition is found dst is
	  zeroed.

	  Runerecomp composes the two runes provided in r and returns
	  the result.  If no composition is found 0 is returned.

	  Runenorm (Utfnorm) copies the rune (UTF) sequence src to dst
	  while performing Unicode normalization.  No more then max
	  runes (bytes) will be copied, and the result is always null
	  terminated.  If compose is non-zero NFC normalization is per-
	  formed, otherwise NFD normalization is performed.

	  Fullrunenorm (fullutfnorm) determines if enough runes
	  (bytes) are present in s to perform normalization.  If enough
	  is present, a pointer is returned to the first rune (byte)
	  that begins the next context, otherwise s is returned.  No
	  more then n runes (bytes) will be read.

	  Runegbreak (Runewbreak) searches r for the next grapheme
	  (word) break opportunity.  If none are found before the end
	  of the string r is returned.

     SOURCE
	  /sys/src/libc/port/mkrunetype.c
	  /sys/src/libc/port/runenorm.c
	  /sys/src/libc/port/runebreak.c

     SEE ALSO
	  Unicode® Standard Annex #15
	  Unicode® Standard Annex #29
	  rune(2), utf(6), tcs(1)



Thu Mar 23 14:58:42 EDT 2023
diff 4c1d2d44db117f01d4af59d18d06c5fc0b20f3c1 uncommitted
--- a/tree.c
+++ b/tree.c
@@ -654,7 +654,6 @@
		if(d == l)
		if((i == b->nval-2) || (i >= 2 && copied >= halfsz)){
			d = r;
- full = 0;
			spc = Leafspc - (halfsz + Msgmax);
			getval(b, i, mid);
		}


Thu Mar 23 08:36:43 EDT 2023
From 8afe25e31e00575ae9e123348bbcf9d9cb948bfd Mon Sep 17 00:00:00 2001
From: Lennart Jablonka <humm@ljabl.com>
Date: Thu, 23 Mar 2023 13:30:17 +0100
Subject: [PATCH gridirc] don't require PRIVMSGs to have a colon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When relaying an IRC message to gridchat, everything up to and including
the first colon is stripped:

	PRIVMSG #chat :hello there

becomes

	humm → hello there

If a message does not contain whitespace, it need not start in a colon,
though:

	PRIVMSG #chat hello

would get relayed as

	humm → PRIVMSG chat hello
---
 gridirc.rc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gridirc.rc b/gridirc.rc
index a8338cc..d7cfb7a 100755
--- a/gridirc.rc
+++ b/gridirc.rc
@@ -129,7 +129,9 @@ $1 ~ /^JOIN$/ {
 $1 ~ /^PRIVMSG$/ {
	sub(/^#/, "", $2)
	chan=$2
- sub(/^[^:]*:/, "")
+ $1 = ""
+ $2 = ""
+ sub(/^ *:?/, "")
	file=sprintf("/n/chat/%s", chan)
	printf "%s → %s\n", nick, $0 >> file
	fflush
--
2.39.2



Thu Mar 23 07:12:21 EDT 2023
diff --git a/Makefile b/Makefile
index f455219..801b634 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-PROGS=9pex 9gc
+PROG=9pex 9gc
 PREFIX?=/usr/local
 BIN=${DESTDIR}${PREFIX}/bin
 CFLAGS?=-g -O2
@@ -17,7 +17,7 @@ default: 9pex 9gc
 9gc: ${COMMON_O} ${GC_O}
	${CC} -o $@ ${COMMON_O} ${GC_O}

-install: progs
+install: all
	install -d ${BIN}
	install -m 755 ${PROG} ${BIN}


Thu Mar 23 07:11:39 EDT 2023
diff --git a/Makefile b/Makefile
index f455219..801b634 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-PROGS=9pex 9gc
+PROG=9pex 9gc
 PREFIX?=/usr/local
 BIN=${DESTDIR}${PREFIX}/bin
 CFLAGS?=-g -O2
@@ -17,7 +17,7 @@ default: 9pex 9gc
 9gc: ${COMMON_O} ${GC_O}
	${CC} -o $@ ${COMMON_O} ${GC_O}

-install: progs
+install: all
	install -d ${BIN}
	install -m 755 ${PROG} ${BIN}


Thu Mar 23 01:17:08 EDT 2023

     RUNENORM(2) RUNENORM(2)

     NAME
	  runedecomp, runerecomp, runenorm, utfnorm, fullrunenorm,
	  fullutfnorm, runegbreak, runewbreak - multi-rune graphemes

     SYNOPSIS
	  #include <u.h>
	  #include <libc.h>

	  void runedecomp(Rune dst[2], Rune src)

	  Rune runerecomp(Rune r[2])

	  int runenorm(Rune *dst, Rune *src, int max, int compose)

	  int utfnorm(char *dst, char *src, int max, int compose)

	  Rune* fullrunenorm(Rune *s, int n)

	  char* fullutfnorm(char *s, int n)

	  Rune* runegbreak(Rune *r)

	  Rune* runewbreak(Rune *r)

     DESCRIPTION
	  These routines help in handling graphemes that may span mul-
	  tiple runes.

	  Runedecomp decomposes the rune src and places the two decom-
	  posed runes into dst.  If no decomposition is found dst is
	  zerod.

	  Runerecomp composes the two runes provided in r and returns
	  the result.  If no composition is found 0 is returned.

	  Runenorm (Utfnorm) copies the rune (UTF) sequence src to dst
	  while performing Unicode normalization.  No more then max
	  runes (bytes) will be copied, and the result is always null
	  terminated.  If compose is non-zero NFC normalization is per-
	  formed, otherwise NFD normalization is performed.

	  Fullrunenorm (fullutfnorm) determines if enough runes
	  (bytes) are present in s to perform normalization.  If enough
	  is present, a pointer is returned to the first rune (byte)
	  that begins the next context, otherwise s is returned.  No
	  more then n runes (bytes) will be read.

	  Runegbreak (Runewbreak) searches r for the next grapheme
	  (word) break opportunity.  If none are found before the end
	  of the string r is returned.

     SOURCE
	  /sys/src/libc/port/mkrunetype.c
	  /sys/src/libc/port/runenorm.c
	  /sys/src/libc/port/runebreak.c

     SEE ALSO
	  rune(2), utf(6), tcs(1)



Wed Mar 22 21:17:40 EDT 2023
diff b8ae7708fb3ef3acbb30ccf3181897f8157c18de uncommitted
--- /dev/null
+++ b//lib/ucd/mkfile
@@ -1,0 +1,70 @@
+</$objtype/mkfile
+
+VERSION='15.0.0'
+URL='https://www.unicode.org/Public/'$VERSION'/ucd/'
+
+TXT=\
+ ArabicShaping.txt\
+ BidiBrackets.txt\
+ BidiMirroring.txt\
+ BidiTest.txt\
+ Blocks.txt\
+ CJKRadicals.txt\
+ CaseFolding.txt\
+ CompositionExclusions.txt\
+ DerivedAge.txt\
+ DerivedCoreProperties.txt\
+ DerivedNormalizationProps.txt\
+ EastAsianWidth.txt\
+ EmojiSources.txt\
+ EquivalentUnifiedIdeograph.txt\
+ HangulSyllableType.txt\
+ Index.txt\
+ IndicPositionalCategory.txt\
+ IndicSyllabicCategory.txt\
+ Jamo.txt\
+ LineBreak.txt\
+ NameAliases.txt\
+ NamedSequences.txt\
+ NamedSequencesProv.txt\
+ NamesList.txt\
+ NormalizationCorrections.txt\
+ NushuSources.txt\
+ PropList.txt\
+ PropertyAliases.txt\
+ PropertyValueAliases.txt\
+ ScriptExtensions.txt\
+ Scripts.txt\
+ SpecialCasing.txt\
+ StandardizedVariants.txt\
+ TangutSources.txt\
+ USourceData.txt\
+ UnicodeData.txt\
+ VerticalOrientation.txt\
+
+TEST=\
+ NormalizationTest.txt\
+ BidiCharacterTest.txt\
+
+PDF=\
+ USourceGlyphs.pdf\
+ USourceRSChart.pdf\
+
+AUX=\
+ WordBreakProperty.txt\
+ GraphemeBreakProperty.txt\
+
+ucd:V: UnicodeData.txt
+
+%.txt:
+ hget $URL^$target > $target >[2]/dev/null || hget
$URL^'auxiliary/'^$target > $target
+%.pdf:
+ hget $URL^$target > $target
+
+txt:V: $TXT
+
+pdf:V: $PDF
+
+test:V: $TEST
+
+all:V: $TXT $PDF $TEST
--- a//sys/include/libc.h
+++ b//sys/include/libc.h
@@ -77,6 +77,14 @@
 extern long runestrlen(Rune*);
 extern Rune* runestrstr(Rune*, Rune*);

+extern int runenorm(Rune*, Rune*, int, int);
+extern int utfnorm(char*,char*,int,int);
+extern char* fullutfnorm(char*,int);
+extern Rune* fullrunenorm(Rune*,int);
+
+extern Rune* runewbreak(Rune*);
+extern Rune* runegbreak(Rune*);
+
 extern Rune tolowerrune(Rune);
 extern Rune totitlerune(Rune);
 extern Rune toupperrune(Rune);
@@ -404,7 +412,7 @@
 extern int enc16chr(int);

 extern int encodefmt(Fmt*);
-extern void exits(char*);
+extern _Noreturn void exits(char*);
 extern double frexp(double, int*);
 extern uintptr getcallerpc(void*);
 extern char* getenv(char*);
@@ -431,7 +439,7 @@
 extern ulong strtoul(char*, char**, int);
 extern vlong strtoll(char*, char**, int);
 extern uvlong strtoull(char*, char**, int);
-extern void sysfatal(char*, ...);
+extern _Noreturn void sysfatal(char*, ...);
 #pragma varargck argpos sysfatal 1
 extern void syslog(int, char*, char*, ...);
 #pragma varargck argpos syslog 3
@@ -677,7 +685,7 @@
	ulong len;
 } IOchunk;

-extern void _exits(char*);
+extern _Noreturn void _exits(char*);

 extern void abort(void);
 extern int access(char*, int);
--- a//sys/src/cmd/tcs/hdr.h
+++ b//sys/src/cmd/tcs/hdr.h
@@ -23,6 +23,8 @@

 void utf_in(int, long *, struct convert *);
 void utf_out(Rune *, int, long *);
+void utfnfc_out(Rune *, int, long *);
+void utfnfd_out(Rune *, int, long *);
 void isoutf_in(int, long *, struct convert *);
 void isoutf_out(Rune *, int, long *);

--- a//sys/src/cmd/tcs/tcs.c
+++ b//sys/src/cmd/tcs/tcs.c
@@ -613,6 +613,10 @@
	{ "utf-16be", "alias for unicode-be (MIME)", Func, 0,
	(Fnptr)unicode_out_be },
	{ "utf-16le", "alias for unicode-le (MIME)", From|Func, 0,
	(Fnptr)unicode_in_le },
	{ "utf-16le", "alias for unicode-le (MIME)", Func, 0,
	(Fnptr)unicode_out_le },
+ { "nfc", "UTF Normalization Form C", From|Func, 0, (Fnptr)utf_in },
+ { "nfc", "UTF Normalization Form C", Func, 0, (Fnptr)utfnfc_out },
+ { "nfd", "UTF Normalization Form D", From|Func, 0, (Fnptr)utf_in },
+ { "nfd", "UTF Normalization Form D", Func, 0, (Fnptr)utfnfd_out },
	{ "viet1", "Vietnamese VSCII-1 (1993)", Table, (void *)tabviet1 },
	{ "viet2", "Vietnamese VSCII-2 (1993)", Table, (void *)tabviet2 },
	{ "vscii", "Vietnamese VISCII 1.1 (1992)", Table, (void *)tabviscii },
--- a//sys/src/cmd/tcs/utf.c
+++ b//sys/src/cmd/tcs/utf.c
@@ -19,38 +19,27 @@
 void
 utf_in(int fd, long *, struct convert *out)
 {
- char buf[N];
- int i, j, c, n, tot;
- unsigned long l;
+ char buf[N + 1];
+ Rune r;
+ char *p;
+ int n, tot, j;

	tot = 0;
+ j = 0;
	while((n = read(fd, buf+tot, N-tot)) >= 0){
		tot += n;
- for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i)));
){
- c = our_mbtowc(&l, buf+i, tot-i);
- if(c == -1){
- if(squawk)
- warn("bad UTF sequence near byte %ld in input", ninput+i);
- if(clean){
- i++;
- continue;
- }
- nerrors++;
- l = Runeerror;
- c = 1;
- }
- runes[j++] = l;
- i += c;
+ if(fullutfnorm(buf, tot) == buf)
+ continue;
+ /* fullutfnorm ensures rune boundary */
+ for(p = buf; p < buf + tot;){
+ p += chartorune(&r, p);
+ runes[j++] = r;
+ runes[j] = 0;
		}
		OUT(out, runes, j);
- tot -= i;
- ninput += i;
- if(tot)
- memmove(buf, buf+i, tot);
- if(n == 0)
- break;
+ j = 0;
+ tot = 0;
	}
- OUT(out, runes, 0);
 }

 void
@@ -66,6 +55,26 @@
	noutput += p-obuf;
	if(p > obuf)
		write(1, obuf, p-obuf);
+}
+
+void
+utfnfc_out(Rune *base, int n, long *)
+{
+ Rune buf[N + 1];
+ int w;
+
+ w = runenorm(buf, base, n + 1, 1);
+ utf_out(buf, w, nil);
+}
+
+void
+utfnfd_out(Rune *base, int n, long *)
+{
+ Rune buf[N + 1];
+ int w;
+
+ w = runenorm(buf, base, n + 1, 0);
+ utf_out(buf, w, nil);
 }

 void
--- a//sys/src/libc/port/mkfile
+++ b//sys/src/libc/port/mkfile
@@ -62,6 +62,9 @@
	rand.c\
	readn.c\
	rune.c\
+ runebreak.c\
+ runeistype.c\
+ runenorm.c\
	runestrcat.c\
	runestrchr.c\
	runestrcmp.c\
@@ -74,7 +77,7 @@
	runestrrchr.c\
	runestrlen.c\
	runestrstr.c\
- runetype.c\
+ runetotype.c\
	sin.c\
	sinh.c\
	sqrt.c\
@@ -127,3 +130,16 @@
 </sys/src/cmd/mksyslib

 profile.$O: /sys/include/tos.h
+
+runenorm.$O: runenormdata runenorm.c
+runetotype.$O: runetotypedata runetotype.c
+runeistype.$O: runeistypedata runeistype.c
+runebreak.$O: runebreakdata runebreak.c
+
+runenormdata runetotypedata runeistypedata runebreakdata: mkrunetype.c
+ @{
+ eval `{grep '^[A-Z]' /$cputype/mkfile}
+ $CC $CFLAGS -o mkrunetype.$O $prereq
+ $LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O
+ $O.mkrunetype
+ }
--- /dev/null
+++ b//sys/src/libc/port/mkrunetype.c
@@ -1,0 +1,761 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+enum{
+ NRUNES = 1<<21
+};
+
+typedef struct Param Param;
+typedef struct Lvl Lvl;
+struct Lvl{
+ int bits;
+ int max;
+ int mask;
+};
+struct Param{
+ Lvl idx1;
+ Lvl idx2;
+ Lvl data;
+
+ int round1max;
+};
+
+static void
+derive(Lvl *l)
+{
+ l->max = 1 << l->bits;
+ l->mask = l->max - 1;
+}
+
+static void
+param(Param *p, int idx1, int idx2)
+{
+
+ assert(idx1 + idx2 < 21);
+ p->idx1.bits = idx1;
+ p->idx2.bits = idx2;
+ p->data.bits = 21 - idx1 - idx2;
+ derive(&p->idx1);
+ derive(&p->idx2);
+ derive(&p->data);
+
+ p->round1max = NRUNES/p->data.max;
+}
+
+static int
+lkup(Param *p, int *idx1, int *idx2, int *data, int x)
+{
+ int y, z;
+
+ y = (((x)>>(p->data.bits+p->idx2.bits))&p->idx1.mask);
+ z = (((x)>>p->data.bits)&p->idx2.mask);
+ return data[idx2[idx1[y] + z] + (x&p->data.mask)];
+}
+
+static int
+mkarrvar(int fd, char *name, int *d, int len)
+{
+ int i, sz;
+ int max, min;
+ char *t;
+
+ max = min = 0;
+ for(i = 0; i < len; i++){
+ if(d[i] > max)
+ max = d[i];
+ if(d[i] < min)
+ min = d[i];
+ }
+ if(min == 0){
+ if(max < (uchar)~0)
+ t = "uchar", sz = 1;
+ else if(max < 0xFFFF)
+ t = "ushort", sz = 2;
+ else
+ t = "uint", sz = 4;
+ } else {
+ if(max < 1<<7)
+ t = "char", sz = 1;
+ else if(max < 1<<15)
+ t = "short", sz = 2;
+ else
+ t = "int", sz = 4;
+ }
+ if(fd < 0)
+ return sz * len;
+
+ fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len);
+ for(i = 0; i < len; i++){
+ fprint(fd, "%d,", d[i]);
+ if((i+1) % 16 == 0)
+ fprint(fd, "\n\t");
+ }
+ fprint(fd, "\n};\n");
+
+ return sz * len;
+}
+
+static int
+mkexceptarr(int fd, char *name, int *d, int n, int all)
+{
+ int i;
+ fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 3 : 2);
+ for(i = 0; i < n*3; i += 3){
+ if(all && d[i] != 0)
+ fprint(fd, "{0x%X, 0x%X, 0x%X},", d[i], d[i+1], d[i+2]);
+ else if(!all)
+ fprint(fd, "{0x%X, 0x%X},", d[i+1], d[i+2]);
+ if((i+3) % (8*3) == 0)
+ fprint(fd, "\n\t");
+ }
+ fprint(fd, "\n};\n");
+ return n * sizeof(Rune) * 2;
+}
+
+static int
+compact(int *data, int *idx, int nidx, int *src, int chunksize)
+{
+ int i, n, ndata, best;
+ int *dot, *lp, *rp;
+
+ dot = src;
+ ndata = 0;
+ idx[0] = 0;
+ for(i = 1; i <= nidx; i++){
+ rp = dot + chunksize;
+ lp = rp - 1;
+
+ for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){
+ if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0)
+ best = n+1;
+ }
+ memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]);
+ ndata += (chunksize - best);
+ idx[i] = idx[i - 1] + (chunksize - best);
+ dot = rp;
+ }
+ return ndata;
+}
+
+
+static int
+mklkup(int fd, char *label, int *map, Param *p)
+{
+ static int data[NRUNES];
+ static int idx2[NRUNES];
+ static int idx2dest[NRUNES];
+ static int idx1[NRUNES];
+ int i, nidx2, ndata;
+ int size;
+
+ ndata = compact(data, idx2, p->round1max, map, p->data.max);
+ nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max);
+
+ if(fd >= 0){
+ for(i = 0; i < NRUNES; i++)
+ if(map[i] != lkup(p, idx1, idx2dest, data, i))
+ sysfatal("mismatch in %s at %d %d %d\n", label, i, map[i], lkup(p, idx1,
idx2dest, data, i));
+ }
+
+ size = mkarrvar(fd, smprint("_%sdata", label), data, ndata);
+ size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2);
+ size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max);
+ if(fd >= 0){
+ fprint(fd, "\n");
+ fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label,
p->data.bits, p->idx2.bits, p->idx1.mask);
+ fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label,
p->data.bits, p->idx2.mask);
+ fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask);
+ fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] +
%sindex2(x)] + %soffset(x)] )\n\n",
+ label, label, label, label, label, label, label);
+ }
+ return size;
+}
+
+static void
+mklkupmatrix(char *label, int *map, Param *p)
+{
+ int bestsize, size, bestx, besty;
+ int x, y;
+
+ bestsize = bestx = besty = -1;
+ for(x = 4; x <= 12; x++)
+ for(y=4; y <= (19 - x); y++){
+ param(p, x, y);
+ size = mklkup(-1, label, map, p);
+ if(bestsize == -1 || size < bestsize){
+ bestx = x;
+ besty = y;
+ bestsize = size;
+ }
+ }
+
+ assert(bestsize != -1);
+ fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize);
+ param(p, bestx, besty);
+}
+
+static int myismerged[NRUNES];
+static int mytoupper[NRUNES];
+static int mytolower[NRUNES];
+static int mytotitle[NRUNES];
+static int mybreak[NRUNES];
+
+enum{ DSTART = 0xEEEE };
+static int mydecomp[NRUNES];
+static int mydespecial[256*3];
+static int nspecial;
+static int myccc[NRUNES];
+
+typedef struct KV KV;
+struct KV{
+ uint key;
+ uint val;
+ ushort next;
+};
+
+static KV myrecomp[2000];
+static int nrecomp;
+
+static int recompext[256*3];
+static int nrecompext;
+
+static uint
+hash(uint x)
+{
+ x ^= x >> 16;
+ x *= 0x21f0aaad;
+ x ^= x >> 15;
+ x *= 0xd35a2d97;
+ x ^= x >> 15;
+ return x;
+}
+
+static void
+mkrecomp(int fd)
+{
+ int i;
+ KV *p;
+ static KV vals[512];
+ static KV coll[1000];
+ int over;
+ int maxchain;
+
+ for(i = 0; i < nelem(vals); i++)
+ vals[i] = (KV){0, 0, 0};
+ for(i = 0; i < nelem(coll); i++)
+ coll[i] = (KV){0, 0, 0};
+ over = 1;
+ for(i = 0; i < nrecomp; i++){
+ p = vals + (hash(myrecomp[i].key) % nelem(vals));
+ maxchain = 0;
+ while(p->key != 0){
+ maxchain++;
+ if(p->next == 0){
+ p->next = over;
+ p = coll + over - 1;
+ over++;
+ } else
+ p = coll + p->next - 1;
+ }
+ p->key = myrecomp[i].key;
+ p->val = myrecomp[i].val;
+ }
+ fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) +
over-1) * (4+2+2));
+ fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t");
+ for(p = vals, i = 0;; i++){
+ assert(p->val < 0xFFFF);
+ assert(p->next < 0xFFFF);
+ fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16));
+ if((i+1) % 8 == 0)
+ fprint(fd, "\n\t");
+
+ if(p == vals+nelem(vals)-1)
+ p = coll;
+ else if(p == coll + over - 2)
+ break;
+ else
+ p++;
+ }
+ fprint(fd, "\n};\n");
+ fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals));
+ /*
+ fprint(fd,
+ " x ^= x >> 16;\n"
+ " x *= 0x21f0aaad;\n"
+ " x ^= x >> 15;\n"
+ " x *= 0xd35a2d97;\n"
+ " x ^= x >> 15;\n"
+ " p = _recompdata + (x%%%d)*2;\n"
+ "}\n", nelem(vals));
+ */
+}
+
+static void
+mktables(void)
+{
+ Param p;
+ int tofd, isfd, normfd, breakfd;
+ int size;
+
+ tofd = create("runetotypedata", OWRITE, 0664);
+ if(tofd < 0)
+ sysfatal("could not create runetotypedata: %r");
+ param(&p, 10, 7);
+ size = mklkup(tofd, "upper", mytoupper, &p);
+ fprint(2, "%s: %d\n", "upper", size);
+
+ size = mklkup(tofd, "lower", mytolower, &p);
+ fprint(2, "%s: %d\n", "lower", size);
+
+ size = mklkup(tofd, "title", mytotitle, &p);
+ fprint(2, "%s: %d\n", "title", size);
+ close(tofd);
+
+ isfd = create("runeistypedata", OWRITE, 0664);
+ if(isfd < 0)
+ sysfatal("could not create runeistypedata: %r");
+ param(&p, 11, 6);
+ size = mklkup(isfd, "merged", myismerged, &p);
+ fprint(2, "%s: %d\n", "merged", size);
+ fprint(isfd, "static\nenum {\n");
+ fprint(isfd, "\tL%s = %s,\n", "space", "1<<0");
+ fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1");
+ fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2");
+ fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3");
+ fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4");
+ fprint(isfd, "\tL%s = %s,\n", "title", "1<<5");
+ fprint(isfd, "};\n");
+ close(isfd);
+
+ normfd = create("runenormdata", OWRITE, 0664);
+ if(normfd < 0)
+ sysfatal("could not create runenormdata: %r");
+ param(&p, 10, 7);
+ size = mklkup(normfd, "decomp", mydecomp, &p);
+ fprint(2, "%s: %d\n", "decomp", size);
+
+ param(&p, 9, 7);
+ size = mklkup(normfd, "ccc", myccc, &p);
+ fprint(2, "%s: %d\n", "ccc", size);
+
+ mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
+ mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
+ mkrecomp(normfd);
+ close(normfd);
+
+ param(&p, 10, 6);
+ breakfd = create("runebreakdata", OWRITE, 0644);
+ if(breakfd < 0)
+ sysfatal("could not create runebreakdata: %r");
+ size = mklkup(breakfd, "break", mybreak, &p);
+ fprint(2, "%s: %d\n", "break", size);
+}
+
+enum {
+ FIELD_CODE,
+ FIELD_NAME,
+ FIELD_CATEGORY,
+ FIELD_COMBINING,
+ FIELD_BIDIR,
+ FIELD_DECOMP,
+ FIELD_DECIMAL_DIG,
+ FIELD_DIG,
+ FIELD_NUMERIC_VAL,
+ FIELD_MIRRORED,
+ FIELD_UNICODE_1_NAME,
+ FIELD_COMMENT,
+ FIELD_UPPER,
+ FIELD_LOWER,
+ FIELD_TITLE,
+ NFIELDS,
+};
+
+static int
+getunicodeline(Biobuf *in, char **fields)
+{
+ char *p;
+
+ if((p = Brdline(in, '\n')) == nil)
+ return 0;
+
+ p[Blinelen(in)-1] = '\0';
+
+ if (getfields(p, fields, NFIELDS + 1, 0, ";") != NFIELDS)
+ sysfatal("bad number of fields");
+
+ return 1;
+}
+
+static int
+estrtoul(char *s, int base)
+{
+ char *epr;
+ Rune code;
+
+ code = strtoul(s, &epr, base);
+ if(s == epr)
+ sysfatal("bad code point hex string");
+ return code;
+}
+
+enum {
+ OTHER,
+ Hebrew_Letter, Newline, Extend, Format,
+ Katakana, ALetter, MidLetter, MidNum,
+ MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+ PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+ L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+ EMOJIEX = 0xB0,
+};
+
+static void
+markbreak(void)
+{
+ Biobuf *b;
+ char *p, *dot;
+ int i, s, e;
+ uchar v;
+
+ b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load word breaks: %r");
+
+ while((p = Brdline(b, '\n')) != nil){
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#')
+ continue;
+ if((dot = strstr(p, "..")) != nil){
+ *dot = 0;
+ dot += 2;
+ s = estrtoul(p, 16);
+ e = estrtoul(dot, 16);
+ } else {
+ s = e = estrtoul(p, 16);
+ dot = p;
+ }
+ v = 0;
+ if(strstr(dot, "ExtendNumLet") != nil)
+ v = ExtendNumLet;
+ else if(strstr(dot, "Hebrew_Letter") != nil)
+ v = Hebrew_Letter;
+ else if(strstr(dot, "Newline") != nil)
+ v = Newline;
+ else if(strstr(dot, "Extend") != nil)
+ v = Extend;
+ else if(strstr(dot, "Format") != nil)
+ v = Format;
+ else if(strstr(dot, "Katakana") != nil)
+ v = Katakana;
+ else if(strstr(dot, "ALetter") != nil)
+ v = ALetter;
+ else if(strstr(dot, "MidLetter") != nil)
+ v = MidLetter;
+ else if(strstr(dot, "MidNum") != nil)
+ v = MidNum;
+ else if(strstr(dot, "Numeric") != nil)
+ v = Numeric;
+ else if(strstr(dot, "WSegSpace") != nil)
+ v = WSegSpace;
+ for(i = s; i <= e; i++)
+ mybreak[i] = v;
+ }
+ Bterm(b);
+ b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load Grapheme breaks: %r");
+
+ while((p = Brdline(b, '\n')) != nil){
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#')
+ continue;
+ if((dot = strstr(p, "..")) != nil){
+ *dot = 0;
+ dot += 2;
+ s = estrtoul(p, 16);
+ e = estrtoul(dot, 16);
+ } else {
+ s = e = estrtoul(p, 16);
+ dot = p;
+ }
+ v = 0;
+ if(strstr(dot, "; Prepend #") != nil)
+ v = PREPEND;
+ else if(strstr(dot, "; Control #") != nil)
+ v = CONTROL;
+ else if(strstr(dot, "; Extend #") != nil)
+ v = EXTEND;
+ else if(strstr(dot, "; Regional_Indicator #") != nil)
+ v = REGION;
+ else if(strstr(dot, "; SpacingMark #") != nil)
+ v = SPACEMK;
+ else if(strstr(dot, "; L #") != nil)
+ v = L;
+ else if(strstr(dot, "; V #") != nil)
+ v = V;
+ else if(strstr(dot, "; T #") != nil)
+ v = T;
+ else if(strstr(dot, "; LV #") != nil)
+ v = LV;
+ else if(strstr(dot, "; LVT #") != nil)
+ v = LVT;
+ for(i = s; i <= e; i++)
+ mybreak[i] |= v;
+ }
+ Bterm(b);
+
+ b = Bopen("/lib/ucd/emoji-data.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load emoji-data: %r");
+
+ while((p = Brdline(b, '\n')) != nil){
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#')
+ continue;
+ if((dot = strstr(p, "..")) != nil){
+ *dot = 0;
+ dot += 2;
+ s = estrtoul(p, 16);
+ e = estrtoul(dot, 16);
+ } else {
+ s = e = estrtoul(p, 16);
+ dot = p;
+ }
+ v = 0;
+ if(strstr(dot, "; Extended_Pictographic") != nil)
+ v = EMOJIEX;
+ for(i = s; i <= e; i++)
+ mybreak[i] |= v;
+ }
+ Bterm(b);
+}
+
+static void
+markexclusions(void)
+{
+ Biobuf *b;
+ char *p;
+ int i;
+ uint x;
+
+ b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load composition exclusions: %r");
+
+ while((p = Brdline(b, '\n')) != nil){
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#')
+ continue;
+ x = estrtoul(p, 16);
+ for(i = 0; i < nrecomp; i++){
+ if(myrecomp[i].val == x){
+ myrecomp[i].val = 0;
+ break;
+ }
+ }
+ if(i == nrecomp){
+ for(i = 0; i < nrecompext; i++){
+ if(recompext[i*3] == x){
+ recompext[i*3] = 0;
+ break;
+ }
+ }
+ }
+ }
+ Bterm(b);
+}
+
+void
+main(int, char)
+{
+ static char myisspace[NRUNES];
+ static char myisalpha[NRUNES];
+ static char myisdigit[NRUNES];
+ static char myisupper[NRUNES];
+ static char myislower[NRUNES];
+ static char myistitle[NRUNES];
+ Biobuf *in;
+ char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
+ char *p, *d;
+ int i, code, last;
+ int decomp[2], *ip;
+
+ in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
+ if(in == nil)
+ sysfatal("can't open UnicodeData.txt: %r");
+
+ for(i = 0; i < NRUNES; i++){
+ mytoupper[i] = -1;
+ mytolower[i] = -1;
+ mytotitle[i] = -1;
+ mydecomp[i] = 0;
+ myccc[i] = 0;
+ mybreak[i] = 0;
+ }
+
+ myisspace['\t'] = 1;
+ myisspace['\n'] = 1;
+ myisspace['\r'] = 1;
+ myisspace['\f'] = 1;
+ myisspace['\v'] = 1;
+ myisspace[0x85] = 1; /* control char, "next line" */
+ myisspace[0xfeff] = 1; /* zero-width non-break space */
+
+ last = -1;
+ nspecial = nrecomp = nrecompext = 0;
+ while(getunicodeline(in, fields)){
+ code = estrtoul(fields[FIELD_CODE], 16);
+ if (code >= NRUNES)
+ sysfatal("code-point value too big: %x", code);
+ if(code <= last)
+ sysfatal("bad code sequence: %x then %x", last, code);
+ last = code;
+
+ p = fields[FIELD_CATEGORY];
+ if(strstr(fields[FIELD_NAME], ", First>") != nil){
+ if(!getunicodeline(in, fields2))
+ sysfatal("range start at eof");
+ if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
+ sysfatal("range start not followed by range end");
+ last = estrtoul(fields2[FIELD_CODE], 16);
+ if(last <= code)
+ sysfatal("range out of sequence: %x then %x", code, last);
+ if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
+ sysfatal("range with mismatched category");
+ }
+
+ d = fields[FIELD_DECOMP];
+ if(strlen(d) > 0 && strstr(d, "<") == nil){
+ decomp[0] = estrtoul(d, 16);
+ d = strstr(d, " ");
+ if(d == nil){
+ /* singleton recompositions are verboden */
+ decomp[1] = 0;
+ if(decomp[0] > 0xFFFF){
+ //fprint(2, "case1 %X %X\n", code, decomp[0]);
+ ip = mydespecial + nspecial*3;
+ ip[0] = code;
+ ip[1] = decomp[0];
+ ip[2] = 0;
+ mydecomp[code] = (DSTART+nspecial)<<16;
+ nspecial++;
+ } else
+ mydecomp[code] = decomp[0]<<16;
+ } else {
+ d++;
+ decomp[1] = estrtoul(d, 16);
+ if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
+ //fprint(2, "case2 %X %X %X\n", code, decomp[0], decomp[1]);
+ ip = mydespecial + nspecial*3;
+ ip[0] = code;
+ ip[1] = decomp[0];
+ ip[2] = decomp[1];
+ mydecomp[code] = (DSTART+nspecial)<<16;
+ nspecial++;
+ ip = recompext + nrecompext*3;
+ ip[0] = code;
+ ip[1] = decomp[0];
+ ip[2] = decomp[1];
+ nrecompext++;
+ } else {
+ mydecomp[code] = decomp[0]<<16 | decomp[1];
+ myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0};
+ }
+ }
+ }
+
+ for (; code <= last; code++){
+ if(p[0] == 'L')
+ myisalpha[code] = 1;
+ if(p[0] == 'Z')
+ myisspace[code] = 1;
+
+ if(strcmp(p, "Lu") == 0)
+ myisupper[code] = 1;
+ if(strcmp(p, "Ll") == 0)
+ myislower[code] = 1;
+
+ if(strcmp(p, "Lt") == 0)
+ myistitle[code] = 1;
+
+ if(strcmp(p, "Nd") == 0)
+ myisdigit[code] = 1;
+
+ if(fields[FIELD_UPPER][0] != '\0')
+ mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16);
+
+ if(fields[FIELD_LOWER][0] != '\0')
+ mytolower[code] = estrtoul(fields[FIELD_LOWER], 16);
+
+ if(fields[FIELD_TITLE][0] != '\0')
+ mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16);
+
+ myccc[code] = estrtoul(fields[FIELD_COMBINING], 10);
+ }
+ }
+
+ Bterm(in);
+
+ markexclusions();
+
+ /*
+ * according to standard, if totitle(x) is not defined in ucd
+ * but toupper(x) is, then totitle is defined to be toupper(x)
+ */
+ for(i = 0; i < NRUNES; i++){
+ if(mytotitle[i] == -1
+ && mytoupper[i] != -1
+ && !myistitle[i])
+ mytotitle[i] = mytoupper[i];
+ }
+
+ /*
+ * A couple corrections:
+ * is*(to*(x)) should be true.
+ * restore undefined transformations.
+ * store offset instead of value, makes them sparse.
+ */
+ for(i = 0; i < NRUNES; i++){
+ if(mytoupper[i] != -1)
+ myisupper[mytoupper[i]] = 1;
+ else
+ mytoupper[i] = i;
+
+ if(mytolower[i] != -1)
+ myislower[mytolower[i]] = 1;
+ else
+ mytolower[i] = i;
+
+ if(mytotitle[i] != -1)
+ myistitle[mytotitle[i]] = 1;
+ else
+ mytotitle[i] = i;
+
+ mytoupper[i] = mytoupper[i] - i;
+ mytolower[i] = mytolower[i] - i;
+ mytotitle[i] = mytotitle[i] - i;
+ }
+
+ uchar b;
+ for(i = 0; i < NRUNES; i++){
+ b = 0;
+ if(myisspace[i])
+ b |= 1<<0;
+ if(myisalpha[i])
+ b |= 1<<1;
+ if(myisdigit[i])
+ b |= 1<<2;
+ if(myisupper[i])
+ b |= 1<<3;
+ if(myislower[i])
+ b |= 1<<4;
+ if(myistitle[i])
+ b |= 1<<5;
+
+ myismerged[i] = b;
+ }
+
+ markbreak();
+ mktables();
+ exits(nil);
+}
--- /dev/null
+++ b//sys/src/libc/port/runebreak.c
@@ -1,0 +1,149 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runebreakdata"
+
+enum {
+ OTHER,
+ Hebrew_Letter, Newline, Extend, Format,
+ Katakana, ALetter, MidLetter, MidNum,
+ MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+ PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+ L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+ EMOJIEX = 0xB0,
+
+ ZWJ = 0x200DU,
+ LINETAB = 0xB,
+};
+
+#define IS(x, y) ((x&0xf) == y)
+#define ISG(x, y) ((x&0xf0) == y)
+
+Rune*
+runegbreak(Rune *s)
+{
+ Rune l, r;
+ uchar lt, rt;
+ Rune *p;
+
+ p = s;
+ if((l = *p++) == 0)
+ return s;
+ if((r = *p) == 0)
+ return s;
+ lt = breaklkup(l);
+ rt = breaklkup(r);
+ if(l == '\r' && r == '\n')
+ goto Done;
+ if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
+ return p;
+ if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
+ return p;
+ if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
+ goto Done;
+ if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
+ goto Done;
+ if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
+ goto Done;
+ if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
+ goto Done;
+ if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
+ while(ISG(rt, EXTEND)){
+ p++;
+ if((r = *p) == 0)
+ return s;
+ rt = breaklkup(r);
+ }
+ if(r != ZWJ)
+ return p;
+ p++;
+ if((r = *p) == 0)
+ return s;
+ rt = breaklkup(r);
+ if(ISG(rt, EMOJIEX))
+ goto Done;
+ return p;
+ }
+ if(ISG(rt, EXTEND) || r == ZWJ)
+ goto Done;
+ if(ISG(lt, REGION) && ISG(rt, REGION))
+ goto Done;
+
+ return p;
+
+Done:
+ if(p[1] == 0)
+ return s;
+ return p + 1;
+}
+
+#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter))
+#define MNLQ(x) (IS(x, MidNumLet) || x == '\'')
+
+Rune*
+runewbreak(Rune *s)
+{
+ Rune l, r;
+ uchar lt, rt;
+ Rune *p;
+
+ p = s;
+ if((l = *p++) == 0)
+ return s;
+ if((r = *p) == 0)
+ return s;
+ lt = breaklkup(l);
+ rt = breaklkup(r);
+ if(l == '\r' && r == '\n')
+ goto Done;
+ if(l == '\r' || l == '\n' || l == LINETAB)
+ return p;
+ if(r == '\r' || r == '\n' || l == LINETAB)
+ return p;
+ if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+ goto Done;
+ if(IS(rt, Format) || IS(rt, Extend))
+ goto Done;
+ if(AH(lt)){
+ if(AH(rt))
+ goto Done;
+ if((IS(rt, MidLetter) || MNLQ(rt)) && p[1] != 0 && AH(breaklkup(p[1])))
+ goto Done;
+ if(IS(lt, Hebrew_Letter) && r == '\'')
+ goto Done;
+ if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]),
Hebrew_Letter))
+ goto Done;
+ if(IS(rt, Numeric))
+ goto Done;
+ }
+ if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+ goto Done;
+ if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && p[1] != 0 &&
IS(breaklkup(p[1]), Numeric))
+ goto Done;
+ if(IS(lt, Katakana) && IS(rt, Katakana))
+ goto Done;
+ if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+ if(IS(rt, ExtendNumLet))
+ goto Done;
+ if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+ goto Done;
+ if(ISG(lt, REGION)){
+ if(ISG(rt, REGION))
+ goto Done;
+ if(r != ZWJ)
+ return p;
+ p++;
+ if((r = *p) == 0)
+ return s;
+ rt = breaklkup(r);
+ if(ISG(rt, REGION))
+ goto Done;
+ }
+
+ return p;
+
+Done:
+ if(p[1] == 0)
+ return s;
+ return p + 1;
+}
--- /dev/null
+++ b//sys/src/libc/port/runeistype.c
@@ -1,0 +1,40 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runeistypedata"
+
+int
+isspacerune(Rune c)
+{
+ return (mergedlkup(c) & Lspace) == Lspace;
+}
+
+int
+isalpharune(Rune c)
+{
+ return (mergedlkup(c) & Lalpha) == Lalpha;
+}
+
+int
+isdigitrune(Rune c)
+{
+ return (mergedlkup(c) & Ldigit) == Ldigit;
+}
+
+int
+isupperrune(Rune c)
+{
+ return (mergedlkup(c) & Lupper) == Lupper;
+}
+
+int
+islowerrune(Rune c)
+{
+ return (mergedlkup(c) & Llower) == Llower;
+}
+
+int
+istitlerune(Rune c)
+{
+ return (mergedlkup(c) & Ltitle) == Ltitle;
+}
--- /dev/null
+++ b//sys/src/libc/port/runenorm.c
@@ -1,0 +1,328 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runenormdata"
+
+//Unicode Standard: Section 3.12 Conjoining Jamo Behavior
+enum {
+ SBase = 0xAC00,
+ LBase = 0x1100,
+ VBase = 0x1161,
+ TBase = 0x11A7,
+
+ LCount = 19,
+ VCount = 21,
+ TCount = 28,
+ NCount = VCount * TCount,
+ SCount = LCount * NCount,
+
+ LLast = LBase + LCount - 1,
+ SLast = SBase + SCount - 1,
+ VLast = VBase + VCount - 1,
+ TLast = TBase + TCount - 1,
+};
+
+void
+decomposerune(Rune c, Rune dst[2])
+{
+ uint x;
+
+ if(c >= SBase && c <= SLast){
+ c -= SBase;
+ x = c % TCount;
+ if(x){
+ dst[0] = SBase + ((c / TCount) * TCount);
+ dst[1] = TBase + x;
+ return;
+ }
+ dst[0] = LBase + (c / NCount);
+ dst[1] = VBase + ((c % NCount) / TCount);
+ return;
+ }
+ x = decomplkup(c);
+ if((x & (ushort)~0) != 0){
+ dst[0] = x>>16;
+ dst[1] = x & (ushort)~0;
+ return;
+ }
+ x >>= 16;
+ if(x >= 0xEEEE && x <0xF8FF){
+ memmove(dst, _decompexceptions[x - 0xEEEE], sizeof(Rune)*2);
+ return;
+ }
+ dst[0] = x;
+ dst[1] = 0;
+}
+
+Rune
+composerune(Rune r[2])
+{
+ uint x, y, *p, next;
+
+ if(r[0] >= LBase && r[0] <= LLast){
+ if(r[1] < VBase || r[1] > VLast)
+ return 0;
+ x = (r[0] - LBase) * NCount + (r[1] - VBase) * TCount;
+ return SBase + x;
+ }
+ if(r[0] >= SBase && r[0] <= SLast && (r[0] - SBase) % TCount == 0){
+ if(r[1] > TBase && r[1] <= TLast)
+ return r[0] + (r[1] - TBase);
+ return 0;
+ }
+ if(r[0] > (ushort)~0 || r[1] > (ushort)~0){
+ for(x = 0; x < nelem(_recompexceptions); x++)
+ if(r[0] == _recompexceptions[x][1] && r[1] == _recompexceptions[x][2])
+ return _recompexceptions[x][0];
+ return 0;
+ }
+ y = x = r[0]<<16 | r[1];
+ x ^= x >> 16;
+ x *= 0x21f0aaad;
+ x ^= x >> 15;
+ x *= 0xd35a2d97;
+ x ^= x >> 15;
+ p = _recompdata + (x%512)*2;
+ while(p[0] != y){
+ next = p[1]>>16;
+ if(!next)
+ return 0;
+ p = _recompcoll + (next-1)*2;
+ }
+ return p[1] & 0xFFFF;
+}
+
+int
+runeccc(Rune c)
+{
+ return ccclkup(c);
+}
+
+void
+runecccsort(Rune *a, int len)
+{
+ Rune r;
+ int i;
+ int fail;
+
+ do {
+ fail = 0;
+ for(i = 0; i < len - 1; i++){
+ if(runeccc(a[i]) > runeccc(a[i+1]) > 0){
+ r = a[i];
+ a[i] = a[i+1];
+ a[i + 1] = r;
+ fail = 1;
+ }
+ }
+ } while(fail);
+}
+
+char*
+fullutfnorm(char *s, int n)
+{
+ Rune r, peek;
+ char *p, *p2;
+
+ p = s;
+ if(fullrune(p, n) == 0)
+ return s;
+
+ p += chartorune(&r, p);
+ n -= (p - s);
+
+ if((r >= LBase && r <= LLast) || (r >= SBase && r <= SLast)){
+ do {
+ if(fullrune(p, n) == 0)
+ return s;
+ p2 = p + chartorune(&peek, p);
+ n -= (p2 - p);
+ p = p2;
+ } while(n > 0 && (peek >= VBase && peek <= VLast) || (peek > TBase
&& peek <= TLast));
+ if(n <= 0)
+ return s;
+ return p;
+ }
+
+ do {
+ if(fullrune(p, n) == 0)
+ return s;
+ p2 = p + chartorune(&peek, p);
+ n -= (p2 - p);
+ p = p2;
+ if(runeccc(peek) == 0)
+ return p;
+ } while(n > 0);
+
+ return s;
+}
+
+Rune*
+fullrunenorm(Rune *r, int n)
+{
+ Rune *e, *p;
+
+ p = r;
+ e = p + n;
+
+ if((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast)){
+ p++;
+ while(p < e && (*p >= VBase && *p <= VLast) || (*p > TBase && *p
<= TLast))
+ p++;
+
+ if(p >= e)
+ return r;
+ return p;
+ }
+
+ for(; p < e && p + 1 < e; p++)
+ if(runeccc(p[1]) == 0)
+ return p + 1;
+
+ return r;
+}
+
+int
+_runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose)
+{
+ Rune c, r[2], _stack[32];
+ Rune *p, *stack, *sp, *tp;
+ char *strp, *strstop;
+ Rune *rp, *rrp;
+ Rune *stop;
+ Rune peek;
+ int w, w2, size;
+ int mode;
+
+ if(src){
+ mode = 1;
+ p = src;
+ stop = dst + (max - 1);
+ strp = "";
+ strstop = nil;
+ } else {
+ mode = 0;
+ p = L"";
+ stop = nil;
+ strp = ssrc;
+ strstop = sdst + (max - 1);
+ }
+
+ stack = _stack + nelem(_stack)/2;
+ size = 0;
+ w = w2 = 0;
+ while(*strp || *p){
+ if(mode)
+ c = *p;
+ else
+ w = chartorune(&c, strp);
+
+ sp = stack - 1;
+ tp = stack;
+ decomposerune(c, r);
+ while(r[0] != 0){
+ c = r[0];
+ if(r[1] != 0){
+ *sp-- = r[1];
+ if(sp == _stack)
+ break;
+ }
+ decomposerune(c, r);
+ }
+
+ *sp = c;
+ if(mode)
+ peek = p[1];
+ else
+ w2 = chartorune(&peek, strp+w);
+
+ if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <=
SLast)){
+ while(peek != 0 && (peek >= VBase && peek <= VLast) || (peek > TBase &&
peek <= TLast)){
+ *tp++ = peek;
+ if(mode){
+ p++;
+ peek = p[1];
+ } else {
+ strp += w;
+ w = w2;
+ w2 = chartorune(&peek, strp+w);
+ }
+ if(tp == _stack + nelem(_stack))
+ break;
+ }
+ }
+ while(peek != 0 && runeccc(peek) != 0){
+ decomposerune(peek, r);
+ if(r[1] != 0){
+ if(tp+1 >= _stack + nelem(_stack))
+ break;
+ *tp++ = r[0];
+ *tp++ = r[1];
+ } else if(r[0] != 0)
+ *tp++ = r[0];
+ else
+ *tp++ = peek;
+
+ if(mode){
+ p++;
+ peek = p[1];
+ } else {
+ strp += w;
+ w = w2;
+ w2 = chartorune(&peek, strp+w);
+ }
+ if(tp == _stack + nelem(_stack))
+ break;
+ }
+ runecccsort(sp, tp - sp);
+
+ if(compose && runeccc(*sp) == 0){
+ for(rp = sp + 1; rp < tp; rp++){
+ r[0] = *sp;
+ r[1] = *rp;
+ c = composerune(r);
+ if(c != 0){
+ *sp = c;
+ for(rrp = rp; rrp > sp; rrp--)
+ *rrp = rrp[-1];
+ sp++;
+ } else while(rp + 1 < tp && runeccc(*rp) == runeccc(*(rp+1)))
+ rp++;
+ }
+ }
+
+ for(; sp < tp; sp++){
+ if(mode){
+ if(dst < stop)
+ *dst++ = *sp;
+ size++;
+ } else {
+ w2 = runelen(*sp);
+ if(sdst+w2 < strstop)
+ sdst += runetochar(sdst, sp);
+ size += w2;
+ }
+ }
+ if(mode)
+ p++;
+ else
+ strp += w;
+ }
+ if(mode)
+ *dst = 0;
+ else
+ *sdst = 0;
+ return size;
+}
+
+int
+runenorm(Rune *dst, Rune *src, int max, int compose)
+{
+ return _runenorm(dst, src, nil, nil, max, compose);
+}
+
+int
+utfnorm(char *dst, char *src, int max, int compose)
+{
+ return _runenorm(nil, nil, dst, src, max, compose);
+}
--- /dev/null
+++ b//sys/src/libc/port/runetotype.c
@@ -1,0 +1,22 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runetotypedata"
+
+Rune
+toupperrune(Rune c)
+{
+ return c + upperlkup(c);
+}
+
+Rune
+tolowerrune(Rune c)
+{
+ return c + lowerlkup(c);
+}
+
+Rune
+totitlerune(Rune c)
+{
+ return c + titlelkup(c);
+}
--- a//sys/src/libc/port/runetype.c
+++ /dev/null
@@ -1,1181 +1,0 @@
-#include <u.h>
-#include <libc.h>
-
-/*
- * alpha ranges -
- * only covers ranges not in lower||upper
- */
-static
-Rune _alpha2[] =
-{
- 0x00d8, 0x00f6, /* Ø - ö */
- 0x00f8, 0x01f5, /* ø - ǵ */
- 0x0250, 0x02a8, /* ɐ - ʨ */
- 0x038e, 0x03a1, /* Ύ - Ρ */
- 0x03a3, 0x03ce, /* Σ - ώ */
- 0x03d0, 0x03d6, /* ϐ - ϖ */
- 0x03e2, 0x03f3, /* Ϣ - ϳ */
- 0x0490, 0x04c4, /* Ґ - ӄ */
- 0x0561, 0x0587, /* ա - և */
- 0x05d0, 0x05ea, /* א - ת */
- 0x05f0, 0x05f2, /* װ - ײ */
- 0x0621, 0x063a, /* ء - غ */
- 0x0640, 0x064a, /* ـ - ي */
- 0x0671, 0x06b7, /* ٱ - ڷ */
- 0x06ba, 0x06be, /* ں - ھ */
- 0x06c0, 0x06ce, /* ۀ - ێ */
- 0x06d0, 0x06d3, /* ې - ۓ */
- 0x0905, 0x0939, /* अ - ह */
- 0x0958, 0x0961, /* क़ - ॡ */
- 0x0985, 0x098c, /* অ - ঌ */
- 0x098f, 0x0990, /* এ - ঐ */
- 0x0993, 0x09a8, /* ও - ন */
- 0x09aa, 0x09b0, /* প - র */
- 0x09b6, 0x09b9, /* শ - হ */
- 0x09dc, 0x09dd, /* ড় - ঢ় */
- 0x09df, 0x09e1, /* য় - ৡ */
- 0x09f0, 0x09f1, /* ৰ - ৱ */
- 0x0a05, 0x0a0a, /* ਅ - ਊ */
- 0x0a0f, 0x0a10, /* ਏ - ਐ */
- 0x0a13, 0x0a28, /* ਓ - ਨ */
- 0x0a2a, 0x0a30, /* ਪ - ਰ */
- 0x0a32, 0x0a33, /* ਲ - ਲ਼ */
- 0x0a35, 0x0a36, /* ਵ - ਸ਼ */
- 0x0a38, 0x0a39, /* ਸ - ਹ */
- 0x0a59, 0x0a5c, /* ਖ਼ - ੜ */
- 0x0a85, 0x0a8b, /* અ - ઋ */
- 0x0a8f, 0x0a91, /* એ - ઑ */
- 0x0a93, 0x0aa8, /* ઓ - ન */
- 0x0aaa, 0x0ab0, /* પ - ર */
- 0x0ab2, 0x0ab3, /* લ - ળ */
- 0x0ab5, 0x0ab9, /* વ - હ */
- 0x0b05, 0x0b0c, /* ଅ - ଌ */
- 0x0b0f, 0x0b10, /* ଏ - ଐ */
- 0x0b13, 0x0b28, /* ଓ - ନ */
- 0x0b2a, 0x0b30, /* ପ - ର */
- 0x0b32, 0x0b33, /* ଲ - ଳ */
- 0x0b36, 0x0b39, /* ଶ - ହ */
- 0x0b5c, 0x0b5d, /* ଡ଼ - ଢ଼ */
- 0x0b5f, 0x0b61, /* ୟ - ୡ */
- 0x0b85, 0x0b8a, /* அ - ஊ */
- 0x0b8e, 0x0b90, /* எ - ஐ */
- 0x0b92, 0x0b95, /* ஒ - க */
- 0x0b99, 0x0b9a, /* ங - ச */
- 0x0b9e, 0x0b9f, /* ஞ - ட */
- 0x0ba3, 0x0ba4, /* ண - த */
- 0x0ba8, 0x0baa, /* ந - ப */
- 0x0bae, 0x0bb5, /* ம - வ */
- 0x0bb7, 0x0bb9, /* ஷ - ஹ */
- 0x0c05, 0x0c0c, /* అ - ఌ */
- 0x0c0e, 0x0c10, /* ఎ - ఐ */
- 0x0c12, 0x0c28, /* ఒ - న */
- 0x0c2a, 0x0c33, /* ప - ళ */
- 0x0c35, 0x0c39, /* వ - హ */
- 0x0c60, 0x0c61, /* ౠ - ౡ */
- 0x0c85, 0x0c8c, /* ಅ - ಌ */
- 0x0c8e, 0x0c90, /* ಎ - ಐ */
- 0x0c92, 0x0ca8, /* ಒ - ನ */
- 0x0caa, 0x0cb3, /* ಪ - ಳ */
- 0x0cb5, 0x0cb9, /* ವ - ಹ */
- 0x0ce0, 0x0ce1, /* ೠ - ೡ */
- 0x0d05, 0x0d0c, /* അ - ഌ */
- 0x0d0e, 0x0d10, /* എ - ഐ */
- 0x0d12, 0x0d28, /* ഒ - ന */
- 0x0d2a, 0x0d39, /* പ - ഹ */
- 0x0d60, 0x0d61, /* ൠ - ൡ */
- 0x0e01, 0x0e30, /* ก - ะ */
- 0x0e32, 0x0e33, /* า - ำ */
- 0x0e40, 0x0e46, /* เ - ๆ */
- 0x0e5a, 0x0e5b, /* ๚ - ๛ */
- 0x0e81, 0x0e82, /* ກ - ຂ */
- 0x0e87, 0x0e88, /* ງ - ຈ */
- 0x0e94, 0x0e97, /* ດ - ທ */
- 0x0e99, 0x0e9f, /* ນ - ຟ */
- 0x0ea1, 0x0ea3, /* ມ - ຣ */
- 0x0eaa, 0x0eab, /* ສ - ຫ */
- 0x0ead, 0x0eae, /* ອ - ຮ */
- 0x0eb2, 0x0eb3, /* າ - ຳ */
- 0x0ec0, 0x0ec4, /* ເ - ໄ */
- 0x0edc, 0x0edd, /* ໜ - ໝ */
- 0x0f18, 0x0f19, /* ༘ - ༙ */
- 0x0f40, 0x0f47, /* ཀ - ཇ */
- 0x0f49, 0x0f69, /* ཉ - ཀྵ */
- 0x10d0, 0x10f6, /* ა - ჶ */
- 0x1100, 0x1159, /* ᄀ - ᅙ */
- 0x115f, 0x11a2, /* ᅟ - ᆢ */
- 0x11a8, 0x11f9, /* ᆨ - ᇹ */
- 0x1e00, 0x1e9b, /* Ḁ - ẛ */
- 0x1f50, 0x1f57, /* ὐ - ὗ */
- 0x1f80, 0x1fb4, /* ᾀ - ᾴ */
- 0x1fb6, 0x1fbc, /* ᾶ - ᾼ */
- 0x1fc2, 0x1fc4, /* ῂ - ῄ */
- 0x1fc6, 0x1fcc, /* ῆ - ῌ */
- 0x1fd0, 0x1fd3, /* ῐ - ΐ */
- 0x1fd6, 0x1fdb, /* ῖ - Ί */
- 0x1fe0, 0x1fec, /* ῠ - Ῥ */
- 0x1ff2, 0x1ff4, /* ῲ - ῴ */
- 0x1ff6, 0x1ffc, /* ῶ - ῼ */
- 0x210a, 0x2113, /* ℊ - ℓ */
- 0x2115, 0x211d, /* ℕ - ℝ */
- 0x2120, 0x2122, /* ℠ - ™ */
- 0x212a, 0x2131, /* K - ℱ */
- 0x2133, 0x2138, /* ℳ - ℸ */
- 0x3041, 0x3094, /* ぁ - ゔ */
- 0x30a1, 0x30fa, /* ァ - ヺ */
- 0x3105, 0x312c, /* ㄅ - ㄬ */
- 0x3131, 0x318e, /* ㄱ - ㆎ */
- 0x3192, 0x319f, /* ㆒ - ㆟ */
- 0x3260, 0x327b, /* ㉠ - ㉻ */
- 0x328a, 0x32b0, /* ㊊ - ㊰ */
- 0x32d0, 0x32fe, /* ㋐ - ㋾ */
- 0x3300, 0x3357, /* ㌀ - ㍗ */
- 0x3371, 0x3376, /* ㍱ - ㍶ */
- 0x337b, 0x3394, /* ㍻ - ㎔ */
- 0x3399, 0x339e, /* ㎙ - ㎞ */
- 0x33a9, 0x33ad, /* ㎩ - ㎭ */
- 0x33b0, 0x33c1, /* ㎰ - ㏁ */
- 0x33c3, 0x33c5, /* ㏃ - ㏅ */
- 0x33c7, 0x33d7, /* ㏇ - ㏗ */
- 0x33d9, 0x33dd, /* ㏙ - ㏝ */
- 0x4e00, 0x9fff, /* 一 - 鿿 */
- 0xac00, 0xd7a3, /* 가 - 힣 */
- 0xf900, 0xfb06, /* 豈 - st */
- 0xfb13, 0xfb17, /* ﬓ - ﬗ */
- 0xfb1f, 0xfb28, /* ײַ - ﬨ */
- 0xfb2a, 0xfb36, /* שׁ - זּ */
- 0xfb38, 0xfb3c, /* טּ - לּ */
- 0xfb40, 0xfb41, /* נּ - סּ */
- 0xfb43, 0xfb44, /* ףּ - פּ */
- 0xfb46, 0xfbb1, /* צּ - ﮱ */
- 0xfbd3, 0xfd3d, /* ﯓ - ﴽ */
- 0xfd50, 0xfd8f, /* ﵐ - ﶏ */
- 0xfd92, 0xfdc7, /* ﶒ - ﷇ */
- 0xfdf0, 0xfdf9, /* ﷰ - ﷹ */
- 0xfe70, 0xfe72, /* ﹰ - ﹲ */
- 0xfe76, 0xfefc, /* ﹶ - ﻼ */
- 0xff66, 0xff6f, /* ヲ - ッ */
- 0xff71, 0xff9d, /* ア - ン */
- 0xffa0, 0xffbe, /* ᅠ - ᄒ */
- 0xffc2, 0xffc7, /* ᅡ - ᅦ */
- 0xffca, 0xffcf, /* ᅧ - ᅬ */
- 0xffd2, 0xffd7, /* ᅭ - ᅲ */
- 0xffda, 0xffdc, /* ᅳ - ᅵ */
-};
-
-/*
- * alpha singlets -
- * only covers ranges not in lower||upper
- */
-static
-Rune _alpha1[] =
-{
- 0x00aa, /* ª */
- 0x00b5, /* µ */
- 0x00ba, /* º */
- 0x03da, /* Ϛ */
- 0x03dc, /* Ϝ */
- 0x03de, /* Ϟ */
- 0x03e0, /* Ϡ */
- 0x06d5, /* ە */
- 0x09b2, /* ল */
- 0x0a5e, /* ਫ਼ */
- 0x0a8d, /* ઍ */
- 0x0ae0, /* ૠ */
- 0x0b9c, /* ஜ */
- 0x0cde, /* ೞ */
- 0x0e4f, /* ๏ */
- 0x0e84, /* ຄ */
- 0x0e8a, /* ຊ */
- 0x0e8d, /* ຍ */
- 0x0ea5, /* ລ */
- 0x0ea7, /* ວ */
- 0x0eb0, /* ະ */
- 0x0ebd, /* ຽ */
- 0x1fbe, /* ι */
- 0x207f, /* ⁿ */
- 0x20a8, /* ₨ */
- 0x2102, /* ℂ */
- 0x2107, /* ℇ */
- 0x2124, /* ℤ */
- 0x2126, /* Ω */
- 0x2128, /* ℨ */
- 0xfb3e, /* מּ */
- 0xfe74, /* ﹴ */
-};
-
-/*
- * space ranges
- */
-static
-Rune _space2[] =
-{
- 0x0009, 0x000a, /* tab and newline */
- 0x0020, 0x0020, /* space */
- 0x0085, 0x0085,
- 0x00a0, 0x00a0, /*   */
- 0x1680, 0x1680,
- 0x180e, 0x180e,
- 0x2000, 0x200b, /*   - ​ */
- 0x2028, 0x2029, /* 
 - 
 */
- 0x202f, 0x202f,
- 0x205f, 0x205f,
- 0x3000, 0x3000, /*   */
- 0xfeff, 0xfeff, /*  */
-};
-
-/*
- * lower case ranges
- * 3rd col is conversion excess 500
- */
-static
-Rune _toupper2[] =
-{
- 0x0061, 0x007a, 468, /* a-z A-Z */
- 0x00e0, 0x00f6, 468, /* à-ö À-Ö */
- 0x00f8, 0x00fe, 468, /* ø-þ Ø-Þ */
- 0x0256, 0x0257, 295, /* ɖ-ɗ Ɖ-Ɗ */
- 0x0258, 0x0259, 298, /* ɘ-ə Ǝ-Ə */
- 0x028a, 0x028b, 283, /* ʊ-ʋ Ʊ-Ʋ */
- 0x03ad, 0x03af, 463, /* έ-ί Έ-Ί */
- 0x03b1, 0x03c1, 468, /* α-ρ Α-Ρ */
- 0x03c3, 0x03cb, 468, /* σ-ϋ Σ-Ϋ */
- 0x03cd, 0x03ce, 437, /* ύ-ώ Ύ-Ώ */
- 0x0430, 0x044f, 468, /* а-я А-Я */
- 0x0451, 0x045c, 420, /* ё-ќ Ё-Ќ */
- 0x045e, 0x045f, 420, /* ў-џ Ў-Џ */
- 0x0561, 0x0586, 452, /* ա-ֆ Ա-Ֆ */
- 0x1f00, 0x1f07, 508, /* ἀ-ἇ Ἀ-Ἇ */
- 0x1f10, 0x1f15, 508, /* ἐ-ἕ Ἐ-Ἕ */
- 0x1f20, 0x1f27, 508, /* ἠ-ἧ Ἠ-Ἧ */
- 0x1f30, 0x1f37, 508, /* ἰ-ἷ Ἰ-Ἷ */
- 0x1f40, 0x1f45, 508, /* ὀ-ὅ Ὀ-Ὅ */
- 0x1f60, 0x1f67, 508, /* ὠ-ὧ Ὠ-Ὧ */
- 0x1f70, 0x1f71, 574, /* ὰ-ά Ὰ-Ά */
- 0x1f72, 0x1f75, 586, /* ὲ-ή Ὲ-Ή */
- 0x1f76, 0x1f77, 600, /* ὶ-ί Ὶ-Ί */
- 0x1f78, 0x1f79, 628, /* ὸ-ό Ὸ-Ό */
- 0x1f7a, 0x1f7b, 612, /* ὺ-ύ Ὺ-Ύ */
- 0x1f7c, 0x1f7d, 626, /* ὼ-ώ Ὼ-Ώ */
- 0x1f80, 0x1f87, 508, /* ᾀ-ᾇ ᾈ-ᾏ */
- 0x1f90, 0x1f97, 508, /* ᾐ-ᾗ ᾘ-ᾟ */
- 0x1fa0, 0x1fa7, 508, /* ᾠ-ᾧ ᾨ-ᾯ */
- 0x1fb0, 0x1fb1, 508, /* ᾰ-ᾱ Ᾰ-Ᾱ */
- 0x1fd0, 0x1fd1, 508, /* ῐ-ῑ Ῐ-Ῑ */
- 0x1fe0, 0x1fe1, 508, /* ῠ-ῡ Ῠ-Ῡ */
- 0x2170, 0x217f, 484, /* ⅰ-ⅿ Ⅰ-Ⅿ */
- 0x24d0, 0x24e9, 474, /* ⓐ-ⓩ Ⓐ-Ⓩ */
- 0xff41, 0xff5a, 468, /* a-z A-Z */
-};
-
-/*
- * lower case singlets
- * 2nd col is conversion excess 500
- */
-static
-Rune _toupper1[] =
-{
- 0x00ff, 621, /* ÿ Ÿ */
- 0x0101, 499, /* ā Ā */
- 0x0103, 499, /* ă Ă */
- 0x0105, 499, /* ą Ą */
- 0x0107, 499, /* ć Ć */
- 0x0109, 499, /* ĉ Ĉ */
- 0x010b, 499, /* ċ Ċ */
- 0x010d, 499, /* č Č */
- 0x010f, 499, /* ď Ď */
- 0x0111, 499, /* đ Đ */
- 0x0113, 499, /* ē Ē */
- 0x0115, 499, /* ĕ Ĕ */
- 0x0117, 499, /* ė Ė */
- 0x0119, 499, /* ę Ę */
- 0x011b, 499, /* ě Ě */
- 0x011d, 499, /* ĝ Ĝ */
- 0x011f, 499, /* ğ Ğ */
- 0x0121, 499, /* ġ Ġ */
- 0x0123, 499, /* ģ Ģ */
- 0x0125, 499, /* ĥ Ĥ */
- 0x0127, 499, /* ħ Ħ */
- 0x0129, 499, /* ĩ Ĩ */
- 0x012b, 499, /* ī Ī */
- 0x012d, 499, /* ĭ Ĭ */
- 0x012f, 499, /* į Į */
- 0x0131, 268, /* ı I */
- 0x0133, 499, /* ij IJ */
- 0x0135, 499, /* ĵ Ĵ */
- 0x0137, 499, /* ķ Ķ */
- 0x013a, 499, /* ĺ Ĺ */
- 0x013c, 499, /* ļ Ļ */
- 0x013e, 499, /* ľ Ľ */
- 0x0140, 499, /* ŀ Ŀ */
- 0x0142, 499, /* ł Ł */
- 0x0144, 499, /* ń Ń */
- 0x0146, 499, /* ņ Ņ */
- 0x0148, 499, /* ň Ň */
- 0x014b, 499, /* ŋ Ŋ */
- 0x014d, 499, /* ō Ō */
- 0x014f, 499, /* ŏ Ŏ */
- 0x0151, 499, /* ő Ő */
- 0x0153, 499, /* œ Œ */
- 0x0155, 499, /* ŕ Ŕ */
- 0x0157, 499, /* ŗ Ŗ */
- 0x0159, 499, /* ř Ř */
- 0x015b, 499, /* ś Ś */
- 0x015d, 499, /* ŝ Ŝ */
- 0x015f, 499, /* ş Ş */
- 0x0161, 499, /* š Š */
- 0x0163, 499, /* ţ Ţ */
- 0x0165, 499, /* ť Ť */
- 0x0167, 499, /* ŧ Ŧ */
- 0x0169, 499, /* ũ Ũ */
- 0x016b, 499, /* ū Ū */
- 0x016d, 499, /* ŭ Ŭ */
- 0x016f, 499, /* ů Ů */
- 0x0171, 499, /* ű Ű */
- 0x0173, 499, /* ų Ų */
- 0x0175, 499, /* ŵ Ŵ */
- 0x0177, 499, /* ŷ Ŷ */
- 0x017a, 499, /* ź Ź */
- 0x017c, 499, /* ż Ż */
- 0x017e, 499, /* ž Ž */
- 0x017f, 200, /* ſ S */
- 0x0183, 499, /* ƃ Ƃ */
- 0x0185, 499, /* ƅ Ƅ */
- 0x0188, 499, /* ƈ Ƈ */
- 0x018c, 499, /* ƌ Ƌ */
- 0x0192, 499, /* ƒ Ƒ */
- 0x0199, 499, /* ƙ Ƙ */
- 0x01a1, 499, /* ơ Ơ */
- 0x01a3, 499, /* ƣ Ƣ */
- 0x01a5, 499, /* ƥ Ƥ */
- 0x01a8, 499, /* ƨ Ƨ */
- 0x01ad, 499, /* ƭ Ƭ */
- 0x01b0, 499, /* ư Ư */
- 0x01b4, 499, /* ƴ Ƴ */
- 0x01b6, 499, /* ƶ Ƶ */
- 0x01b9, 499, /* ƹ Ƹ */
- 0x01bd, 499, /* ƽ Ƽ */
- 0x01c5, 499, /* Dž DŽ */
- 0x01c6, 498, /* dž DŽ */
- 0x01c8, 499, /* Lj LJ */
- 0x01c9, 498, /* lj LJ */
- 0x01cb, 499, /* Nj NJ */
- 0x01cc, 498, /* nj NJ */
- 0x01ce, 499, /* ǎ Ǎ */
- 0x01d0, 499, /* ǐ Ǐ */
- 0x01d2, 499, /* ǒ Ǒ */
- 0x01d4, 499, /* ǔ Ǔ */
- 0x01d6, 499, /* ǖ Ǖ */
- 0x01d8, 499, /* ǘ Ǘ */
- 0x01da, 499, /* ǚ Ǚ */
- 0x01dc, 499, /* ǜ Ǜ */
- 0x01df, 499, /* ǟ Ǟ */
- 0x01e1, 499, /* ǡ Ǡ */
- 0x01e3, 499, /* ǣ Ǣ */
- 0x01e5, 499, /* ǥ Ǥ */
- 0x01e7, 499, /* ǧ Ǧ */
- 0x01e9, 499, /* ǩ Ǩ */
- 0x01eb, 499, /* ǫ Ǫ */
- 0x01ed, 499, /* ǭ Ǭ */
- 0x01ef, 499, /* ǯ Ǯ */
- 0x01f2, 499, /* Dz DZ */
- 0x01f3, 498, /* dz DZ */
- 0x01f5, 499, /* ǵ Ǵ */
- 0x01fb, 499, /* ǻ Ǻ */
- 0x01fd, 499, /* ǽ Ǽ */
- 0x01ff, 499, /* ǿ Ǿ */
- 0x0201, 499, /* ȁ Ȁ */
- 0x0203, 499, /* ȃ Ȃ */
- 0x0205, 499, /* ȅ Ȅ */
- 0x0207, 499, /* ȇ Ȇ */
- 0x0209, 499, /* ȉ Ȉ */
- 0x020b, 499, /* ȋ Ȋ */
- 0x020d, 499, /* ȍ Ȍ */
- 0x020f, 499, /* ȏ Ȏ */
- 0x0211, 499, /* ȑ Ȑ */
- 0x0213, 499, /* ȓ Ȓ */
- 0x0215, 499, /* ȕ Ȕ */
- 0x0217, 499, /* ȗ Ȗ */
- 0x0253, 290, /* ɓ Ɓ */
- 0x0254, 294, /* ɔ Ɔ */
- 0x025b, 297, /* ɛ Ɛ */
- 0x0260, 295, /* ɠ Ɠ */
- 0x0263, 293, /* ɣ Ɣ */
- 0x0268, 291, /* ɨ Ɨ */
- 0x0269, 289, /* ɩ Ɩ */
- 0x026f, 289, /* ɯ Ɯ */
- 0x0272, 287, /* ɲ Ɲ */
- 0x0283, 282, /* ʃ Ʃ */
- 0x0288, 282, /* ʈ Ʈ */
- 0x0292, 281, /* ʒ Ʒ */
- 0x03ac, 462, /* ά Ά */
- 0x03cc, 436, /* ό Ό */
- 0x03d0, 438, /* ϐ Β */
- 0x03d1, 443, /* ϑ Θ */
- 0x03d5, 453, /* ϕ Φ */
- 0x03d6, 446, /* ϖ Π */
- 0x03e3, 499, /* ϣ Ϣ */
- 0x03e5, 499, /* ϥ Ϥ */
- 0x03e7, 499, /* ϧ Ϧ */
- 0x03e9, 499, /* ϩ Ϩ */
- 0x03eb, 499, /* ϫ Ϫ */
- 0x03ed, 499, /* ϭ Ϭ */
- 0x03ef, 499, /* ϯ Ϯ */
- 0x03f0, 414, /* ϰ Κ */
- 0x03f1, 420, /* ϱ Ρ */
- 0x0461, 499, /* ѡ Ѡ */
- 0x0463, 499, /* ѣ Ѣ */
- 0x0465, 499, /* ѥ Ѥ */
- 0x0467, 499, /* ѧ Ѧ */
- 0x0469, 499, /* ѩ Ѩ */
- 0x046b, 499, /* ѫ Ѫ */
- 0x046d, 499, /* ѭ Ѭ */
- 0x046f, 499, /* ѯ Ѯ */
- 0x0471, 499, /* ѱ Ѱ */
- 0x0473, 499, /* ѳ Ѳ */
- 0x0475, 499, /* ѵ Ѵ */
- 0x0477, 499, /* ѷ Ѷ */
- 0x0479, 499, /* ѹ Ѹ */
- 0x047b, 499, /* ѻ Ѻ */
- 0x047d, 499, /* ѽ Ѽ */
- 0x047f, 499, /* ѿ Ѿ */
- 0x0481, 499, /* ҁ Ҁ */
- 0x0491, 499, /* ґ Ґ */
- 0x0493, 499, /* ғ Ғ */
- 0x0495, 499, /* ҕ Ҕ */
- 0x0497, 499, /* җ Җ */
- 0x0499, 499, /* ҙ Ҙ */
- 0x049b, 499, /* қ Қ */
- 0x049d, 499, /* ҝ Ҝ */
- 0x049f, 499, /* ҟ Ҟ */
- 0x04a1, 499, /* ҡ Ҡ */
- 0x04a3, 499, /* ң Ң */
- 0x04a5, 499, /* ҥ Ҥ */
- 0x04a7, 499, /* ҧ Ҧ */
- 0x04a9, 499, /* ҩ Ҩ */
- 0x04ab, 499, /* ҫ Ҫ */
- 0x04ad, 499, /* ҭ Ҭ */
- 0x04af, 499, /* ү Ү */
- 0x04b1, 499, /* ұ Ұ */
- 0x04b3, 499, /* ҳ Ҳ */
- 0x04b5, 499, /* ҵ Ҵ */
- 0x04b7, 499, /* ҷ Ҷ */
- 0x04b9, 499, /* ҹ Ҹ */
- 0x04bb, 499, /* һ Һ */
- 0x04bd, 499, /* ҽ Ҽ */
- 0x04bf, 499, /* ҿ Ҿ */
- 0x04c2, 499, /* ӂ Ӂ */
- 0x04c4, 499, /* ӄ Ӄ */
- 0x04c8, 499, /* ӈ Ӈ */
- 0x04cc, 499, /* ӌ Ӌ */
- 0x04d1, 499, /* ӑ Ӑ */
- 0x04d3, 499, /* ӓ Ӓ */
- 0x04d5, 499, /* ӕ Ӕ */
- 0x04d7, 499, /* ӗ Ӗ */
- 0x04d9, 499, /* ә Ә */
- 0x04db, 499, /* ӛ Ӛ */
- 0x04dd, 499, /* ӝ Ӝ */
- 0x04df, 499, /* ӟ Ӟ */
- 0x04e1, 499, /* ӡ Ӡ */
- 0x04e3, 499, /* ӣ Ӣ */
- 0x04e5, 499, /* ӥ Ӥ */
- 0x04e7, 499, /* ӧ Ӧ */
- 0x04e9, 499, /* ө Ө */
- 0x04eb, 499, /* ӫ Ӫ */
- 0x04ef, 499, /* ӯ Ӯ */
- 0x04f1, 499, /* ӱ Ӱ */
- 0x04f3, 499, /* ӳ Ӳ */
- 0x04f5, 499, /* ӵ Ӵ */
- 0x04f9, 499, /* ӹ Ӹ */
- 0x1e01, 499, /* ḁ Ḁ */
- 0x1e03, 499, /* ḃ Ḃ */
- 0x1e05, 499, /* ḅ Ḅ */
- 0x1e07, 499, /* ḇ Ḇ */
- 0x1e09, 499, /* ḉ Ḉ */
- 0x1e0b, 499, /* ḋ Ḋ */
- 0x1e0d, 499, /* ḍ Ḍ */
- 0x1e0f, 499, /* ḏ Ḏ */
- 0x1e11, 499, /* ḑ Ḑ */
- 0x1e13, 499, /* ḓ Ḓ */
- 0x1e15, 499, /* ḕ Ḕ */
- 0x1e17, 499, /* ḗ Ḗ */
- 0x1e19, 499, /* ḙ Ḙ */
- 0x1e1b, 499, /* ḛ Ḛ */
- 0x1e1d, 499, /* ḝ Ḝ */
- 0x1e1f, 499, /* ḟ Ḟ */
- 0x1e21, 499, /* ḡ Ḡ */
- 0x1e23, 499, /* ḣ Ḣ */
- 0x1e25, 499, /* ḥ Ḥ */
- 0x1e27, 499, /* ḧ Ḧ */
- 0x1e29, 499, /* ḩ Ḩ */
- 0x1e2b, 499, /* ḫ Ḫ */
- 0x1e2d, 499, /* ḭ Ḭ */
- 0x1e2f, 499, /* ḯ Ḯ */
- 0x1e31, 499, /* ḱ Ḱ */
- 0x1e33, 499, /* ḳ Ḳ */
- 0x1e35, 499, /* ḵ Ḵ */
- 0x1e37, 499, /* ḷ Ḷ */
- 0x1e39, 499, /* ḹ Ḹ */
- 0x1e3b, 499, /* ḻ Ḻ */
- 0x1e3d, 499, /* ḽ Ḽ */
- 0x1e3f, 499, /* ḿ Ḿ */
- 0x1e41, 499, /* ṁ Ṁ */
- 0x1e43, 499, /* ṃ Ṃ */
- 0x1e45, 499, /* ṅ Ṅ */
- 0x1e47, 499, /* ṇ Ṇ */
- 0x1e49, 499, /* ṉ Ṉ */
- 0x1e4b, 499, /* ṋ Ṋ */
- 0x1e4d, 499, /* ṍ Ṍ */
- 0x1e4f, 499, /* ṏ Ṏ */
- 0x1e51, 499, /* ṑ Ṑ */
- 0x1e53, 499, /* ṓ Ṓ */
- 0x1e55, 499, /* ṕ Ṕ */
- 0x1e57, 499, /* ṗ Ṗ */
- 0x1e59, 499, /* ṙ Ṙ */
- 0x1e5b, 499, /* ṛ Ṛ */
- 0x1e5d, 499, /* ṝ Ṝ */
- 0x1e5f, 499, /* ṟ Ṟ */
- 0x1e61, 499, /* ṡ Ṡ */
- 0x1e63, 499, /* ṣ Ṣ */
- 0x1e65, 499, /* ṥ Ṥ */
- 0x1e67, 499, /* ṧ Ṧ */
- 0x1e69, 499, /* ṩ Ṩ */
- 0x1e6b, 499, /* ṫ Ṫ */
- 0x1e6d, 499, /* ṭ Ṭ */
- 0x1e6f, 499, /* ṯ Ṯ */
- 0x1e71, 499, /* ṱ Ṱ */
- 0x1e73, 499, /* ṳ Ṳ */
- 0x1e75, 499, /* ṵ Ṵ */
- 0x1e77, 499, /* ṷ Ṷ */
- 0x1e79, 499, /* ṹ Ṹ */
- 0x1e7b, 499, /* ṻ Ṻ */
- 0x1e7d, 499, /* ṽ Ṽ */
- 0x1e7f, 499, /* ṿ Ṿ */
- 0x1e81, 499, /* ẁ Ẁ */
- 0x1e83, 499, /* ẃ Ẃ */
- 0x1e85, 499, /* ẅ Ẅ */
- 0x1e87, 499, /* ẇ Ẇ */
- 0x1e89, 499, /* ẉ Ẉ */
- 0x1e8b, 499, /* ẋ Ẋ */
- 0x1e8d, 499, /* ẍ Ẍ */
- 0x1e8f, 499, /* ẏ Ẏ */
- 0x1e91, 499, /* ẑ Ẑ */
- 0x1e93, 499, /* ẓ Ẓ */
- 0x1e95, 499, /* ẕ Ẕ */
- 0x1ea1, 499, /* ạ Ạ */
- 0x1ea3, 499, /* ả Ả */
- 0x1ea5, 499, /* ấ Ấ */
- 0x1ea7, 499, /* ầ Ầ */
- 0x1ea9, 499, /* ẩ Ẩ */
- 0x1eab, 499, /* ẫ Ẫ */
- 0x1ead, 499, /* ậ Ậ */
- 0x1eaf, 499, /* ắ Ắ */
- 0x1eb1, 499, /* ằ Ằ */
- 0x1eb3, 499, /* ẳ Ẳ */
- 0x1eb5, 499, /* ẵ Ẵ */
- 0x1eb7, 499, /* ặ Ặ */
- 0x1eb9, 499, /* ẹ Ẹ */
- 0x1ebb, 499, /* ẻ Ẻ */
- 0x1ebd, 499, /* ẽ Ẽ */
- 0x1ebf, 499, /* ế Ế */
- 0x1ec1, 499, /* ề Ề */
- 0x1ec3, 499, /* ể Ể */
- 0x1ec5, 499, /* ễ Ễ */
- 0x1ec7, 499, /* ệ Ệ */
- 0x1ec9, 499, /* ỉ Ỉ */
- 0x1ecb, 499, /* ị Ị */
- 0x1ecd, 499, /* ọ Ọ */
- 0x1ecf, 499, /* ỏ Ỏ */
- 0x1ed1, 499, /* ố Ố */
- 0x1ed3, 499, /* ồ Ồ */
- 0x1ed5, 499, /* ổ Ổ */
- 0x1ed7, 499, /* ỗ Ỗ */
- 0x1ed9, 499, /* ộ Ộ */
- 0x1edb, 499, /* ớ Ớ */
- 0x1edd, 499, /* ờ Ờ */
- 0x1edf, 499, /* ở Ở */
- 0x1ee1, 499, /* ỡ Ỡ */
- 0x1ee3, 499, /* ợ Ợ */
- 0x1ee5, 499, /* ụ Ụ */
- 0x1ee7, 499, /* ủ Ủ */
- 0x1ee9, 499, /* ứ Ứ */
- 0x1eeb, 499, /* ừ Ừ */
- 0x1eed, 499, /* ử Ử */
- 0x1eef, 499, /* ữ Ữ */
- 0x1ef1, 499, /* ự Ự */
- 0x1ef3, 499, /* ỳ Ỳ */
- 0x1ef5, 499, /* ỵ Ỵ */
- 0x1ef7, 499, /* ỷ Ỷ */
- 0x1ef9, 499, /* ỹ Ỹ */
- 0x1f51, 508, /* ὑ Ὑ */
- 0x1f53, 508, /* ὓ Ὓ */
- 0x1f55, 508, /* ὕ Ὕ */
- 0x1f57, 508, /* ὗ Ὗ */
- 0x1fb3, 509, /* ᾳ ᾼ */
- 0x1fc3, 509, /* ῃ ῌ */
- 0x1fe5, 507, /* ῥ Ῥ */
- 0x1ff3, 509, /* ῳ ῼ */
-};
-
-static Rune __isdigitr[] = {
- 0x0030, 0x0039,
- 0x0660, 0x0669,
- 0x06f0, 0x06f9,
- 0x07c0, 0x07c9,
- 0x0966, 0x096f,
- 0x09e6, 0x09ef,
- 0x0a66, 0x0a6f,
- 0x0ae6, 0x0aef,
- 0x0b66, 0x0b6f,
- 0x0be6, 0x0bef,
- 0x0c66, 0x0c6f,
- 0x0ce6, 0x0cef,
- 0x0d66, 0x0d6f,
- 0x0e50, 0x0e59,
- 0x0ed0, 0x0ed9,
- 0x0f20, 0x0f29,
- 0x1040, 0x1049,
- 0x17e0, 0x17e9,
- 0x1810, 0x1819,
- 0x1946, 0x194f,
- 0x19d0, 0x19d9,
- 0x1b50, 0x1b59,
- 0xff10, 0xff19,
- 0x104a0, 0x104a9,
- 0x1d7ce, 0x1d7ff,
-};
-
-/*
- * upper case ranges
- * 3rd col is conversion excess 500
- */
-static
-Rune _tolower2[] =
-{
- 0x0041, 0x005a, 532, /* A-Z a-z */
- 0x00c0, 0x00d6, 532, /* À-Ö à-ö */
- 0x00d8, 0x00de, 532, /* Ø-Þ ø-þ */
- 0x0189, 0x018a, 705, /* Ɖ-Ɗ ɖ-ɗ */
- 0x018e, 0x018f, 702, /* Ǝ-Ə ɘ-ə */
- 0x01b1, 0x01b2, 717, /* Ʊ-Ʋ ʊ-ʋ */
- 0x0388, 0x038a, 537, /* Έ-Ί έ-ί */
- 0x038e, 0x038f, 563, /* Ύ-Ώ ύ-ώ */
- 0x0391, 0x03a1, 532, /* Α-Ρ α-ρ */
- 0x03a3, 0x03ab, 532, /* Σ-Ϋ σ-ϋ */
- 0x0401, 0x040c, 580, /* Ё-Ќ ё-ќ */
- 0x040e, 0x040f, 580, /* Ў-Џ ў-џ */
- 0x0410, 0x042f, 532, /* А-Я а-я */
- 0x0531, 0x0556, 548, /* Ա-Ֆ ա-ֆ */
- 0x10a0, 0x10c5, 548, /* Ⴀ-Ⴥ ა-ჵ */
- 0x1f08, 0x1f0f, 492, /* Ἀ-Ἇ ἀ-ἇ */
- 0x1f18, 0x1f1d, 492, /* Ἐ-Ἕ ἐ-ἕ */
- 0x1f28, 0x1f2f, 492, /* Ἠ-Ἧ ἠ-ἧ */
- 0x1f38, 0x1f3f, 492, /* Ἰ-Ἷ ἰ-ἷ */
- 0x1f48, 0x1f4d, 492, /* Ὀ-Ὅ ὀ-ὅ */
- 0x1f68, 0x1f6f, 492, /* Ὠ-Ὧ ὠ-ὧ */
- 0x1f88, 0x1f8f, 492, /* ᾈ-ᾏ ᾀ-ᾇ */
- 0x1f98, 0x1f9f, 492, /* ᾘ-ᾟ ᾐ-ᾗ */
- 0x1fa8, 0x1faf, 492, /* ᾨ-ᾯ ᾠ-ᾧ */
- 0x1fb8, 0x1fb9, 492, /* Ᾰ-Ᾱ ᾰ-ᾱ */
- 0x1fba, 0x1fbb, 426, /* Ὰ-Ά ὰ-ά */
- 0x1fc8, 0x1fcb, 414, /* Ὲ-Ή ὲ-ή */
- 0x1fd8, 0x1fd9, 492, /* Ῐ-Ῑ ῐ-ῑ */
- 0x1fda, 0x1fdb, 400, /* Ὶ-Ί ὶ-ί */
- 0x1fe8, 0x1fe9, 492, /* Ῠ-Ῡ ῠ-ῡ */
- 0x1fea, 0x1feb, 388, /* Ὺ-Ύ ὺ-ύ */
- 0x1ff8, 0x1ff9, 372, /* Ὸ-Ό ὸ-ό */
- 0x1ffa, 0x1ffb, 374, /* Ὼ-Ώ ὼ-ώ */
- 0x2160, 0x216f, 516, /* Ⅰ-Ⅿ ⅰ-ⅿ */
- 0x24b6, 0x24cf, 526, /* Ⓐ-Ⓩ ⓐ-ⓩ */
- 0xff21, 0xff3a, 532, /* A-Z a-z */
-};
-
-/*
- * upper case singlets
- * 2nd col is conversion excess 500
- */
-static
-Rune _tolower1[] =
-{
- 0x0100, 501, /* Ā ā */
- 0x0102, 501, /* Ă ă */
- 0x0104, 501, /* Ą ą */
- 0x0106, 501, /* Ć ć */
- 0x0108, 501, /* Ĉ ĉ */
- 0x010a, 501, /* Ċ ċ */
- 0x010c, 501, /* Č č */
- 0x010e, 501, /* Ď ď */
- 0x0110, 501, /* Đ đ */
- 0x0112, 501, /* Ē ē */
- 0x0114, 501, /* Ĕ ĕ */
- 0x0116, 501, /* Ė ė */
- 0x0118, 501, /* Ę ę */
- 0x011a, 501, /* Ě ě */
- 0x011c, 501, /* Ĝ ĝ */
- 0x011e, 501, /* Ğ ğ */
- 0x0120, 501, /* Ġ ġ */
- 0x0122, 501, /* Ģ ģ */
- 0x0124, 501, /* Ĥ ĥ */
- 0x0126, 501, /* Ħ ħ */
- 0x0128, 501, /* Ĩ ĩ */
- 0x012a, 501, /* Ī ī */
- 0x012c, 501, /* Ĭ ĭ */
- 0x012e, 501, /* Į į */
- 0x0130, 301, /* İ i */
- 0x0132, 501, /* IJ ij */
- 0x0134, 501, /* Ĵ ĵ */
- 0x0136, 501, /* Ķ ķ */
- 0x0139, 501, /* Ĺ ĺ */
- 0x013b, 501, /* Ļ ļ */
- 0x013d, 501, /* Ľ ľ */
- 0x013f, 501, /* Ŀ ŀ */
- 0x0141, 501, /* Ł ł */
- 0x0143, 501, /* Ń ń */
- 0x0145, 501, /* Ņ ņ */
- 0x0147, 501, /* Ň ň */
- 0x014a, 501, /* Ŋ ŋ */
- 0x014c, 501, /* Ō ō */
- 0x014e, 501, /* Ŏ ŏ */
- 0x0150, 501, /* Ő ő */
- 0x0152, 501, /* Œ œ */
- 0x0154, 501, /* Ŕ ŕ */
- 0x0156, 501, /* Ŗ ŗ */
- 0x0158, 501, /* Ř ř */
- 0x015a, 501, /* Ś ś */
- 0x015c, 501, /* Ŝ ŝ */
- 0x015e, 501, /* Ş ş */
- 0x0160, 501, /* Š š */
- 0x0162, 501, /* Ţ ţ */
- 0x0164, 501, /* Ť ť */
- 0x0166, 501, /* Ŧ ŧ */
- 0x0168, 501, /* Ũ ũ */
- 0x016a, 501, /* Ū ū */
- 0x016c, 501, /* Ŭ ŭ */
- 0x016e, 501, /* Ů ů */
- 0x0170, 501, /* Ű ű */
- 0x0172, 501, /* Ų ų */
- 0x0174, 501, /* Ŵ ŵ */
- 0x0176, 501, /* Ŷ ŷ */
- 0x0178, 379, /* Ÿ ÿ */
- 0x0179, 501, /* Ź ź */
- 0x017b, 501, /* Ż ż */
- 0x017d, 501, /* Ž ž */
- 0x0181, 710, /* Ɓ ɓ */
- 0x0182, 501, /* Ƃ ƃ */
- 0x0184, 501, /* Ƅ ƅ */
- 0x0186, 706, /* Ɔ ɔ */
- 0x0187, 501, /* Ƈ ƈ */
- 0x018b, 501, /* Ƌ ƌ */
- 0x0190, 703, /* Ɛ ɛ */
- 0x0191, 501, /* Ƒ ƒ */
- 0x0193, 705, /* Ɠ ɠ */
- 0x0194, 707, /* Ɣ ɣ */
- 0x0196, 711, /* Ɩ ɩ */
- 0x0197, 709, /* Ɨ ɨ */
- 0x0198, 501, /* Ƙ ƙ */
- 0x019c, 711, /* Ɯ ɯ */
- 0x019d, 713, /* Ɲ ɲ */
- 0x01a0, 501, /* Ơ ơ */
- 0x01a2, 501, /* Ƣ ƣ */
- 0x01a4, 501, /* Ƥ ƥ */
- 0x01a7, 501, /* Ƨ ƨ */
- 0x01a9, 718, /* Ʃ ʃ */
- 0x01ac, 501, /* Ƭ ƭ */
- 0x01ae, 718, /* Ʈ ʈ */
- 0x01af, 501, /* Ư ư */
- 0x01b3, 501, /* Ƴ ƴ */
- 0x01b5, 501, /* Ƶ ƶ */
- 0x01b7, 719, /* Ʒ ʒ */
- 0x01b8, 501, /* Ƹ ƹ */
- 0x01bc, 501, /* Ƽ ƽ */
- 0x01c4, 502, /* DŽ dž */
- 0x01c5, 501, /* Dž dž */
- 0x01c7, 502, /* LJ lj */
- 0x01c8, 501, /* Lj lj */
- 0x01ca, 502, /* NJ nj */
- 0x01cb, 501, /* Nj nj */
- 0x01cd, 501, /* Ǎ ǎ */
- 0x01cf, 501, /* Ǐ ǐ */
- 0x01d1, 501, /* Ǒ ǒ */
- 0x01d3, 501, /* Ǔ ǔ */
- 0x01d5, 501, /* Ǖ ǖ */
- 0x01d7, 501, /* Ǘ ǘ */
- 0x01d9, 501, /* Ǚ ǚ */
- 0x01db, 501, /* Ǜ ǜ */
- 0x01de, 501, /* Ǟ ǟ */
- 0x01e0, 501, /* Ǡ ǡ */
- 0x01e2, 501, /* Ǣ ǣ */
- 0x01e4, 501, /* Ǥ ǥ */
- 0x01e6, 501, /* Ǧ ǧ */
- 0x01e8, 501, /* Ǩ ǩ */
- 0x01ea, 501, /* Ǫ ǫ */
- 0x01ec, 501, /* Ǭ ǭ */
- 0x01ee, 501, /* Ǯ ǯ */
- 0x01f1, 502, /* DZ dz */
- 0x01f2, 501, /* Dz dz */
- 0x01f4, 501, /* Ǵ ǵ */
- 0x01fa, 501, /* Ǻ ǻ */
- 0x01fc, 501, /* Ǽ ǽ */
- 0x01fe, 501, /* Ǿ ǿ */
- 0x0200, 501, /* Ȁ ȁ */
- 0x0202, 501, /* Ȃ ȃ */
- 0x0204, 501, /* Ȅ ȅ */
- 0x0206, 501, /* Ȇ ȇ */
- 0x0208, 501, /* Ȉ ȉ */
- 0x020a, 501, /* Ȋ ȋ */
- 0x020c, 501, /* Ȍ ȍ */
- 0x020e, 501, /* Ȏ ȏ */
- 0x0210, 501, /* Ȑ ȑ */
- 0x0212, 501, /* Ȓ ȓ */
- 0x0214, 501, /* Ȕ ȕ */
- 0x0216, 501, /* Ȗ ȗ */
- 0x0386, 538, /* Ά ά */
- 0x038c, 564, /* Ό ό */
- 0x03e2, 501, /* Ϣ ϣ */
- 0x03e4, 501, /* Ϥ ϥ */
- 0x03e6, 501, /* Ϧ ϧ */
- 0x03e8, 501, /* Ϩ ϩ */
- 0x03ea, 501, /* Ϫ ϫ */
- 0x03ec, 501, /* Ϭ ϭ */
- 0x03ee, 501, /* Ϯ ϯ */
- 0x0460, 501, /* Ѡ ѡ */
- 0x0462, 501, /* Ѣ ѣ */
- 0x0464, 501, /* Ѥ ѥ */
- 0x0466, 501, /* Ѧ ѧ */
- 0x0468, 501, /* Ѩ ѩ */
- 0x046a, 501, /* Ѫ ѫ */
- 0x046c, 501, /* Ѭ ѭ */
- 0x046e, 501, /* Ѯ ѯ */
- 0x0470, 501, /* Ѱ ѱ */
- 0x0472, 501, /* Ѳ ѳ */
- 0x0474, 501, /* Ѵ ѵ */
- 0x0476, 501, /* Ѷ ѷ */
- 0x0478, 501, /* Ѹ ѹ */
- 0x047a, 501, /* Ѻ ѻ */
- 0x047c, 501, /* Ѽ ѽ */
- 0x047e, 501, /* Ѿ ѿ */
- 0x0480, 501, /* Ҁ ҁ */
- 0x0490, 501, /* Ґ ґ */
- 0x0492, 501, /* Ғ ғ */
- 0x0494, 501, /* Ҕ ҕ */
- 0x0496, 501, /* Җ җ */
- 0x0498, 501, /* Ҙ ҙ */
- 0x049a, 501, /* Қ қ */
- 0x049c, 501, /* Ҝ ҝ */
- 0x049e, 501, /* Ҟ ҟ */
- 0x04a0, 501, /* Ҡ ҡ */
- 0x04a2, 501, /* Ң ң */
- 0x04a4, 501, /* Ҥ ҥ */
- 0x04a6, 501, /* Ҧ ҧ */
- 0x04a8, 501, /* Ҩ ҩ */
- 0x04aa, 501, /* Ҫ ҫ */
- 0x04ac, 501, /* Ҭ ҭ */
- 0x04ae, 501, /* Ү ү */
- 0x04b0, 501, /* Ұ ұ */
- 0x04b2, 501, /* Ҳ ҳ */
- 0x04b4, 501, /* Ҵ ҵ */
- 0x04b6, 501, /* Ҷ ҷ */
- 0x04b8, 501, /* Ҹ ҹ */
- 0x04ba, 501, /* Һ һ */
- 0x04bc, 501, /* Ҽ ҽ */
- 0x04be, 501, /* Ҿ ҿ */
- 0x04c1, 501, /* Ӂ ӂ */
- 0x04c3, 501, /* Ӄ ӄ */
- 0x04c7, 501, /* Ӈ ӈ */
- 0x04cb, 501, /* Ӌ ӌ */
- 0x04d0, 501, /* Ӑ ӑ */
- 0x04d2, 501, /* Ӓ ӓ */
- 0x04d4, 501, /* Ӕ ӕ */
- 0x04d6, 501, /* Ӗ ӗ */
- 0x04d8, 501, /* Ә ә */
- 0x04da, 501, /* Ӛ ӛ */
- 0x04dc, 501, /* Ӝ ӝ */
- 0x04de, 501, /* Ӟ ӟ */
- 0x04e0, 501, /* Ӡ ӡ */
- 0x04e2, 501, /* Ӣ ӣ */
- 0x04e4, 501, /* Ӥ ӥ */
- 0x04e6, 501, /* Ӧ ӧ */
- 0x04e8, 501, /* Ө ө */
- 0x04ea, 501, /* Ӫ ӫ */
- 0x04ee, 501, /* Ӯ ӯ */
- 0x04f0, 501, /* Ӱ ӱ */
- 0x04f2, 501, /* Ӳ ӳ */
- 0x04f4, 501, /* Ӵ ӵ */
- 0x04f8, 501, /* Ӹ ӹ */
- 0x1e00, 501, /* Ḁ ḁ */
- 0x1e02, 501, /* Ḃ ḃ */
- 0x1e04, 501, /* Ḅ ḅ */
- 0x1e06, 501, /* Ḇ ḇ */
- 0x1e08, 501, /* Ḉ ḉ */
- 0x1e0a, 501, /* Ḋ ḋ */
- 0x1e0c, 501, /* Ḍ ḍ */
- 0x1e0e, 501, /* Ḏ ḏ */
- 0x1e10, 501, /* Ḑ ḑ */
- 0x1e12, 501, /* Ḓ ḓ */
- 0x1e14, 501, /* Ḕ ḕ */
- 0x1e16, 501, /* Ḗ ḗ */
- 0x1e18, 501, /* Ḙ ḙ */
- 0x1e1a, 501, /* Ḛ ḛ */
- 0x1e1c, 501, /* Ḝ ḝ */
- 0x1e1e, 501, /* Ḟ ḟ */
- 0x1e20, 501, /* Ḡ ḡ */
- 0x1e22, 501, /* Ḣ ḣ */
- 0x1e24, 501, /* Ḥ ḥ */
- 0x1e26, 501, /* Ḧ ḧ */
- 0x1e28, 501, /* Ḩ ḩ */
- 0x1e2a, 501, /* Ḫ ḫ */
- 0x1e2c, 501, /* Ḭ ḭ */
- 0x1e2e, 501, /* Ḯ ḯ */
- 0x1e30, 501, /* Ḱ ḱ */
- 0x1e32, 501, /* Ḳ ḳ */
- 0x1e34, 501, /* Ḵ ḵ */
- 0x1e36, 501, /* Ḷ ḷ */
- 0x1e38, 501, /* Ḹ ḹ */
- 0x1e3a, 501, /* Ḻ ḻ */
- 0x1e3c, 501, /* Ḽ ḽ */
- 0x1e3e, 501, /* Ḿ ḿ */
- 0x1e40, 501, /* Ṁ ṁ */
- 0x1e42, 501, /* Ṃ ṃ */
- 0x1e44, 501, /* Ṅ ṅ */
- 0x1e46, 501, /* Ṇ ṇ */
- 0x1e48, 501, /* Ṉ ṉ */
- 0x1e4a, 501, /* Ṋ ṋ */
- 0x1e4c, 501, /* Ṍ ṍ */
- 0x1e4e, 501, /* Ṏ ṏ */
- 0x1e50, 501, /* Ṑ ṑ */
- 0x1e52, 501, /* Ṓ ṓ */
- 0x1e54, 501, /* Ṕ ṕ */
- 0x1e56, 501, /* Ṗ ṗ */
- 0x1e58, 501, /* Ṙ ṙ */
- 0x1e5a, 501, /* Ṛ ṛ */
- 0x1e5c, 501, /* Ṝ ṝ */
- 0x1e5e, 501, /* Ṟ ṟ */
- 0x1e60, 501, /* Ṡ ṡ */
- 0x1e62, 501, /* Ṣ ṣ */
- 0x1e64, 501, /* Ṥ ṥ */
- 0x1e66, 501, /* Ṧ ṧ */
- 0x1e68, 501, /* Ṩ ṩ */
- 0x1e6a, 501, /* Ṫ ṫ */
- 0x1e6c, 501, /* Ṭ ṭ */
- 0x1e6e, 501, /* Ṯ ṯ */
- 0x1e70, 501, /* Ṱ ṱ */
- 0x1e72, 501, /* Ṳ ṳ */
- 0x1e74, 501, /* Ṵ ṵ */
- 0x1e76, 501, /* Ṷ ṷ */
- 0x1e78, 501, /* Ṹ ṹ */
- 0x1e7a, 501, /* Ṻ ṻ */
- 0x1e7c, 501, /* Ṽ ṽ */
- 0x1e7e, 501, /* Ṿ ṿ */
- 0x1e80, 501, /* Ẁ ẁ */
- 0x1e82, 501, /* Ẃ ẃ */
- 0x1e84, 501, /* Ẅ ẅ */
- 0x1e86, 501, /* Ẇ ẇ */
- 0x1e88, 501, /* Ẉ ẉ */
- 0x1e8a, 501, /* Ẋ ẋ */
- 0x1e8c, 501, /* Ẍ ẍ */
- 0x1e8e, 501, /* Ẏ ẏ */
- 0x1e90, 501, /* Ẑ ẑ */
- 0x1e92, 501, /* Ẓ ẓ */
- 0x1e94, 501, /* Ẕ ẕ */
- 0x1ea0, 501, /* Ạ ạ */
- 0x1ea2, 501, /* Ả ả */
- 0x1ea4, 501, /* Ấ ấ */
- 0x1ea6, 501, /* Ầ ầ */
- 0x1ea8, 501, /* Ẩ ẩ */
- 0x1eaa, 501, /* Ẫ ẫ */
- 0x1eac, 501, /* Ậ ậ */
- 0x1eae, 501, /* Ắ ắ */
- 0x1eb0, 501, /* Ằ ằ */
- 0x1eb2, 501, /* Ẳ ẳ */
- 0x1eb4, 501, /* Ẵ ẵ */
- 0x1eb6, 501, /* Ặ ặ */
- 0x1eb8, 501, /* Ẹ ẹ */
- 0x1eba, 501, /* Ẻ ẻ */
- 0x1ebc, 501, /* Ẽ ẽ */
- 0x1ebe, 501, /* Ế ế */
- 0x1ec0, 501, /* Ề ề */
- 0x1ec2, 501, /* Ể ể */
- 0x1ec4, 501, /* Ễ ễ */
- 0x1ec6, 501, /* Ệ ệ */
- 0x1ec8, 501, /* Ỉ ỉ */
- 0x1eca, 501, /* Ị ị */
- 0x1ecc, 501, /* Ọ ọ */
- 0x1ece, 501, /* Ỏ ỏ */
- 0x1ed0, 501, /* Ố ố */
- 0x1ed2, 501, /* Ồ ồ */
- 0x1ed4, 501, /* Ổ ổ */
- 0x1ed6, 501, /* Ỗ ỗ */
- 0x1ed8, 501, /* Ộ ộ */
- 0x1eda, 501, /* Ớ ớ */
- 0x1edc, 501, /* Ờ ờ */
- 0x1ede, 501, /* Ở ở */
- 0x1ee0, 501, /* Ỡ ỡ */
- 0x1ee2, 501, /* Ợ ợ */
- 0x1ee4, 501, /* Ụ ụ */
- 0x1ee6, 501, /* Ủ ủ */
- 0x1ee8, 501, /* Ứ ứ */
- 0x1eea, 501, /* Ừ ừ */
- 0x1eec, 501, /* Ử ử */
- 0x1eee, 501, /* Ữ ữ */
- 0x1ef0, 501, /* Ự ự */
- 0x1ef2, 501, /* Ỳ ỳ */
- 0x1ef4, 501, /* Ỵ ỵ */
- 0x1ef6, 501, /* Ỷ ỷ */
- 0x1ef8, 501, /* Ỹ ỹ */
- 0x1f59, 492, /* Ὑ ὑ */
- 0x1f5b, 492, /* Ὓ ὓ */
- 0x1f5d, 492, /* Ὕ ὕ */
- 0x1f5f, 492, /* Ὗ ὗ */
- 0x1fbc, 491, /* ᾼ ᾳ */
- 0x1fcc, 491, /* ῌ ῃ */
- 0x1fec, 493, /* Ῥ ῥ */
- 0x1ffc, 491, /* ῼ ῳ */
-};
-
-/*
- * title characters are those between
- * upper and lower case.  ie DZ Dz dz
- */
-static
-Rune _totitle1[] =
-{
- 0x01c4, 501, /* DŽ Dž */
- 0x01c6, 499, /* dž Dž */
- 0x01c7, 501, /* LJ Lj */
- 0x01c9, 499, /* lj Lj */
- 0x01ca, 501, /* NJ Nj */
- 0x01cc, 499, /* nj Nj */
- 0x01f1, 501, /* DZ Dz */
- 0x01f3, 499, /* dz Dz */
-};
-
-static
-Rune*
-bsearch(Rune c, Rune *t, int n, int ne)
-{
- Rune *p;
- int m;
-
- while(n > 1) {
- m = n/2;
- p = t + m*ne;
- if(c >= p[0]) {
- t = p;
- n = n-m;
- } else
- n = m;
- }
- if(n && c >= t[0])
- return t;
- return 0;
-}
-
-Rune
-tolowerrune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _tolower2, nelem(_tolower2)/3, 3);
- if(p && c >= p[0] && c <= p[1])
- return c + p[2] - 500;
- p = bsearch(c, _tolower1, nelem(_tolower1)/2, 2);
- if(p && c == p[0])
- return c + p[1] - 500;
- return c;
-}
-
-Rune
-toupperrune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _toupper2, nelem(_toupper2)/3, 3);
- if(p && c >= p[0] && c <= p[1])
- return c + p[2] - 500;
- p = bsearch(c, _toupper1, nelem(_toupper1)/2, 2);
- if(p && c == p[0])
- return c + p[1] - 500;
- return c;
-}
-
-Rune
-totitlerune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _totitle1, nelem(_totitle1)/2, 2);
- if(p && c == p[0])
- return c + p[1] - 500;
- return c;
-}
-
-int
-islowerrune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _toupper2, nelem(_toupper2)/3, 3);
- if(p && c >= p[0] && c <= p[1])
- return 1;
- p = bsearch(c, _toupper1, nelem(_toupper1)/2, 2);
- if(p && c == p[0])
- return 1;
- return 0;
-}
-
-int
-isupperrune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _tolower2, nelem(_tolower2)/3, 3);
- if(p && c >= p[0] && c <= p[1])
- return 1;
- p = bsearch(c, _tolower1, nelem(_tolower1)/2, 2);
- if(p && c == p[0])
- return 1;
- return 0;
-}
-
-int
-isalpharune(Rune c)
-{
- Rune *p;
-
- if(isupperrune(c) || islowerrune(c))
- return 1;
- p = bsearch(c, _alpha2, nelem(_alpha2)/2, 2);
- if(p && c >= p[0] && c <= p[1])
- return 1;
- p = bsearch(c, _alpha1, nelem(_alpha1), 1);
- if(p && c == p[0])
- return 1;
- return 0;
-}
-
-int
-istitlerune(Rune c)
-{
- return isupperrune(c) && islowerrune(c);
-}
-
-int
-isspacerune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _space2, nelem(_space2)/2, 2);
- if(p && c >= p[0] && c <= p[1])
- return 1;
- return 0;
-}
-
-int
-isdigitrune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, __isdigitr, nelem(__isdigitr)/2, 2);
- if(p && c >= p[0] && c <= p[1])
- return 1;
- return 0;
-}
--- a//sys/src/libc/test/mkfile
+++ b//sys/src/libc/test/mkfile
@@ -3,6 +3,8 @@
 TEST=\
	date\
	pow\
+ runebreak\
+ runenorm\
	strchr\

 </sys/src/cmd/mktest
--- /dev/null
+++ b//sys/src/libc/test/runebreak.c
@@ -1,0 +1,93 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+static int
+estrtoul(char *s)
+{
+ char *epr;
+ Rune code;
+
+ code = strtoul(s, &epr, 16);
+ if(s == epr)
+ sysfatal("bad code point hex string");
+ return code;
+}
+
+static void
+run(char *file, Rune* (*fn)(Rune*))
+{
+ Biobuf *b;
+ char *p, *dot;
+ char *pieces[16];
+ int i, j, n;
+ Rune stack[16], ops[16];
+ int nstack, nops;
+ Rune r, *rp, *rp2;
+ char *line;
+
+ b = Bopen(file, OREAD);
+ if(b == nil)
+ sysfatal("could not load composition exclusions: %r");
+
+ for(;(p = Brdline(b, '\n')) != nil; free(line)){
+ p[Blinelen(b)-1] = 0;
+ line = strdup(p);
+ if(p[0] == 0 || p[0] == '#')
+ continue;
+ if((dot = strstr(p, "#")) != nil)
+ *dot = 0;
+ n = getfields(p, pieces, nelem(pieces), 0, " ");
+ nstack = nops = 0;
+ for(i = 0; i < n; i++){
+ chartorune(&r, pieces[i]);
+ if(r != L'÷' && r != L'×'){
+ r = estrtoul(pieces[i]);
+ stack[nstack++] = r;
+ stack[nstack] = 0;
+ } else {
+ ops[nops++] = r;
+ ops[nops] = 0;
+ }
+ }
+
+ rp = stack;
+ for(i = 1; i < nops-1;){
+ rp2 = fn(rp);
+ switch(ops[i]){
+ case L'÷':
+ if(rp2 != rp+1){
+ print("break fail %X %X || %s\n", rp[0], rp[1], line);
+ goto Break;
+ }
+ rp++;
+ i++;
+ break;
+ case L'×':
+ if(rp2 - rp == 0){
+ for(j = i; j < nops - 1; j++)
+ if(ops[j] != L'×')
+ print("skipped %d %d %s\n", i, nops, line);
+ goto Break;
+ }
+ for(; rp < (rp2-1); rp++, i++){
+ if(ops[i] != L'×')
+ print("skipped %d %d %s\n", i, nops, line);
+ }
+ rp = rp2;
+ i++;
+ break;
+ }
+ }
+Break:
+ ;
+ }
+}
+
+void
+main(int, char)
+{
+ run("/lib/ucd/GraphemeBreakTest.txt", runegbreak);
+ run("/lib/ucd/WordBreakTest.txt", runewbreak);
+ exits(nil);
+}
--- /dev/null
+++ b//sys/src/libc/test/runenorm.c
@@ -1,0 +1,92 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+static int
+estrtoul(char *s)
+{
+ char *epr;
+ Rune code;
+
+ code = strtoul(s, &epr, 16);
+ if(s == epr)
+ sysfatal("bad code point hex string");
+ return code;
+}
+
+void
+main(int, char)
+{
+ Rune buffer1[64];
+ Rune buffer2[64];
+ char utfbuff1[128];
+ char utfbuff2[128];
+ char srctmp[128], tmp1[128], tmp2[128];
+ char *fields[10];
+ char *runes[32];
+ char *p;
+ int n, n2;
+ int i;
+ uint fail;
+ Biobuf *b;
+
+ b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load composition exclusions: %r");
+
+ struct {
+ Rune src[32];
+ Rune nfc[32];
+ Rune nfd[32];
+ } test;
+ while((p = Brdline(b, '\n')) != nil){
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#' || p[0] == '@')
+ continue;
+ getfields(p, fields, 6 + 1, 0, ";");
+ n = getfields(fields[0], runes, nelem(runes), 0, " ");
+ for(i = 0; i < n; i++)
+ test.src[i] = estrtoul(runes[i]);
+ test.src[i] = 0;
+
+ n = getfields(fields[1], runes, nelem(runes), 0, " ");
+ for(i = 0; i < n; i++)
+ test.nfc[i] = estrtoul(runes[i]);
+ test.nfc[i] = 0;
+
+ n = getfields(fields[2], runes, nelem(runes), 0, " ");
+ for(i = 0; i < n; i++)
+ test.nfd[i] = estrtoul(runes[i]);
+ test.nfd[i] = 0;
+
+ n = runenorm(buffer1, test.src, nelem(buffer1), 1);
+ n2 = runenorm(buffer2, test.src, nelem(buffer2), 0);
+ fail = 0;
+
+ if(runestrcmp(buffer1, test.nfc) != 0)
+ fail |= 1<<0;
+ if(runestrcmp(buffer2, test.nfd) != 0)
+ fail |= 1<<1;
+ if(fail)
+ print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2,
buffer1);
+ assert(n == runestrlen(test.nfc));
+ assert(n2 == runestrlen(test.nfd));
+
+ snprint(srctmp, sizeof tmp1, "%S", test.src);
+ snprint(tmp1, sizeof tmp1, "%S", test.nfc);
+ snprint(tmp2, sizeof tmp2, "%S", test.nfd);
+
+ n = utfnorm(utfbuff1, srctmp, nelem(utfbuff1), 1);
+ n2 = utfnorm(utfbuff2, srctmp, nelem(utfbuff2), 0);
+
+ if(strcmp(utfbuff1, tmp1) != 0)
+ fail |= 1<<2;
+ if(strcmp(utfbuff2, tmp2) != 0)
+ fail |= 1<<3;
+ if(fail)
+ print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2,
utfbuff1);
+ assert(n == strlen(tmp1));
+ assert(n2 == strlen(tmp2));
+ }
+ exits(nil);
+}


Wed Mar 22 21:16:26 EDT 2023
diff b8ae7708fb3ef3acbb30ccf3181897f8157c18de uncommitted
--- /dev/null
+++ b//lib/ucd/mkfile
@@ -1,0 +1,70 @@
+</$objtype/mkfile
+
+VERSION='15.0.0'
+URL='https://www.unicode.org/Public/'$VERSION'/ucd/'
+
+TXT=\
+ ArabicShaping.txt\
+ BidiBrackets.txt\
+ BidiMirroring.txt\
+ BidiTest.txt\
+ Blocks.txt\
+ CJKRadicals.txt\
+ CaseFolding.txt\
+ CompositionExclusions.txt\
+ DerivedAge.txt\
+ DerivedCoreProperties.txt\
+ DerivedNormalizationProps.txt\
+ EastAsianWidth.txt\
+ EmojiSources.txt\
+ EquivalentUnifiedIdeograph.txt\
+ HangulSyllableType.txt\
+ Index.txt\
+ IndicPositionalCategory.txt\
+ IndicSyllabicCategory.txt\
+ Jamo.txt\
+ LineBreak.txt\
+ NameAliases.txt\
+ NamedSequences.txt\
+ NamedSequencesProv.txt\
+ NamesList.txt\
+ NormalizationCorrections.txt\
+ NushuSources.txt\
+ PropList.txt\
+ PropertyAliases.txt\
+ PropertyValueAliases.txt\
+ ScriptExtensions.txt\
+ Scripts.txt\
+ SpecialCasing.txt\
+ StandardizedVariants.txt\
+ TangutSources.txt\
+ USourceData.txt\
+ UnicodeData.txt\
+ VerticalOrientation.txt\
+
+TEST=\
+ NormalizationTest.txt\
+ BidiCharacterTest.txt\
+
+PDF=\
+ USourceGlyphs.pdf\
+ USourceRSChart.pdf\
+
+AUX=\
+ WordBreakProperty.txt\
+ GraphemeBreakProperty.txt\
+
+ucd:V: UnicodeData.txt
+
+%.txt:
+ hget $URL^$target > $target >[2]/dev/null || hget
$URL^'auxiliary/'^$target > $target
+%.pdf:
+ hget $URL^$target > $target
+
+txt:V: $TXT
+
+pdf:V: $PDF
+
+test:V: $TEST
+
+all:V: $TXT $PDF $TEST
--- a//sys/include/libc.h
+++ b//sys/include/libc.h
@@ -77,6 +77,14 @@
 extern long runestrlen(Rune*);
 extern Rune* runestrstr(Rune*, Rune*);

+extern int runenorm(Rune*, Rune*, int, int);
+extern int utfnorm(char*,char*,int,int);
+extern char* fullutfnorm(char*,int);
+extern Rune* fullrunenorm(Rune*,int);
+
+extern Rune* runewbreak(Rune*);
+extern Rune* runegbreak(Rune*);
+
 extern Rune tolowerrune(Rune);
 extern Rune totitlerune(Rune);
 extern Rune toupperrune(Rune);
@@ -404,7 +412,7 @@
 extern int enc16chr(int);

 extern int encodefmt(Fmt*);
-extern void exits(char*);
+extern _Noreturn void exits(char*);
 extern double frexp(double, int*);
 extern uintptr getcallerpc(void*);
 extern char* getenv(char*);
@@ -431,7 +439,7 @@
 extern ulong strtoul(char*, char**, int);
 extern vlong strtoll(char*, char**, int);
 extern uvlong strtoull(char*, char**, int);
-extern void sysfatal(char*, ...);
+extern _Noreturn void sysfatal(char*, ...);
 #pragma varargck argpos sysfatal 1
 extern void syslog(int, char*, char*, ...);
 #pragma varargck argpos syslog 3
@@ -677,7 +685,7 @@
	ulong len;
 } IOchunk;

-extern void _exits(char*);
+extern _Noreturn void _exits(char*);

 extern void abort(void);
 extern int access(char*, int);
--- a//sys/src/cmd/tcs/hdr.h
+++ b//sys/src/cmd/tcs/hdr.h
@@ -23,6 +23,8 @@

 void utf_in(int, long *, struct convert *);
 void utf_out(Rune *, int, long *);
+void utfnfc_out(Rune *, int, long *);
+void utfnfd_out(Rune *, int, long *);
 void isoutf_in(int, long *, struct convert *);
 void isoutf_out(Rune *, int, long *);

--- a//sys/src/cmd/tcs/tcs.c
+++ b//sys/src/cmd/tcs/tcs.c
@@ -613,6 +613,10 @@
	{ "utf-16be", "alias for unicode-be (MIME)", Func, 0,
	(Fnptr)unicode_out_be },
	{ "utf-16le", "alias for unicode-le (MIME)", From|Func, 0,
	(Fnptr)unicode_in_le },
	{ "utf-16le", "alias for unicode-le (MIME)", Func, 0,
	(Fnptr)unicode_out_le },
+ { "nfc", "UTF Normalization Form C", From|Func, 0, (Fnptr)utf_in },
+ { "nfc", "UTF Normalization Form C", Func, 0, (Fnptr)utfnfc_out },
+ { "nfd", "UTF Normalization Form D", From|Func, 0, (Fnptr)utf_in },
+ { "nfd", "UTF Normalization Form D", Func, 0, (Fnptr)utfnfd_out },
	{ "viet1", "Vietnamese VSCII-1 (1993)", Table, (void *)tabviet1 },
	{ "viet2", "Vietnamese VSCII-2 (1993)", Table, (void *)tabviet2 },
	{ "vscii", "Vietnamese VISCII 1.1 (1992)", Table, (void *)tabviscii },
--- a//sys/src/cmd/tcs/utf.c
+++ b//sys/src/cmd/tcs/utf.c
@@ -19,38 +19,27 @@
 void
 utf_in(int fd, long *, struct convert *out)
 {
- char buf[N];
- int i, j, c, n, tot;
- unsigned long l;
+ char buf[N + 1];
+ Rune r;
+ char *p;
+ int n, tot, j;

	tot = 0;
+ j = 0;
	while((n = read(fd, buf+tot, N-tot)) >= 0){
		tot += n;
- for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i)));
){
- c = our_mbtowc(&l, buf+i, tot-i);
- if(c == -1){
- if(squawk)
- warn("bad UTF sequence near byte %ld in input", ninput+i);
- if(clean){
- i++;
- continue;
- }
- nerrors++;
- l = Runeerror;
- c = 1;
- }
- runes[j++] = l;
- i += c;
+ if(fullutfnorm(buf, tot) == buf)
+ continue;
+ /* fullutfnorm ensures rune boundary */
+ for(p = buf; p < buf + tot;){
+ p += chartorune(&r, p);
+ runes[j++] = r;
+ runes[j] = 0;
		}
		OUT(out, runes, j);
- tot -= i;
- ninput += i;
- if(tot)
- memmove(buf, buf+i, tot);
- if(n == 0)
- break;
+ j = 0;
+ tot = 0;
	}
- OUT(out, runes, 0);
 }

 void
@@ -66,6 +55,26 @@
	noutput += p-obuf;
	if(p > obuf)
		write(1, obuf, p-obuf);
+}
+
+void
+utfnfc_out(Rune *base, int n, long *)
+{
+ Rune buf[N + 1];
+ int w;
+
+ w = runenorm(buf, base, n + 1, 1);
+ utf_out(buf, w, nil);
+}
+
+void
+utfnfd_out(Rune *base, int n, long *)
+{
+ Rune buf[N + 1];
+ int w;
+
+ w = runenorm(buf, base, n + 1, 0);
+ utf_out(buf, w, nil);
 }

 void
--- a//sys/src/libc/port/mkfile
+++ b//sys/src/libc/port/mkfile
@@ -62,6 +62,9 @@
	rand.c\
	readn.c\
	rune.c\
+ runebreak.c\
+ runeistype.c\
+ runenorm.c\
	runestrcat.c\
	runestrchr.c\
	runestrcmp.c\
@@ -74,7 +77,7 @@
	runestrrchr.c\
	runestrlen.c\
	runestrstr.c\
- runetype.c\
+ runetotype.c\
	sin.c\
	sinh.c\
	sqrt.c\
@@ -127,3 +130,16 @@
 </sys/src/cmd/mksyslib

 profile.$O: /sys/include/tos.h
+
+runenorm.$O: runenormdata runenorm.c
+runetotype.$O: runetotypedata runetotype.c
+runeistype.$O: runeistypedata runeistype.c
+runebreak.$O: runebreakdata runebreak.c
+
+runenormdata runetotypedata runeistypedata runebreakdata: mkrunetype.c
+ @{
+ eval `{grep '^[A-Z]' /$cputype/mkfile}
+ $CC $CFLAGS -o mkrunetype.$O $prereq
+ $LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O
+ $O.mkrunetype
+ }
--- /dev/null
+++ b//sys/src/libc/port/mkrunetype.c
@@ -1,0 +1,761 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+enum{
+ NRUNES = 1<<21
+};
+
+typedef struct Param Param;
+typedef struct Lvl Lvl;
+struct Lvl{
+ int bits;
+ int max;
+ int mask;
+};
+struct Param{
+ Lvl idx1;
+ Lvl idx2;
+ Lvl data;
+
+ int round1max;
+};
+
+static void
+derive(Lvl *l)
+{
+ l->max = 1 << l->bits;
+ l->mask = l->max - 1;
+}
+
+static void
+param(Param *p, int idx1, int idx2)
+{
+
+ assert(idx1 + idx2 < 21);
+ p->idx1.bits = idx1;
+ p->idx2.bits = idx2;
+ p->data.bits = 21 - idx1 - idx2;
+ derive(&p->idx1);
+ derive(&p->idx2);
+ derive(&p->data);
+
+ p->round1max = NRUNES/p->data.max;
+}
+
+static int
+lkup(Param *p, int *idx1, int *idx2, int *data, int x)
+{
+ int y, z;
+
+ y = (((x)>>(p->data.bits+p->idx2.bits))&p->idx1.mask);
+ z = (((x)>>p->data.bits)&p->idx2.mask);
+ return data[idx2[idx1[y] + z] + (x&p->data.mask)];
+}
+
+static int
+mkarrvar(int fd, char *name, int *d, int len)
+{
+ int i, sz;
+ int max, min;
+ char *t;
+
+ max = min = 0;
+ for(i = 0; i < len; i++){
+ if(d[i] > max)
+ max = d[i];
+ if(d[i] < min)
+ min = d[i];
+ }
+ if(min == 0){
+ if(max < (uchar)~0)
+ t = "uchar", sz = 1;
+ else if(max < 0xFFFF)
+ t = "ushort", sz = 2;
+ else
+ t = "uint", sz = 4;
+ } else {
+ if(max < 1<<7)
+ t = "char", sz = 1;
+ else if(max < 1<<15)
+ t = "short", sz = 2;
+ else
+ t = "int", sz = 4;
+ }
+ if(fd < 0)
+ return sz * len;
+
+ fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len);
+ for(i = 0; i < len; i++){
+ fprint(fd, "%d,", d[i]);
+ if((i+1) % 16 == 0)
+ fprint(fd, "\n\t");
+ }
+ fprint(fd, "\n};\n");
+
+ return sz * len;
+}
+
+static int
+mkexceptarr(int fd, char *name, int *d, int n, int all)
+{
+ int i;
+ fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, all ? 3 : 2);
+ for(i = 0; i < n*3; i += 3){
+ if(all && d[i] != 0)
+ fprint(fd, "{0x%X, 0x%X, 0x%X},", d[i], d[i+1], d[i+2]);
+ else if(!all)
+ fprint(fd, "{0x%X, 0x%X},", d[i+1], d[i+2]);
+ if((i+3) % (8*3) == 0)
+ fprint(fd, "\n\t");
+ }
+ fprint(fd, "\n};\n");
+ return n * sizeof(Rune) * 2;
+}
+
+static int
+compact(int *data, int *idx, int nidx, int *src, int chunksize)
+{
+ int i, n, ndata, best;
+ int *dot, *lp, *rp;
+
+ dot = src;
+ ndata = 0;
+ idx[0] = 0;
+ for(i = 1; i <= nidx; i++){
+ rp = dot + chunksize;
+ lp = rp - 1;
+
+ for(best = 0, n = 0; i != nidx && n < chunksize; n++, lp--){
+ if(memcmp(lp, rp, (n+1) * sizeof data[0]) == 0)
+ best = n+1;
+ }
+ memmove(data + ndata, dot, (chunksize - best) * sizeof data[0]);
+ ndata += (chunksize - best);
+ idx[i] = idx[i - 1] + (chunksize - best);
+ dot = rp;
+ }
+ return ndata;
+}
+
+
+static int
+mklkup(int fd, char *label, int *map, Param *p)
+{
+ static int data[NRUNES];
+ static int idx2[NRUNES];
+ static int idx2dest[NRUNES];
+ static int idx1[NRUNES];
+ int i, nidx2, ndata;
+ int size;
+
+ ndata = compact(data, idx2, p->round1max, map, p->data.max);
+ nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max);
+
+ if(fd >= 0){
+ for(i = 0; i < NRUNES; i++)
+ if(map[i] != lkup(p, idx1, idx2dest, data, i))
+ sysfatal("mismatch in %s at %d %d %d\n", label, i, map[i], lkup(p, idx1,
idx2dest, data, i));
+ }
+
+ size = mkarrvar(fd, smprint("_%sdata", label), data, ndata);
+ size += mkarrvar(fd, smprint("_%sidx2", label), idx2dest, nidx2);
+ size += mkarrvar(fd, smprint("_%sidx1", label), idx1, p->idx1.max);
+ if(fd >= 0){
+ fprint(fd, "\n");
+ fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label,
p->data.bits, p->idx2.bits, p->idx1.mask);
+ fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label,
p->data.bits, p->idx2.mask);
+ fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask);
+ fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] +
%sindex2(x)] + %soffset(x)] )\n\n",
+ label, label, label, label, label, label, label);
+ }
+ return size;
+}
+
+static void
+mklkupmatrix(char *label, int *map, Param *p)
+{
+ int bestsize, size, bestx, besty;
+ int x, y;
+
+ bestsize = bestx = besty = -1;
+ for(x = 4; x <= 12; x++)
+ for(y=4; y <= (19 - x); y++){
+ param(p, x, y);
+ size = mklkup(-1, label, map, p);
+ if(bestsize == -1 || size < bestsize){
+ bestx = x;
+ besty = y;
+ bestsize = size;
+ }
+ }
+
+ assert(bestsize != -1);
+ fprint(2, "label: %s best: %d %d (%d)\n", label, bestx, besty, bestsize);
+ param(p, bestx, besty);
+}
+
+static int myismerged[NRUNES];
+static int mytoupper[NRUNES];
+static int mytolower[NRUNES];
+static int mytotitle[NRUNES];
+static int mybreak[NRUNES];
+
+enum{ DSTART = 0xEEEE };
+static int mydecomp[NRUNES];
+static int mydespecial[256*3];
+static int nspecial;
+static int myccc[NRUNES];
+
+typedef struct KV KV;
+struct KV{
+ uint key;
+ uint val;
+ ushort next;
+};
+
+static KV myrecomp[2000];
+static int nrecomp;
+
+static int recompext[256*3];
+static int nrecompext;
+
+static uint
+hash(uint x)
+{
+ x ^= x >> 16;
+ x *= 0x21f0aaad;
+ x ^= x >> 15;
+ x *= 0xd35a2d97;
+ x ^= x >> 15;
+ return x;
+}
+
+static void
+mkrecomp(int fd)
+{
+ int i;
+ KV *p;
+ static KV vals[512];
+ static KV coll[1000];
+ int over;
+ int maxchain;
+
+ for(i = 0; i < nelem(vals); i++)
+ vals[i] = (KV){0, 0, 0};
+ for(i = 0; i < nelem(coll); i++)
+ coll[i] = (KV){0, 0, 0};
+ over = 1;
+ for(i = 0; i < nrecomp; i++){
+ p = vals + (hash(myrecomp[i].key) % nelem(vals));
+ maxchain = 0;
+ while(p->key != 0){
+ maxchain++;
+ if(p->next == 0){
+ p->next = over;
+ p = coll + over - 1;
+ over++;
+ } else
+ p = coll + p->next - 1;
+ }
+ p->key = myrecomp[i].key;
+ p->val = myrecomp[i].val;
+ }
+ fprint(2, "recomp map [%d][%d]: %d\n", nelem(vals), over-1, (nelem(vals) +
over-1) * (4+2+2));
+ fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t");
+ for(p = vals, i = 0;; i++){
+ assert(p->val < 0xFFFF);
+ assert(p->next < 0xFFFF);
+ fprint(fd, "%udU,%udU,", p->key, p->val | (p->next<<16));
+ if((i+1) % 8 == 0)
+ fprint(fd, "\n\t");
+
+ if(p == vals+nelem(vals)-1)
+ p = coll;
+ else if(p == coll + over - 2)
+ break;
+ else
+ p++;
+ }
+ fprint(fd, "\n};\n");
+ fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nelem(vals));
+ /*
+ fprint(fd,
+ " x ^= x >> 16;\n"
+ " x *= 0x21f0aaad;\n"
+ " x ^= x >> 15;\n"
+ " x *= 0xd35a2d97;\n"
+ " x ^= x >> 15;\n"
+ " p = _recompdata + (x%%%d)*2;\n"
+ "}\n", nelem(vals));
+ */
+}
+
+static void
+mktables(void)
+{
+ Param p;
+ int tofd, isfd, normfd, breakfd;
+ int size;
+
+ tofd = create("runetotypedata", OWRITE, 0664);
+ if(tofd < 0)
+ sysfatal("could not create runetotypedata: %r");
+ param(&p, 10, 7);
+ size = mklkup(tofd, "upper", mytoupper, &p);
+ fprint(2, "%s: %d\n", "upper", size);
+
+ size = mklkup(tofd, "lower", mytolower, &p);
+ fprint(2, "%s: %d\n", "lower", size);
+
+ size = mklkup(tofd, "title", mytotitle, &p);
+ fprint(2, "%s: %d\n", "title", size);
+ close(tofd);
+
+ isfd = create("runeistypedata", OWRITE, 0664);
+ if(isfd < 0)
+ sysfatal("could not create runeistypedata: %r");
+ param(&p, 11, 6);
+ size = mklkup(isfd, "merged", myismerged, &p);
+ fprint(2, "%s: %d\n", "merged", size);
+ fprint(isfd, "static\nenum {\n");
+ fprint(isfd, "\tL%s = %s,\n", "space", "1<<0");
+ fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1");
+ fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2");
+ fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3");
+ fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4");
+ fprint(isfd, "\tL%s = %s,\n", "title", "1<<5");
+ fprint(isfd, "};\n");
+ close(isfd);
+
+ normfd = create("runenormdata", OWRITE, 0664);
+ if(normfd < 0)
+ sysfatal("could not create runenormdata: %r");
+ param(&p, 10, 7);
+ size = mklkup(normfd, "decomp", mydecomp, &p);
+ fprint(2, "%s: %d\n", "decomp", size);
+
+ param(&p, 9, 7);
+ size = mklkup(normfd, "ccc", myccc, &p);
+ fprint(2, "%s: %d\n", "ccc", size);
+
+ mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
+ mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
+ mkrecomp(normfd);
+ close(normfd);
+
+ param(&p, 10, 6);
+ breakfd = create("runebreakdata", OWRITE, 0644);
+ if(breakfd < 0)
+ sysfatal("could not create runebreakdata: %r");
+ size = mklkup(breakfd, "break", mybreak, &p);
+ fprint(2, "%s: %d\n", "break", size);
+}
+
+enum {
+ FIELD_CODE,
+ FIELD_NAME,
+ FIELD_CATEGORY,
+ FIELD_COMBINING,
+ FIELD_BIDIR,
+ FIELD_DECOMP,
+ FIELD_DECIMAL_DIG,
+ FIELD_DIG,
+ FIELD_NUMERIC_VAL,
+ FIELD_MIRRORED,
+ FIELD_UNICODE_1_NAME,
+ FIELD_COMMENT,
+ FIELD_UPPER,
+ FIELD_LOWER,
+ FIELD_TITLE,
+ NFIELDS,
+};
+
+static int
+getunicodeline(Biobuf *in, char **fields)
+{
+ char *p;
+
+ if((p = Brdline(in, '\n')) == nil)
+ return 0;
+
+ p[Blinelen(in)-1] = '\0';
+
+ if (getfields(p, fields, NFIELDS + 1, 0, ";") != NFIELDS)
+ sysfatal("bad number of fields");
+
+ return 1;
+}
+
+static int
+estrtoul(char *s, int base)
+{
+ char *epr;
+ Rune code;
+
+ code = strtoul(s, &epr, base);
+ if(s == epr)
+ sysfatal("bad code point hex string");
+ return code;
+}
+
+enum {
+ OTHER,
+ Hebrew_Letter, Newline, Extend, Format,
+ Katakana, ALetter, MidLetter, MidNum,
+ MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+ PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+ L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+ EMOJIEX = 0xB0,
+};
+
+static void
+markbreak(void)
+{
+ Biobuf *b;
+ char *p, *dot;
+ int i, s, e;
+ uchar v;
+
+ b = Bopen("/lib/ucd/WordBreakProperty.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load word breaks: %r");
+
+ while((p = Brdline(b, '\n')) != nil){
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#')
+ continue;
+ if((dot = strstr(p, "..")) != nil){
+ *dot = 0;
+ dot += 2;
+ s = estrtoul(p, 16);
+ e = estrtoul(dot, 16);
+ } else {
+ s = e = estrtoul(p, 16);
+ dot = p;
+ }
+ v = 0;
+ if(strstr(dot, "ExtendNumLet") != nil)
+ v = ExtendNumLet;
+ else if(strstr(dot, "Hebrew_Letter") != nil)
+ v = Hebrew_Letter;
+ else if(strstr(dot, "Newline") != nil)
+ v = Newline;
+ else if(strstr(dot, "Extend") != nil)
+ v = Extend;
+ else if(strstr(dot, "Format") != nil)
+ v = Format;
+ else if(strstr(dot, "Katakana") != nil)
+ v = Katakana;
+ else if(strstr(dot, "ALetter") != nil)
+ v = ALetter;
+ else if(strstr(dot, "MidLetter") != nil)
+ v = MidLetter;
+ else if(strstr(dot, "MidNum") != nil)
+ v = MidNum;
+ else if(strstr(dot, "Numeric") != nil)
+ v = Numeric;
+ else if(strstr(dot, "WSegSpace") != nil)
+ v = WSegSpace;
+ for(i = s; i <= e; i++)
+ mybreak[i] = v;
+ }
+ Bterm(b);
+ b = Bopen("/lib/ucd/GraphemeBreakProperty.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load Grapheme breaks: %r");
+
+ while((p = Brdline(b, '\n')) != nil){
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#')
+ continue;
+ if((dot = strstr(p, "..")) != nil){
+ *dot = 0;
+ dot += 2;
+ s = estrtoul(p, 16);
+ e = estrtoul(dot, 16);
+ } else {
+ s = e = estrtoul(p, 16);
+ dot = p;
+ }
+ v = 0;
+ if(strstr(dot, "; Prepend #") != nil)
+ v = PREPEND;
+ else if(strstr(dot, "; Control #") != nil)
+ v = CONTROL;
+ else if(strstr(dot, "; Extend #") != nil)
+ v = EXTEND;
+ else if(strstr(dot, "; Regional_Indicator #") != nil)
+ v = REGION;
+ else if(strstr(dot, "; SpacingMark #") != nil)
+ v = SPACEMK;
+ else if(strstr(dot, "; L #") != nil)
+ v = L;
+ else if(strstr(dot, "; V #") != nil)
+ v = V;
+ else if(strstr(dot, "; T #") != nil)
+ v = T;
+ else if(strstr(dot, "; LV #") != nil)
+ v = LV;
+ else if(strstr(dot, "; LVT #") != nil)
+ v = LVT;
+ for(i = s; i <= e; i++)
+ mybreak[i] |= v;
+ }
+ Bterm(b);
+
+ b = Bopen("/lib/ucd/emoji-data.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load emoji-data: %r");
+
+ while((p = Brdline(b, '\n')) != nil){
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#')
+ continue;
+ if((dot = strstr(p, "..")) != nil){
+ *dot = 0;
+ dot += 2;
+ s = estrtoul(p, 16);
+ e = estrtoul(dot, 16);
+ } else {
+ s = e = estrtoul(p, 16);
+ dot = p;
+ }
+ v = 0;
+ if(strstr(dot, "; Extended_Pictographic") != nil)
+ v = EMOJIEX;
+ for(i = s; i <= e; i++)
+ mybreak[i] |= v;
+ }
+ Bterm(b);
+}
+
+static void
+markexclusions(void)
+{
+ Biobuf *b;
+ char *p;
+ int i;
+ uint x;
+
+ b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load composition exclusions: %r");
+
+ while((p = Brdline(b, '\n')) != nil){
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#')
+ continue;
+ x = estrtoul(p, 16);
+ for(i = 0; i < nrecomp; i++){
+ if(myrecomp[i].val == x){
+ myrecomp[i].val = 0;
+ break;
+ }
+ }
+ if(i == nrecomp){
+ for(i = 0; i < nrecompext; i++){
+ if(recompext[i*3] == x){
+ recompext[i*3] = 0;
+ break;
+ }
+ }
+ }
+ }
+ Bterm(b);
+}
+
+void
+main(int, char)
+{
+ static char myisspace[NRUNES];
+ static char myisalpha[NRUNES];
+ static char myisdigit[NRUNES];
+ static char myisupper[NRUNES];
+ static char myislower[NRUNES];
+ static char myistitle[NRUNES];
+ Biobuf *in;
+ char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
+ char *p, *d;
+ int i, code, last;
+ int decomp[2], *ip;
+
+ in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
+ if(in == nil)
+ sysfatal("can't open UnicodeData.txt: %r");
+
+ for(i = 0; i < NRUNES; i++){
+ mytoupper[i] = -1;
+ mytolower[i] = -1;
+ mytotitle[i] = -1;
+ mydecomp[i] = 0;
+ myccc[i] = 0;
+ mybreak[i] = 0;
+ }
+
+ myisspace['\t'] = 1;
+ myisspace['\n'] = 1;
+ myisspace['\r'] = 1;
+ myisspace['\f'] = 1;
+ myisspace['\v'] = 1;
+ myisspace[0x85] = 1; /* control char, "next line" */
+ myisspace[0xfeff] = 1; /* zero-width non-break space */
+
+ last = -1;
+ nspecial = nrecomp = nrecompext = 0;
+ while(getunicodeline(in, fields)){
+ code = estrtoul(fields[FIELD_CODE], 16);
+ if (code >= NRUNES)
+ sysfatal("code-point value too big: %x", code);
+ if(code <= last)
+ sysfatal("bad code sequence: %x then %x", last, code);
+ last = code;
+
+ p = fields[FIELD_CATEGORY];
+ if(strstr(fields[FIELD_NAME], ", First>") != nil){
+ if(!getunicodeline(in, fields2))
+ sysfatal("range start at eof");
+ if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
+ sysfatal("range start not followed by range end");
+ last = estrtoul(fields2[FIELD_CODE], 16);
+ if(last <= code)
+ sysfatal("range out of sequence: %x then %x", code, last);
+ if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
+ sysfatal("range with mismatched category");
+ }
+
+ d = fields[FIELD_DECOMP];
+ if(strlen(d) > 0 && strstr(d, "<") == nil){
+ decomp[0] = estrtoul(d, 16);
+ d = strstr(d, " ");
+ if(d == nil){
+ /* singleton recompositions are verboden */
+ decomp[1] = 0;
+ if(decomp[0] > 0xFFFF){
+ //fprint(2, "case1 %X %X\n", code, decomp[0]);
+ ip = mydespecial + nspecial*3;
+ ip[0] = code;
+ ip[1] = decomp[0];
+ ip[2] = 0;
+ mydecomp[code] = (DSTART+nspecial)<<16;
+ nspecial++;
+ } else
+ mydecomp[code] = decomp[0]<<16;
+ } else {
+ d++;
+ decomp[1] = estrtoul(d, 16);
+ if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
+ //fprint(2, "case2 %X %X %X\n", code, decomp[0], decomp[1]);
+ ip = mydespecial + nspecial*3;
+ ip[0] = code;
+ ip[1] = decomp[0];
+ ip[2] = decomp[1];
+ mydecomp[code] = (DSTART+nspecial)<<16;
+ nspecial++;
+ ip = recompext + nrecompext*3;
+ ip[0] = code;
+ ip[1] = decomp[0];
+ ip[2] = decomp[1];
+ nrecompext++;
+ } else {
+ mydecomp[code] = decomp[0]<<16 | decomp[1];
+ myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0};
+ }
+ }
+ }
+
+ for (; code <= last; code++){
+ if(p[0] == 'L')
+ myisalpha[code] = 1;
+ if(p[0] == 'Z')
+ myisspace[code] = 1;
+
+ if(strcmp(p, "Lu") == 0)
+ myisupper[code] = 1;
+ if(strcmp(p, "Ll") == 0)
+ myislower[code] = 1;
+
+ if(strcmp(p, "Lt") == 0)
+ myistitle[code] = 1;
+
+ if(strcmp(p, "Nd") == 0)
+ myisdigit[code] = 1;
+
+ if(fields[FIELD_UPPER][0] != '\0')
+ mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16);
+
+ if(fields[FIELD_LOWER][0] != '\0')
+ mytolower[code] = estrtoul(fields[FIELD_LOWER], 16);
+
+ if(fields[FIELD_TITLE][0] != '\0')
+ mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16);
+
+ myccc[code] = estrtoul(fields[FIELD_COMBINING], 10);
+ }
+ }
+
+ Bterm(in);
+
+ markexclusions();
+
+ /*
+ * according to standard, if totitle(x) is not defined in ucd
+ * but toupper(x) is, then totitle is defined to be toupper(x)
+ */
+ for(i = 0; i < NRUNES; i++){
+ if(mytotitle[i] == -1
+ && mytoupper[i] != -1
+ && !myistitle[i])
+ mytotitle[i] = mytoupper[i];
+ }
+
+ /*
+ * A couple corrections:
+ * is*(to*(x)) should be true.
+ * restore undefined transformations.
+ * store offset instead of value, makes them sparse.
+ */
+ for(i = 0; i < NRUNES; i++){
+ if(mytoupper[i] != -1)
+ myisupper[mytoupper[i]] = 1;
+ else
+ mytoupper[i] = i;
+
+ if(mytolower[i] != -1)
+ myislower[mytolower[i]] = 1;
+ else
+ mytolower[i] = i;
+
+ if(mytotitle[i] != -1)
+ myistitle[mytotitle[i]] = 1;
+ else
+ mytotitle[i] = i;
+
+ mytoupper[i] = mytoupper[i] - i;
+ mytolower[i] = mytolower[i] - i;
+ mytotitle[i] = mytotitle[i] - i;
+ }
+
+ uchar b;
+ for(i = 0; i < NRUNES; i++){
+ b = 0;
+ if(myisspace[i])
+ b |= 1<<0;
+ if(myisalpha[i])
+ b |= 1<<1;
+ if(myisdigit[i])
+ b |= 1<<2;
+ if(myisupper[i])
+ b |= 1<<3;
+ if(myislower[i])
+ b |= 1<<4;
+ if(myistitle[i])
+ b |= 1<<5;
+
+ myismerged[i] = b;
+ }
+
+ markbreak();
+ mktables();
+ exits(nil);
+}
--- /dev/null
+++ b//sys/src/libc/port/runebreak.c
@@ -1,0 +1,149 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runebreakdata"
+
+enum {
+ OTHER,
+ Hebrew_Letter, Newline, Extend, Format,
+ Katakana, ALetter, MidLetter, MidNum,
+ MidNumLet, Numeric, ExtendNumLet, WSegSpace,
+ PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,
+ L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,
+ EMOJIEX = 0xB0,
+
+ ZWJ = 0x200DU,
+ LINETAB = 0xB,
+};
+
+#define IS(x, y) ((x&0xf) == y)
+#define ISG(x, y) ((x&0xf0) == y)
+
+Rune*
+runegbreak(Rune *s)
+{
+ Rune l, r;
+ uchar lt, rt;
+ Rune *p;
+
+ p = s;
+ if((l = *p++) == 0)
+ return s;
+ if((r = *p) == 0)
+ return s;
+ lt = breaklkup(l);
+ rt = breaklkup(r);
+ if(l == '\r' && r == '\n')
+ goto Done;
+ if(ISG(lt, CONTROL) || l == '\r' || l == '\n')
+ return p;
+ if(ISG(rt, CONTROL) || r == '\r' || r == '\n')
+ return p;
+ if(ISG(lt, L) && (ISG(rt, L) || ISG(rt, V) || ISG(rt, LV) || ISG(rt, LVT)))
+ goto Done;
+ if((ISG(lt, LV) || ISG(lt, V)) && (ISG(rt, V) || ISG(rt, T)))
+ goto Done;
+ if((ISG(lt, LVT) || ISG(lt, T)) && (ISG(rt, T) || ISG(rt, T)))
+ goto Done;
+ if(ISG(rt, SPACEMK) || ISG(lt, PREPEND))
+ goto Done;
+ if(ISG(lt, EMOJIEX) && (ISG(rt, EXTEND) || r == ZWJ)){
+ while(ISG(rt, EXTEND)){
+ p++;
+ if((r = *p) == 0)
+ return s;
+ rt = breaklkup(r);
+ }
+ if(r != ZWJ)
+ return p;
+ p++;
+ if((r = *p) == 0)
+ return s;
+ rt = breaklkup(r);
+ if(ISG(rt, EMOJIEX))
+ goto Done;
+ return p;
+ }
+ if(ISG(rt, EXTEND) || r == ZWJ)
+ goto Done;
+ if(ISG(lt, REGION) && ISG(rt, REGION))
+ goto Done;
+
+ return p;
+
+Done:
+ if(p[1] == 0)
+ return s;
+ return p + 1;
+}
+
+#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter))
+#define MNLQ(x) (IS(x, MidNumLet) || x == '\'')
+
+Rune*
+runewbreak(Rune *s)
+{
+ Rune l, r;
+ uchar lt, rt;
+ Rune *p;
+
+ p = s;
+ if((l = *p++) == 0)
+ return s;
+ if((r = *p) == 0)
+ return s;
+ lt = breaklkup(l);
+ rt = breaklkup(r);
+ if(l == '\r' && r == '\n')
+ goto Done;
+ if(l == '\r' || l == '\n' || l == LINETAB)
+ return p;
+ if(r == '\r' || r == '\n' || l == LINETAB)
+ return p;
+ if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+ goto Done;
+ if(IS(rt, Format) || IS(rt, Extend))
+ goto Done;
+ if(AH(lt)){
+ if(AH(rt))
+ goto Done;
+ if((IS(rt, MidLetter) || MNLQ(rt)) && p[1] != 0 && AH(breaklkup(p[1])))
+ goto Done;
+ if(IS(lt, Hebrew_Letter) && r == '\'')
+ goto Done;
+ if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]),
Hebrew_Letter))
+ goto Done;
+ if(IS(rt, Numeric))
+ goto Done;
+ }
+ if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+ goto Done;
+ if(IS(lt, Numeric) && (IS(rt, MidNum) || MNLQ(rt)) && p[1] != 0 &&
IS(breaklkup(p[1]), Numeric))
+ goto Done;
+ if(IS(lt, Katakana) && IS(rt, Katakana))
+ goto Done;
+ if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+ if(IS(rt, ExtendNumLet))
+ goto Done;
+ if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+ goto Done;
+ if(ISG(lt, REGION)){
+ if(ISG(rt, REGION))
+ goto Done;
+ if(r != ZWJ)
+ return p;
+ p++;
+ if((r = *p) == 0)
+ return s;
+ rt = breaklkup(r);
+ if(ISG(rt, REGION))
+ goto Done;
+ }
+
+ return p;
+
+Done:
+ if(p[1] == 0)
+ return s;
+ return p + 1;
+}
--- /dev/null
+++ b//sys/src/libc/port/runeistype.c
@@ -1,0 +1,40 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runeistypedata"
+
+int
+isspacerune(Rune c)
+{
+ return (mergedlkup(c) & Lspace) == Lspace;
+}
+
+int
+isalpharune(Rune c)
+{
+ return (mergedlkup(c) & Lalpha) == Lalpha;
+}
+
+int
+isdigitrune(Rune c)
+{
+ return (mergedlkup(c) & Ldigit) == Ldigit;
+}
+
+int
+isupperrune(Rune c)
+{
+ return (mergedlkup(c) & Lupper) == Lupper;
+}
+
+int
+islowerrune(Rune c)
+{
+ return (mergedlkup(c) & Llower) == Llower;
+}
+
+int
+istitlerune(Rune c)
+{
+ return (mergedlkup(c) & Ltitle) == Ltitle;
+}
--- /dev/null
+++ b//sys/src/libc/port/runenorm.c
@@ -1,0 +1,328 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runenormdata"
+
+//Unicode Standard: Section 3.12 Conjoining Jamo Behavior
+enum {
+ SBase = 0xAC00,
+ LBase = 0x1100,
+ VBase = 0x1161,
+ TBase = 0x11A7,
+
+ LCount = 19,
+ VCount = 21,
+ TCount = 28,
+ NCount = VCount * TCount,
+ SCount = LCount * NCount,
+
+ LLast = LBase + LCount - 1,
+ SLast = SBase + SCount - 1,
+ VLast = VBase + VCount - 1,
+ TLast = TBase + TCount - 1,
+};
+
+void
+decomposerune(Rune c, Rune dst[2])
+{
+ uint x;
+
+ if(c >= SBase && c <= SLast){
+ c -= SBase;
+ x = c % TCount;
+ if(x){
+ dst[0] = SBase + ((c / TCount) * TCount);
+ dst[1] = TBase + x;
+ return;
+ }
+ dst[0] = LBase + (c / NCount);
+ dst[1] = VBase + ((c % NCount) / TCount);
+ return;
+ }
+ x = decomplkup(c);
+ if((x & (ushort)~0) != 0){
+ dst[0] = x>>16;
+ dst[1] = x & (ushort)~0;
+ return;
+ }
+ x >>= 16;
+ if(x >= 0xEEEE && x <0xF8FF){
+ memmove(dst, _decompexceptions[x - 0xEEEE], sizeof(Rune)*2);
+ return;
+ }
+ dst[0] = x;
+ dst[1] = 0;
+}
+
+Rune
+composerune(Rune r[2])
+{
+ uint x, y, *p, next;
+
+ if(r[0] >= LBase && r[0] <= LLast){
+ if(r[1] < VBase || r[1] > VLast)
+ return 0;
+ x = (r[0] - LBase) * NCount + (r[1] - VBase) * TCount;
+ return SBase + x;
+ }
+ if(r[0] >= SBase && r[0] <= SLast && (r[0] - SBase) % TCount == 0){
+ if(r[1] > TBase && r[1] <= TLast)
+ return r[0] + (r[1] - TBase);
+ return 0;
+ }
+ if(r[0] > (ushort)~0 || r[1] > (ushort)~0){
+ for(x = 0; x < nelem(_recompexceptions); x++)
+ if(r[0] == _recompexceptions[x][1] && r[1] == _recompexceptions[x][2])
+ return _recompexceptions[x][0];
+ return 0;
+ }
+ y = x = r[0]<<16 | r[1];
+ x ^= x >> 16;
+ x *= 0x21f0aaad;
+ x ^= x >> 15;
+ x *= 0xd35a2d97;
+ x ^= x >> 15;
+ p = _recompdata + (x%512)*2;
+ while(p[0] != y){
+ next = p[1]>>16;
+ if(!next)
+ return 0;
+ p = _recompcoll + (next-1)*2;
+ }
+ return p[1] & 0xFFFF;
+}
+
+int
+runeccc(Rune c)
+{
+ return ccclkup(c);
+}
+
+void
+runecccsort(Rune *a, int len)
+{
+ Rune r;
+ int i;
+ int fail;
+
+ do {
+ fail = 0;
+ for(i = 0; i < len - 1; i++){
+ if(runeccc(a[i]) > runeccc(a[i+1]) > 0){
+ r = a[i];
+ a[i] = a[i+1];
+ a[i + 1] = r;
+ fail = 1;
+ }
+ }
+ } while(fail);
+}
+
+char*
+fullutfnorm(char *s, int n)
+{
+ Rune r, peek;
+ char *p, *p2;
+
+ p = s;
+ if(fullrune(p, n) == 0)
+ return s;
+
+ p += chartorune(&r, p);
+ n -= (p - s);
+
+ if((r >= LBase && r <= LLast) || (r >= SBase && r <= SLast)){
+ do {
+ if(fullrune(p, n) == 0)
+ return s;
+ p2 = p + chartorune(&peek, p);
+ n -= (p2 - p);
+ p = p2;
+ } while(n > 0 && (peek >= VBase && peek <= VLast) || (peek > TBase
&& peek <= TLast));
+ if(n <= 0)
+ return s;
+ return p;
+ }
+
+ do {
+ if(fullrune(p, n) == 0)
+ return s;
+ p2 = p + chartorune(&peek, p);
+ n -= (p2 - p);
+ p = p2;
+ if(runeccc(peek) == 0)
+ return p;
+ } while(n > 0);
+
+ return s;
+}
+
+Rune*
+fullrunenorm(Rune *r, int n)
+{
+ Rune *e, *p;
+
+ p = r;
+ e = p + n;
+
+ if((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast)){
+ p++;
+ while(p < e && (*p >= VBase && *p <= VLast) || (*p > TBase && *p
<= TLast))
+ p++;
+
+ if(p >= e)
+ return r;
+ return p;
+ }
+
+ for(; p < e && p + 1 < e; p++)
+ if(runeccc(p[1]) == 0)
+ return p + 1;
+
+ return r;
+}
+
+int
+_runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose)
+{
+ Rune c, r[2], _stack[32];
+ Rune *p, *stack, *sp, *tp;
+ char *strp, *strstop;
+ Rune *rp, *rrp;
+ Rune *stop;
+ Rune peek;
+ int w, w2, size;
+ int mode;
+
+ if(src){
+ mode = 1;
+ p = src;
+ stop = dst + (max - 1);
+ strp = "";
+ strstop = nil;
+ } else {
+ mode = 0;
+ p = L"";
+ stop = nil;
+ strp = ssrc;
+ strstop = sdst + (max - 1);
+ }
+
+ stack = _stack + nelem(_stack)/2;
+ size = 0;
+ w = w2 = 0;
+ while(*strp || *p){
+ if(mode)
+ c = *p;
+ else
+ w = chartorune(&c, strp);
+
+ sp = stack - 1;
+ tp = stack;
+ decomposerune(c, r);
+ while(r[0] != 0){
+ c = r[0];
+ if(r[1] != 0){
+ *sp-- = r[1];
+ if(sp == _stack)
+ break;
+ }
+ decomposerune(c, r);
+ }
+
+ *sp = c;
+ if(mode)
+ peek = p[1];
+ else
+ w2 = chartorune(&peek, strp+w);
+
+ if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <=
SLast)){
+ while(peek != 0 && (peek >= VBase && peek <= VLast) || (peek > TBase &&
peek <= TLast)){
+ *tp++ = peek;
+ if(mode){
+ p++;
+ peek = p[1];
+ } else {
+ strp += w;
+ w = w2;
+ w2 = chartorune(&peek, strp+w);
+ }
+ if(tp == _stack + nelem(_stack))
+ break;
+ }
+ }
+ while(peek != 0 && runeccc(peek) != 0){
+ decomposerune(peek, r);
+ if(r[1] != 0){
+ if(tp+1 >= _stack + nelem(_stack))
+ break;
+ *tp++ = r[0];
+ *tp++ = r[1];
+ } else if(r[0] != 0)
+ *tp++ = r[0];
+ else
+ *tp++ = peek;
+
+ if(mode){
+ p++;
+ peek = p[1];
+ } else {
+ strp += w;
+ w = w2;
+ w2 = chartorune(&peek, strp+w);
+ }
+ if(tp == _stack + nelem(_stack))
+ break;
+ }
+ runecccsort(sp, tp - sp);
+
+ if(compose && runeccc(*sp) == 0){
+ for(rp = sp + 1; rp < tp; rp++){
+ r[0] = *sp;
+ r[1] = *rp;
+ c = composerune(r);
+ if(c != 0){
+ *sp = c;
+ for(rrp = rp; rrp > sp; rrp--)
+ *rrp = rrp[-1];
+ sp++;
+ } else while(rp + 1 < tp && runeccc(*rp) == runeccc(*(rp+1)))
+ rp++;
+ }
+ }
+
+ for(; sp < tp; sp++){
+ if(mode){
+ if(dst < stop)
+ *dst++ = *sp;
+ size++;
+ } else {
+ w2 = runelen(*sp);
+ if(sdst+w2 < strstop)
+ sdst += runetochar(sdst, sp);
+ size += w2;
+ }
+ }
+ if(mode)
+ p++;
+ else
+ strp += w;
+ }
+ if(mode)
+ *dst = 0;
+ else
+ *sdst = 0;
+ return size;
+}
+
+int
+runenorm(Rune *dst, Rune *src, int max, int compose)
+{
+ return _runenorm(dst, src, nil, nil, max, compose);
+}
+
+int
+utfnorm(char *dst, char *src, int max, int compose)
+{
+ return _runenorm(nil, nil, dst, src, max, compose);
+}
--- /dev/null
+++ b//sys/src/libc/port/runetotype.c
@@ -1,0 +1,22 @@
+#include <u.h>
+#include <libc.h>
+
+#include "/sys/src/libc/port/runetotypedata"
+
+Rune
+toupperrune(Rune c)
+{
+ return c + upperlkup(c);
+}
+
+Rune
+tolowerrune(Rune c)
+{
+ return c + lowerlkup(c);
+}
+
+Rune
+totitlerune(Rune c)
+{
+ return c + titlelkup(c);
+}
--- a//sys/src/libc/port/runetype.c
+++ /dev/null
@@ -1,1181 +1,0 @@
-#include <u.h>
-#include <libc.h>
-
-/*
- * alpha ranges -
- * only covers ranges not in lower||upper
- */
-static
-Rune _alpha2[] =
-{
- 0x00d8, 0x00f6, /* Ø - ö */
- 0x00f8, 0x01f5, /* ø - ǵ */
- 0x0250, 0x02a8, /* ɐ - ʨ */
- 0x038e, 0x03a1, /* Ύ - Ρ */
- 0x03a3, 0x03ce, /* Σ - ώ */
- 0x03d0, 0x03d6, /* ϐ - ϖ */
- 0x03e2, 0x03f3, /* Ϣ - ϳ */
- 0x0490, 0x04c4, /* Ґ - ӄ */
- 0x0561, 0x0587, /* ա - և */
- 0x05d0, 0x05ea, /* א - ת */
- 0x05f0, 0x05f2, /* װ - ײ */
- 0x0621, 0x063a, /* ء - غ */
- 0x0640, 0x064a, /* ـ - ي */
- 0x0671, 0x06b7, /* ٱ - ڷ */
- 0x06ba, 0x06be, /* ں - ھ */
- 0x06c0, 0x06ce, /* ۀ - ێ */
- 0x06d0, 0x06d3, /* ې - ۓ */
- 0x0905, 0x0939, /* अ - ह */
- 0x0958, 0x0961, /* क़ - ॡ */
- 0x0985, 0x098c, /* অ - ঌ */
- 0x098f, 0x0990, /* এ - ঐ */
- 0x0993, 0x09a8, /* ও - ন */
- 0x09aa, 0x09b0, /* প - র */
- 0x09b6, 0x09b9, /* শ - হ */
- 0x09dc, 0x09dd, /* ড় - ঢ় */
- 0x09df, 0x09e1, /* য় - ৡ */
- 0x09f0, 0x09f1, /* ৰ - ৱ */
- 0x0a05, 0x0a0a, /* ਅ - ਊ */
- 0x0a0f, 0x0a10, /* ਏ - ਐ */
- 0x0a13, 0x0a28, /* ਓ - ਨ */
- 0x0a2a, 0x0a30, /* ਪ - ਰ */
- 0x0a32, 0x0a33, /* ਲ - ਲ਼ */
- 0x0a35, 0x0a36, /* ਵ - ਸ਼ */
- 0x0a38, 0x0a39, /* ਸ - ਹ */
- 0x0a59, 0x0a5c, /* ਖ਼ - ੜ */
- 0x0a85, 0x0a8b, /* અ - ઋ */
- 0x0a8f, 0x0a91, /* એ - ઑ */
- 0x0a93, 0x0aa8, /* ઓ - ન */
- 0x0aaa, 0x0ab0, /* પ - ર */
- 0x0ab2, 0x0ab3, /* લ - ળ */
- 0x0ab5, 0x0ab9, /* વ - હ */
- 0x0b05, 0x0b0c, /* ଅ - ଌ */
- 0x0b0f, 0x0b10, /* ଏ - ଐ */
- 0x0b13, 0x0b28, /* ଓ - ନ */
- 0x0b2a, 0x0b30, /* ପ - ର */
- 0x0b32, 0x0b33, /* ଲ - ଳ */
- 0x0b36, 0x0b39, /* ଶ - ହ */
- 0x0b5c, 0x0b5d, /* ଡ଼ - ଢ଼ */
- 0x0b5f, 0x0b61, /* ୟ - ୡ */
- 0x0b85, 0x0b8a, /* அ - ஊ */
- 0x0b8e, 0x0b90, /* எ - ஐ */
- 0x0b92, 0x0b95, /* ஒ - க */
- 0x0b99, 0x0b9a, /* ங - ச */
- 0x0b9e, 0x0b9f, /* ஞ - ட */
- 0x0ba3, 0x0ba4, /* ண - த */
- 0x0ba8, 0x0baa, /* ந - ப */
- 0x0bae, 0x0bb5, /* ம - வ */
- 0x0bb7, 0x0bb9, /* ஷ - ஹ */
- 0x0c05, 0x0c0c, /* అ - ఌ */
- 0x0c0e, 0x0c10, /* ఎ - ఐ */
- 0x0c12, 0x0c28, /* ఒ - న */
- 0x0c2a, 0x0c33, /* ప - ళ */
- 0x0c35, 0x0c39, /* వ - హ */
- 0x0c60, 0x0c61, /* ౠ - ౡ */
- 0x0c85, 0x0c8c, /* ಅ - ಌ */
- 0x0c8e, 0x0c90, /* ಎ - ಐ */
- 0x0c92, 0x0ca8, /* ಒ - ನ */
- 0x0caa, 0x0cb3, /* ಪ - ಳ */
- 0x0cb5, 0x0cb9, /* ವ - ಹ */
- 0x0ce0, 0x0ce1, /* ೠ - ೡ */
- 0x0d05, 0x0d0c, /* അ - ഌ */
- 0x0d0e, 0x0d10, /* എ - ഐ */
- 0x0d12, 0x0d28, /* ഒ - ന */
- 0x0d2a, 0x0d39, /* പ - ഹ */
- 0x0d60, 0x0d61, /* ൠ - ൡ */
- 0x0e01, 0x0e30, /* ก - ะ */
- 0x0e32, 0x0e33, /* า - ำ */
- 0x0e40, 0x0e46, /* เ - ๆ */
- 0x0e5a, 0x0e5b, /* ๚ - ๛ */
- 0x0e81, 0x0e82, /* ກ - ຂ */
- 0x0e87, 0x0e88, /* ງ - ຈ */
- 0x0e94, 0x0e97, /* ດ - ທ */
- 0x0e99, 0x0e9f, /* ນ - ຟ */
- 0x0ea1, 0x0ea3, /* ມ - ຣ */
- 0x0eaa, 0x0eab, /* ສ - ຫ */
- 0x0ead, 0x0eae, /* ອ - ຮ */
- 0x0eb2, 0x0eb3, /* າ - ຳ */
- 0x0ec0, 0x0ec4, /* ເ - ໄ */
- 0x0edc, 0x0edd, /* ໜ - ໝ */
- 0x0f18, 0x0f19, /* ༘ - ༙ */
- 0x0f40, 0x0f47, /* ཀ - ཇ */
- 0x0f49, 0x0f69, /* ཉ - ཀྵ */
- 0x10d0, 0x10f6, /* ა - ჶ */
- 0x1100, 0x1159, /* ᄀ - ᅙ */
- 0x115f, 0x11a2, /* ᅟ - ᆢ */
- 0x11a8, 0x11f9, /* ᆨ - ᇹ */
- 0x1e00, 0x1e9b, /* Ḁ - ẛ */
- 0x1f50, 0x1f57, /* ὐ - ὗ */
- 0x1f80, 0x1fb4, /* ᾀ - ᾴ */
- 0x1fb6, 0x1fbc, /* ᾶ - ᾼ */
- 0x1fc2, 0x1fc4, /* ῂ - ῄ */
- 0x1fc6, 0x1fcc, /* ῆ - ῌ */
- 0x1fd0, 0x1fd3, /* ῐ - ΐ */
- 0x1fd6, 0x1fdb, /* ῖ - Ί */
- 0x1fe0, 0x1fec, /* ῠ - Ῥ */
- 0x1ff2, 0x1ff4, /* ῲ - ῴ */
- 0x1ff6, 0x1ffc, /* ῶ - ῼ */
- 0x210a, 0x2113, /* ℊ - ℓ */
- 0x2115, 0x211d, /* ℕ - ℝ */
- 0x2120, 0x2122, /* ℠ - ™ */
- 0x212a, 0x2131, /* K - ℱ */
- 0x2133, 0x2138, /* ℳ - ℸ */
- 0x3041, 0x3094, /* ぁ - ゔ */
- 0x30a1, 0x30fa, /* ァ - ヺ */
- 0x3105, 0x312c, /* ㄅ - ㄬ */
- 0x3131, 0x318e, /* ㄱ - ㆎ */
- 0x3192, 0x319f, /* ㆒ - ㆟ */
- 0x3260, 0x327b, /* ㉠ - ㉻ */
- 0x328a, 0x32b0, /* ㊊ - ㊰ */
- 0x32d0, 0x32fe, /* ㋐ - ㋾ */
- 0x3300, 0x3357, /* ㌀ - ㍗ */
- 0x3371, 0x3376, /* ㍱ - ㍶ */
- 0x337b, 0x3394, /* ㍻ - ㎔ */
- 0x3399, 0x339e, /* ㎙ - ㎞ */
- 0x33a9, 0x33ad, /* ㎩ - ㎭ */
- 0x33b0, 0x33c1, /* ㎰ - ㏁ */
- 0x33c3, 0x33c5, /* ㏃ - ㏅ */
- 0x33c7, 0x33d7, /* ㏇ - ㏗ */
- 0x33d9, 0x33dd, /* ㏙ - ㏝ */
- 0x4e00, 0x9fff, /* 一 - 鿿 */
- 0xac00, 0xd7a3, /* 가 - 힣 */
- 0xf900, 0xfb06, /* 豈 - st */
- 0xfb13, 0xfb17, /* ﬓ - ﬗ */
- 0xfb1f, 0xfb28, /* ײַ - ﬨ */
- 0xfb2a, 0xfb36, /* שׁ - זּ */
- 0xfb38, 0xfb3c, /* טּ - לּ */
- 0xfb40, 0xfb41, /* נּ - סּ */
- 0xfb43, 0xfb44, /* ףּ - פּ */
- 0xfb46, 0xfbb1, /* צּ - ﮱ */
- 0xfbd3, 0xfd3d, /* ﯓ - ﴽ */
- 0xfd50, 0xfd8f, /* ﵐ - ﶏ */
- 0xfd92, 0xfdc7, /* ﶒ - ﷇ */
- 0xfdf0, 0xfdf9, /* ﷰ - ﷹ */
- 0xfe70, 0xfe72, /* ﹰ - ﹲ */
- 0xfe76, 0xfefc, /* ﹶ - ﻼ */
- 0xff66, 0xff6f, /* ヲ - ッ */
- 0xff71, 0xff9d, /* ア - ン */
- 0xffa0, 0xffbe, /* ᅠ - ᄒ */
- 0xffc2, 0xffc7, /* ᅡ - ᅦ */
- 0xffca, 0xffcf, /* ᅧ - ᅬ */
- 0xffd2, 0xffd7, /* ᅭ - ᅲ */
- 0xffda, 0xffdc, /* ᅳ - ᅵ */
-};
-
-/*
- * alpha singlets -
- * only covers ranges not in lower||upper
- */
-static
-Rune _alpha1[] =
-{
- 0x00aa, /* ª */
- 0x00b5, /* µ */
- 0x00ba, /* º */
- 0x03da, /* Ϛ */
- 0x03dc, /* Ϝ */
- 0x03de, /* Ϟ */
- 0x03e0, /* Ϡ */
- 0x06d5, /* ە */
- 0x09b2, /* ল */
- 0x0a5e, /* ਫ਼ */
- 0x0a8d, /* ઍ */
- 0x0ae0, /* ૠ */
- 0x0b9c, /* ஜ */
- 0x0cde, /* ೞ */
- 0x0e4f, /* ๏ */
- 0x0e84, /* ຄ */
- 0x0e8a, /* ຊ */
- 0x0e8d, /* ຍ */
- 0x0ea5, /* ລ */
- 0x0ea7, /* ວ */
- 0x0eb0, /* ະ */
- 0x0ebd, /* ຽ */
- 0x1fbe, /* ι */
- 0x207f, /* ⁿ */
- 0x20a8, /* ₨ */
- 0x2102, /* ℂ */
- 0x2107, /* ℇ */
- 0x2124, /* ℤ */
- 0x2126, /* Ω */
- 0x2128, /* ℨ */
- 0xfb3e, /* מּ */
- 0xfe74, /* ﹴ */
-};
-
-/*
- * space ranges
- */
-static
-Rune _space2[] =
-{
- 0x0009, 0x000a, /* tab and newline */
- 0x0020, 0x0020, /* space */
- 0x0085, 0x0085,
- 0x00a0, 0x00a0, /*   */
- 0x1680, 0x1680,
- 0x180e, 0x180e,
- 0x2000, 0x200b, /*   - ​ */
- 0x2028, 0x2029, /* 
 - 
 */
- 0x202f, 0x202f,
- 0x205f, 0x205f,
- 0x3000, 0x3000, /*   */
- 0xfeff, 0xfeff, /*  */
-};
-
-/*
- * lower case ranges
- * 3rd col is conversion excess 500
- */
-static
-Rune _toupper2[] =
-{
- 0x0061, 0x007a, 468, /* a-z A-Z */
- 0x00e0, 0x00f6, 468, /* à-ö À-Ö */
- 0x00f8, 0x00fe, 468, /* ø-þ Ø-Þ */
- 0x0256, 0x0257, 295, /* ɖ-ɗ Ɖ-Ɗ */
- 0x0258, 0x0259, 298, /* ɘ-ə Ǝ-Ə */
- 0x028a, 0x028b, 283, /* ʊ-ʋ Ʊ-Ʋ */
- 0x03ad, 0x03af, 463, /* έ-ί Έ-Ί */
- 0x03b1, 0x03c1, 468, /* α-ρ Α-Ρ */
- 0x03c3, 0x03cb, 468, /* σ-ϋ Σ-Ϋ */
- 0x03cd, 0x03ce, 437, /* ύ-ώ Ύ-Ώ */
- 0x0430, 0x044f, 468, /* а-я А-Я */
- 0x0451, 0x045c, 420, /* ё-ќ Ё-Ќ */
- 0x045e, 0x045f, 420, /* ў-џ Ў-Џ */
- 0x0561, 0x0586, 452, /* ա-ֆ Ա-Ֆ */
- 0x1f00, 0x1f07, 508, /* ἀ-ἇ Ἀ-Ἇ */
- 0x1f10, 0x1f15, 508, /* ἐ-ἕ Ἐ-Ἕ */
- 0x1f20, 0x1f27, 508, /* ἠ-ἧ Ἠ-Ἧ */
- 0x1f30, 0x1f37, 508, /* ἰ-ἷ Ἰ-Ἷ */
- 0x1f40, 0x1f45, 508, /* ὀ-ὅ Ὀ-Ὅ */
- 0x1f60, 0x1f67, 508, /* ὠ-ὧ Ὠ-Ὧ */
- 0x1f70, 0x1f71, 574, /* ὰ-ά Ὰ-Ά */
- 0x1f72, 0x1f75, 586, /* ὲ-ή Ὲ-Ή */
- 0x1f76, 0x1f77, 600, /* ὶ-ί Ὶ-Ί */
- 0x1f78, 0x1f79, 628, /* ὸ-ό Ὸ-Ό */
- 0x1f7a, 0x1f7b, 612, /* ὺ-ύ Ὺ-Ύ */
- 0x1f7c, 0x1f7d, 626, /* ὼ-ώ Ὼ-Ώ */
- 0x1f80, 0x1f87, 508, /* ᾀ-ᾇ ᾈ-ᾏ */
- 0x1f90, 0x1f97, 508, /* ᾐ-ᾗ ᾘ-ᾟ */
- 0x1fa0, 0x1fa7, 508, /* ᾠ-ᾧ ᾨ-ᾯ */
- 0x1fb0, 0x1fb1, 508, /* ᾰ-ᾱ Ᾰ-Ᾱ */
- 0x1fd0, 0x1fd1, 508, /* ῐ-ῑ Ῐ-Ῑ */
- 0x1fe0, 0x1fe1, 508, /* ῠ-ῡ Ῠ-Ῡ */
- 0x2170, 0x217f, 484, /* ⅰ-ⅿ Ⅰ-Ⅿ */
- 0x24d0, 0x24e9, 474, /* ⓐ-ⓩ Ⓐ-Ⓩ */
- 0xff41, 0xff5a, 468, /* a-z A-Z */
-};
-
-/*
- * lower case singlets
- * 2nd col is conversion excess 500
- */
-static
-Rune _toupper1[] =
-{
- 0x00ff, 621, /* ÿ Ÿ */
- 0x0101, 499, /* ā Ā */
- 0x0103, 499, /* ă Ă */
- 0x0105, 499, /* ą Ą */
- 0x0107, 499, /* ć Ć */
- 0x0109, 499, /* ĉ Ĉ */
- 0x010b, 499, /* ċ Ċ */
- 0x010d, 499, /* č Č */
- 0x010f, 499, /* ď Ď */
- 0x0111, 499, /* đ Đ */
- 0x0113, 499, /* ē Ē */
- 0x0115, 499, /* ĕ Ĕ */
- 0x0117, 499, /* ė Ė */
- 0x0119, 499, /* ę Ę */
- 0x011b, 499, /* ě Ě */
- 0x011d, 499, /* ĝ Ĝ */
- 0x011f, 499, /* ğ Ğ */
- 0x0121, 499, /* ġ Ġ */
- 0x0123, 499, /* ģ Ģ */
- 0x0125, 499, /* ĥ Ĥ */
- 0x0127, 499, /* ħ Ħ */
- 0x0129, 499, /* ĩ Ĩ */
- 0x012b, 499, /* ī Ī */
- 0x012d, 499, /* ĭ Ĭ */
- 0x012f, 499, /* į Į */
- 0x0131, 268, /* ı I */
- 0x0133, 499, /* ij IJ */
- 0x0135, 499, /* ĵ Ĵ */
- 0x0137, 499, /* ķ Ķ */
- 0x013a, 499, /* ĺ Ĺ */
- 0x013c, 499, /* ļ Ļ */
- 0x013e, 499, /* ľ Ľ */
- 0x0140, 499, /* ŀ Ŀ */
- 0x0142, 499, /* ł Ł */
- 0x0144, 499, /* ń Ń */
- 0x0146, 499, /* ņ Ņ */
- 0x0148, 499, /* ň Ň */
- 0x014b, 499, /* ŋ Ŋ */
- 0x014d, 499, /* ō Ō */
- 0x014f, 499, /* ŏ Ŏ */
- 0x0151, 499, /* ő Ő */
- 0x0153, 499, /* œ Œ */
- 0x0155, 499, /* ŕ Ŕ */
- 0x0157, 499, /* ŗ Ŗ */
- 0x0159, 499, /* ř Ř */
- 0x015b, 499, /* ś Ś */
- 0x015d, 499, /* ŝ Ŝ */
- 0x015f, 499, /* ş Ş */
- 0x0161, 499, /* š Š */
- 0x0163, 499, /* ţ Ţ */
- 0x0165, 499, /* ť Ť */
- 0x0167, 499, /* ŧ Ŧ */
- 0x0169, 499, /* ũ Ũ */
- 0x016b, 499, /* ū Ū */
- 0x016d, 499, /* ŭ Ŭ */
- 0x016f, 499, /* ů Ů */
- 0x0171, 499, /* ű Ű */
- 0x0173, 499, /* ų Ų */
- 0x0175, 499, /* ŵ Ŵ */
- 0x0177, 499, /* ŷ Ŷ */
- 0x017a, 499, /* ź Ź */
- 0x017c, 499, /* ż Ż */
- 0x017e, 499, /* ž Ž */
- 0x017f, 200, /* ſ S */
- 0x0183, 499, /* ƃ Ƃ */
- 0x0185, 499, /* ƅ Ƅ */
- 0x0188, 499, /* ƈ Ƈ */
- 0x018c, 499, /* ƌ Ƌ */
- 0x0192, 499, /* ƒ Ƒ */
- 0x0199, 499, /* ƙ Ƙ */
- 0x01a1, 499, /* ơ Ơ */
- 0x01a3, 499, /* ƣ Ƣ */
- 0x01a5, 499, /* ƥ Ƥ */
- 0x01a8, 499, /* ƨ Ƨ */
- 0x01ad, 499, /* ƭ Ƭ */
- 0x01b0, 499, /* ư Ư */
- 0x01b4, 499, /* ƴ Ƴ */
- 0x01b6, 499, /* ƶ Ƶ */
- 0x01b9, 499, /* ƹ Ƹ */
- 0x01bd, 499, /* ƽ Ƽ */
- 0x01c5, 499, /* Dž DŽ */
- 0x01c6, 498, /* dž DŽ */
- 0x01c8, 499, /* Lj LJ */
- 0x01c9, 498, /* lj LJ */
- 0x01cb, 499, /* Nj NJ */
- 0x01cc, 498, /* nj NJ */
- 0x01ce, 499, /* ǎ Ǎ */
- 0x01d0, 499, /* ǐ Ǐ */
- 0x01d2, 499, /* ǒ Ǒ */
- 0x01d4, 499, /* ǔ Ǔ */
- 0x01d6, 499, /* ǖ Ǖ */
- 0x01d8, 499, /* ǘ Ǘ */
- 0x01da, 499, /* ǚ Ǚ */
- 0x01dc, 499, /* ǜ Ǜ */
- 0x01df, 499, /* ǟ Ǟ */
- 0x01e1, 499, /* ǡ Ǡ */
- 0x01e3, 499, /* ǣ Ǣ */
- 0x01e5, 499, /* ǥ Ǥ */
- 0x01e7, 499, /* ǧ Ǧ */
- 0x01e9, 499, /* ǩ Ǩ */
- 0x01eb, 499, /* ǫ Ǫ */
- 0x01ed, 499, /* ǭ Ǭ */
- 0x01ef, 499, /* ǯ Ǯ */
- 0x01f2, 499, /* Dz DZ */
- 0x01f3, 498, /* dz DZ */
- 0x01f5, 499, /* ǵ Ǵ */
- 0x01fb, 499, /* ǻ Ǻ */
- 0x01fd, 499, /* ǽ Ǽ */
- 0x01ff, 499, /* ǿ Ǿ */
- 0x0201, 499, /* ȁ Ȁ */
- 0x0203, 499, /* ȃ Ȃ */
- 0x0205, 499, /* ȅ Ȅ */
- 0x0207, 499, /* ȇ Ȇ */
- 0x0209, 499, /* ȉ Ȉ */
- 0x020b, 499, /* ȋ Ȋ */
- 0x020d, 499, /* ȍ Ȍ */
- 0x020f, 499, /* ȏ Ȏ */
- 0x0211, 499, /* ȑ Ȑ */
- 0x0213, 499, /* ȓ Ȓ */
- 0x0215, 499, /* ȕ Ȕ */
- 0x0217, 499, /* ȗ Ȗ */
- 0x0253, 290, /* ɓ Ɓ */
- 0x0254, 294, /* ɔ Ɔ */
- 0x025b, 297, /* ɛ Ɛ */
- 0x0260, 295, /* ɠ Ɠ */
- 0x0263, 293, /* ɣ Ɣ */
- 0x0268, 291, /* ɨ Ɨ */
- 0x0269, 289, /* ɩ Ɩ */
- 0x026f, 289, /* ɯ Ɯ */
- 0x0272, 287, /* ɲ Ɲ */
- 0x0283, 282, /* ʃ Ʃ */
- 0x0288, 282, /* ʈ Ʈ */
- 0x0292, 281, /* ʒ Ʒ */
- 0x03ac, 462, /* ά Ά */
- 0x03cc, 436, /* ό Ό */
- 0x03d0, 438, /* ϐ Β */
- 0x03d1, 443, /* ϑ Θ */
- 0x03d5, 453, /* ϕ Φ */
- 0x03d6, 446, /* ϖ Π */
- 0x03e3, 499, /* ϣ Ϣ */
- 0x03e5, 499, /* ϥ Ϥ */
- 0x03e7, 499, /* ϧ Ϧ */
- 0x03e9, 499, /* ϩ Ϩ */
- 0x03eb, 499, /* ϫ Ϫ */
- 0x03ed, 499, /* ϭ Ϭ */
- 0x03ef, 499, /* ϯ Ϯ */
- 0x03f0, 414, /* ϰ Κ */
- 0x03f1, 420, /* ϱ Ρ */
- 0x0461, 499, /* ѡ Ѡ */
- 0x0463, 499, /* ѣ Ѣ */
- 0x0465, 499, /* ѥ Ѥ */
- 0x0467, 499, /* ѧ Ѧ */
- 0x0469, 499, /* ѩ Ѩ */
- 0x046b, 499, /* ѫ Ѫ */
- 0x046d, 499, /* ѭ Ѭ */
- 0x046f, 499, /* ѯ Ѯ */
- 0x0471, 499, /* ѱ Ѱ */
- 0x0473, 499, /* ѳ Ѳ */
- 0x0475, 499, /* ѵ Ѵ */
- 0x0477, 499, /* ѷ Ѷ */
- 0x0479, 499, /* ѹ Ѹ */
- 0x047b, 499, /* ѻ Ѻ */
- 0x047d, 499, /* ѽ Ѽ */
- 0x047f, 499, /* ѿ Ѿ */
- 0x0481, 499, /* ҁ Ҁ */
- 0x0491, 499, /* ґ Ґ */
- 0x0493, 499, /* ғ Ғ */
- 0x0495, 499, /* ҕ Ҕ */
- 0x0497, 499, /* җ Җ */
- 0x0499, 499, /* ҙ Ҙ */
- 0x049b, 499, /* қ Қ */
- 0x049d, 499, /* ҝ Ҝ */
- 0x049f, 499, /* ҟ Ҟ */
- 0x04a1, 499, /* ҡ Ҡ */
- 0x04a3, 499, /* ң Ң */
- 0x04a5, 499, /* ҥ Ҥ */
- 0x04a7, 499, /* ҧ Ҧ */
- 0x04a9, 499, /* ҩ Ҩ */
- 0x04ab, 499, /* ҫ Ҫ */
- 0x04ad, 499, /* ҭ Ҭ */
- 0x04af, 499, /* ү Ү */
- 0x04b1, 499, /* ұ Ұ */
- 0x04b3, 499, /* ҳ Ҳ */
- 0x04b5, 499, /* ҵ Ҵ */
- 0x04b7, 499, /* ҷ Ҷ */
- 0x04b9, 499, /* ҹ Ҹ */
- 0x04bb, 499, /* һ Һ */
- 0x04bd, 499, /* ҽ Ҽ */
- 0x04bf, 499, /* ҿ Ҿ */
- 0x04c2, 499, /* ӂ Ӂ */
- 0x04c4, 499, /* ӄ Ӄ */
- 0x04c8, 499, /* ӈ Ӈ */
- 0x04cc, 499, /* ӌ Ӌ */
- 0x04d1, 499, /* ӑ Ӑ */
- 0x04d3, 499, /* ӓ Ӓ */
- 0x04d5, 499, /* ӕ Ӕ */
- 0x04d7, 499, /* ӗ Ӗ */
- 0x04d9, 499, /* ә Ә */
- 0x04db, 499, /* ӛ Ӛ */
- 0x04dd, 499, /* ӝ Ӝ */
- 0x04df, 499, /* ӟ Ӟ */
- 0x04e1, 499, /* ӡ Ӡ */
- 0x04e3, 499, /* ӣ Ӣ */
- 0x04e5, 499, /* ӥ Ӥ */
- 0x04e7, 499, /* ӧ Ӧ */
- 0x04e9, 499, /* ө Ө */
- 0x04eb, 499, /* ӫ Ӫ */
- 0x04ef, 499, /* ӯ Ӯ */
- 0x04f1, 499, /* ӱ Ӱ */
- 0x04f3, 499, /* ӳ Ӳ */
- 0x04f5, 499, /* ӵ Ӵ */
- 0x04f9, 499, /* ӹ Ӹ */
- 0x1e01, 499, /* ḁ Ḁ */
- 0x1e03, 499, /* ḃ Ḃ */
- 0x1e05, 499, /* ḅ Ḅ */
- 0x1e07, 499, /* ḇ Ḇ */
- 0x1e09, 499, /* ḉ Ḉ */
- 0x1e0b, 499, /* ḋ Ḋ */
- 0x1e0d, 499, /* ḍ Ḍ */
- 0x1e0f, 499, /* ḏ Ḏ */
- 0x1e11, 499, /* ḑ Ḑ */
- 0x1e13, 499, /* ḓ Ḓ */
- 0x1e15, 499, /* ḕ Ḕ */
- 0x1e17, 499, /* ḗ Ḗ */
- 0x1e19, 499, /* ḙ Ḙ */
- 0x1e1b, 499, /* ḛ Ḛ */
- 0x1e1d, 499, /* ḝ Ḝ */
- 0x1e1f, 499, /* ḟ Ḟ */
- 0x1e21, 499, /* ḡ Ḡ */
- 0x1e23, 499, /* ḣ Ḣ */
- 0x1e25, 499, /* ḥ Ḥ */
- 0x1e27, 499, /* ḧ Ḧ */
- 0x1e29, 499, /* ḩ Ḩ */
- 0x1e2b, 499, /* ḫ Ḫ */
- 0x1e2d, 499, /* ḭ Ḭ */
- 0x1e2f, 499, /* ḯ Ḯ */
- 0x1e31, 499, /* ḱ Ḱ */
- 0x1e33, 499, /* ḳ Ḳ */
- 0x1e35, 499, /* ḵ Ḵ */
- 0x1e37, 499, /* ḷ Ḷ */
- 0x1e39, 499, /* ḹ Ḹ */
- 0x1e3b, 499, /* ḻ Ḻ */
- 0x1e3d, 499, /* ḽ Ḽ */
- 0x1e3f, 499, /* ḿ Ḿ */
- 0x1e41, 499, /* ṁ Ṁ */
- 0x1e43, 499, /* ṃ Ṃ */
- 0x1e45, 499, /* ṅ Ṅ */
- 0x1e47, 499, /* ṇ Ṇ */
- 0x1e49, 499, /* ṉ Ṉ */
- 0x1e4b, 499, /* ṋ Ṋ */
- 0x1e4d, 499, /* ṍ Ṍ */
- 0x1e4f, 499, /* ṏ Ṏ */
- 0x1e51, 499, /* ṑ Ṑ */
- 0x1e53, 499, /* ṓ Ṓ */
- 0x1e55, 499, /* ṕ Ṕ */
- 0x1e57, 499, /* ṗ Ṗ */
- 0x1e59, 499, /* ṙ Ṙ */
- 0x1e5b, 499, /* ṛ Ṛ */
- 0x1e5d, 499, /* ṝ Ṝ */
- 0x1e5f, 499, /* ṟ Ṟ */
- 0x1e61, 499, /* ṡ Ṡ */
- 0x1e63, 499, /* ṣ Ṣ */
- 0x1e65, 499, /* ṥ Ṥ */
- 0x1e67, 499, /* ṧ Ṧ */
- 0x1e69, 499, /* ṩ Ṩ */
- 0x1e6b, 499, /* ṫ Ṫ */
- 0x1e6d, 499, /* ṭ Ṭ */
- 0x1e6f, 499, /* ṯ Ṯ */
- 0x1e71, 499, /* ṱ Ṱ */
- 0x1e73, 499, /* ṳ Ṳ */
- 0x1e75, 499, /* ṵ Ṵ */
- 0x1e77, 499, /* ṷ Ṷ */
- 0x1e79, 499, /* ṹ Ṹ */
- 0x1e7b, 499, /* ṻ Ṻ */
- 0x1e7d, 499, /* ṽ Ṽ */
- 0x1e7f, 499, /* ṿ Ṿ */
- 0x1e81, 499, /* ẁ Ẁ */
- 0x1e83, 499, /* ẃ Ẃ */
- 0x1e85, 499, /* ẅ Ẅ */
- 0x1e87, 499, /* ẇ Ẇ */
- 0x1e89, 499, /* ẉ Ẉ */
- 0x1e8b, 499, /* ẋ Ẋ */
- 0x1e8d, 499, /* ẍ Ẍ */
- 0x1e8f, 499, /* ẏ Ẏ */
- 0x1e91, 499, /* ẑ Ẑ */
- 0x1e93, 499, /* ẓ Ẓ */
- 0x1e95, 499, /* ẕ Ẕ */
- 0x1ea1, 499, /* ạ Ạ */
- 0x1ea3, 499, /* ả Ả */
- 0x1ea5, 499, /* ấ Ấ */
- 0x1ea7, 499, /* ầ Ầ */
- 0x1ea9, 499, /* ẩ Ẩ */
- 0x1eab, 499, /* ẫ Ẫ */
- 0x1ead, 499, /* ậ Ậ */
- 0x1eaf, 499, /* ắ Ắ */
- 0x1eb1, 499, /* ằ Ằ */
- 0x1eb3, 499, /* ẳ Ẳ */
- 0x1eb5, 499, /* ẵ Ẵ */
- 0x1eb7, 499, /* ặ Ặ */
- 0x1eb9, 499, /* ẹ Ẹ */
- 0x1ebb, 499, /* ẻ Ẻ */
- 0x1ebd, 499, /* ẽ Ẽ */
- 0x1ebf, 499, /* ế Ế */
- 0x1ec1, 499, /* ề Ề */
- 0x1ec3, 499, /* ể Ể */
- 0x1ec5, 499, /* ễ Ễ */
- 0x1ec7, 499, /* ệ Ệ */
- 0x1ec9, 499, /* ỉ Ỉ */
- 0x1ecb, 499, /* ị Ị */
- 0x1ecd, 499, /* ọ Ọ */
- 0x1ecf, 499, /* ỏ Ỏ */
- 0x1ed1, 499, /* ố Ố */
- 0x1ed3, 499, /* ồ Ồ */
- 0x1ed5, 499, /* ổ Ổ */
- 0x1ed7, 499, /* ỗ Ỗ */
- 0x1ed9, 499, /* ộ Ộ */
- 0x1edb, 499, /* ớ Ớ */
- 0x1edd, 499, /* ờ Ờ */
- 0x1edf, 499, /* ở Ở */
- 0x1ee1, 499, /* ỡ Ỡ */
- 0x1ee3, 499, /* ợ Ợ */
- 0x1ee5, 499, /* ụ Ụ */
- 0x1ee7, 499, /* ủ Ủ */
- 0x1ee9, 499, /* ứ Ứ */
- 0x1eeb, 499, /* ừ Ừ */
- 0x1eed, 499, /* ử Ử */
- 0x1eef, 499, /* ữ Ữ */
- 0x1ef1, 499, /* ự Ự */
- 0x1ef3, 499, /* ỳ Ỳ */
- 0x1ef5, 499, /* ỵ Ỵ */
- 0x1ef7, 499, /* ỷ Ỷ */
- 0x1ef9, 499, /* ỹ Ỹ */
- 0x1f51, 508, /* ὑ Ὑ */
- 0x1f53, 508, /* ὓ Ὓ */
- 0x1f55, 508, /* ὕ Ὕ */
- 0x1f57, 508, /* ὗ Ὗ */
- 0x1fb3, 509, /* ᾳ ᾼ */
- 0x1fc3, 509, /* ῃ ῌ */
- 0x1fe5, 507, /* ῥ Ῥ */
- 0x1ff3, 509, /* ῳ ῼ */
-};
-
-static Rune __isdigitr[] = {
- 0x0030, 0x0039,
- 0x0660, 0x0669,
- 0x06f0, 0x06f9,
- 0x07c0, 0x07c9,
- 0x0966, 0x096f,
- 0x09e6, 0x09ef,
- 0x0a66, 0x0a6f,
- 0x0ae6, 0x0aef,
- 0x0b66, 0x0b6f,
- 0x0be6, 0x0bef,
- 0x0c66, 0x0c6f,
- 0x0ce6, 0x0cef,
- 0x0d66, 0x0d6f,
- 0x0e50, 0x0e59,
- 0x0ed0, 0x0ed9,
- 0x0f20, 0x0f29,
- 0x1040, 0x1049,
- 0x17e0, 0x17e9,
- 0x1810, 0x1819,
- 0x1946, 0x194f,
- 0x19d0, 0x19d9,
- 0x1b50, 0x1b59,
- 0xff10, 0xff19,
- 0x104a0, 0x104a9,
- 0x1d7ce, 0x1d7ff,
-};
-
-/*
- * upper case ranges
- * 3rd col is conversion excess 500
- */
-static
-Rune _tolower2[] =
-{
- 0x0041, 0x005a, 532, /* A-Z a-z */
- 0x00c0, 0x00d6, 532, /* À-Ö à-ö */
- 0x00d8, 0x00de, 532, /* Ø-Þ ø-þ */
- 0x0189, 0x018a, 705, /* Ɖ-Ɗ ɖ-ɗ */
- 0x018e, 0x018f, 702, /* Ǝ-Ə ɘ-ə */
- 0x01b1, 0x01b2, 717, /* Ʊ-Ʋ ʊ-ʋ */
- 0x0388, 0x038a, 537, /* Έ-Ί έ-ί */
- 0x038e, 0x038f, 563, /* Ύ-Ώ ύ-ώ */
- 0x0391, 0x03a1, 532, /* Α-Ρ α-ρ */
- 0x03a3, 0x03ab, 532, /* Σ-Ϋ σ-ϋ */
- 0x0401, 0x040c, 580, /* Ё-Ќ ё-ќ */
- 0x040e, 0x040f, 580, /* Ў-Џ ў-џ */
- 0x0410, 0x042f, 532, /* А-Я а-я */
- 0x0531, 0x0556, 548, /* Ա-Ֆ ա-ֆ */
- 0x10a0, 0x10c5, 548, /* Ⴀ-Ⴥ ა-ჵ */
- 0x1f08, 0x1f0f, 492, /* Ἀ-Ἇ ἀ-ἇ */
- 0x1f18, 0x1f1d, 492, /* Ἐ-Ἕ ἐ-ἕ */
- 0x1f28, 0x1f2f, 492, /* Ἠ-Ἧ ἠ-ἧ */
- 0x1f38, 0x1f3f, 492, /* Ἰ-Ἷ ἰ-ἷ */
- 0x1f48, 0x1f4d, 492, /* Ὀ-Ὅ ὀ-ὅ */
- 0x1f68, 0x1f6f, 492, /* Ὠ-Ὧ ὠ-ὧ */
- 0x1f88, 0x1f8f, 492, /* ᾈ-ᾏ ᾀ-ᾇ */
- 0x1f98, 0x1f9f, 492, /* ᾘ-ᾟ ᾐ-ᾗ */
- 0x1fa8, 0x1faf, 492, /* ᾨ-ᾯ ᾠ-ᾧ */
- 0x1fb8, 0x1fb9, 492, /* Ᾰ-Ᾱ ᾰ-ᾱ */
- 0x1fba, 0x1fbb, 426, /* Ὰ-Ά ὰ-ά */
- 0x1fc8, 0x1fcb, 414, /* Ὲ-Ή ὲ-ή */
- 0x1fd8, 0x1fd9, 492, /* Ῐ-Ῑ ῐ-ῑ */
- 0x1fda, 0x1fdb, 400, /* Ὶ-Ί ὶ-ί */
- 0x1fe8, 0x1fe9, 492, /* Ῠ-Ῡ ῠ-ῡ */
- 0x1fea, 0x1feb, 388, /* Ὺ-Ύ ὺ-ύ */
- 0x1ff8, 0x1ff9, 372, /* Ὸ-Ό ὸ-ό */
- 0x1ffa, 0x1ffb, 374, /* Ὼ-Ώ ὼ-ώ */
- 0x2160, 0x216f, 516, /* Ⅰ-Ⅿ ⅰ-ⅿ */
- 0x24b6, 0x24cf, 526, /* Ⓐ-Ⓩ ⓐ-ⓩ */
- 0xff21, 0xff3a, 532, /* A-Z a-z */
-};
-
-/*
- * upper case singlets
- * 2nd col is conversion excess 500
- */
-static
-Rune _tolower1[] =
-{
- 0x0100, 501, /* Ā ā */
- 0x0102, 501, /* Ă ă */
- 0x0104, 501, /* Ą ą */
- 0x0106, 501, /* Ć ć */
- 0x0108, 501, /* Ĉ ĉ */
- 0x010a, 501, /* Ċ ċ */
- 0x010c, 501, /* Č č */
- 0x010e, 501, /* Ď ď */
- 0x0110, 501, /* Đ đ */
- 0x0112, 501, /* Ē ē */
- 0x0114, 501, /* Ĕ ĕ */
- 0x0116, 501, /* Ė ė */
- 0x0118, 501, /* Ę ę */
- 0x011a, 501, /* Ě ě */
- 0x011c, 501, /* Ĝ ĝ */
- 0x011e, 501, /* Ğ ğ */
- 0x0120, 501, /* Ġ ġ */
- 0x0122, 501, /* Ģ ģ */
- 0x0124, 501, /* Ĥ ĥ */
- 0x0126, 501, /* Ħ ħ */
- 0x0128, 501, /* Ĩ ĩ */
- 0x012a, 501, /* Ī ī */
- 0x012c, 501, /* Ĭ ĭ */
- 0x012e, 501, /* Į į */
- 0x0130, 301, /* İ i */
- 0x0132, 501, /* IJ ij */
- 0x0134, 501, /* Ĵ ĵ */
- 0x0136, 501, /* Ķ ķ */
- 0x0139, 501, /* Ĺ ĺ */
- 0x013b, 501, /* Ļ ļ */
- 0x013d, 501, /* Ľ ľ */
- 0x013f, 501, /* Ŀ ŀ */
- 0x0141, 501, /* Ł ł */
- 0x0143, 501, /* Ń ń */
- 0x0145, 501, /* Ņ ņ */
- 0x0147, 501, /* Ň ň */
- 0x014a, 501, /* Ŋ ŋ */
- 0x014c, 501, /* Ō ō */
- 0x014e, 501, /* Ŏ ŏ */
- 0x0150, 501, /* Ő ő */
- 0x0152, 501, /* Œ œ */
- 0x0154, 501, /* Ŕ ŕ */
- 0x0156, 501, /* Ŗ ŗ */
- 0x0158, 501, /* Ř ř */
- 0x015a, 501, /* Ś ś */
- 0x015c, 501, /* Ŝ ŝ */
- 0x015e, 501, /* Ş ş */
- 0x0160, 501, /* Š š */
- 0x0162, 501, /* Ţ ţ */
- 0x0164, 501, /* Ť ť */
- 0x0166, 501, /* Ŧ ŧ */
- 0x0168, 501, /* Ũ ũ */
- 0x016a, 501, /* Ū ū */
- 0x016c, 501, /* Ŭ ŭ */
- 0x016e, 501, /* Ů ů */
- 0x0170, 501, /* Ű ű */
- 0x0172, 501, /* Ų ų */
- 0x0174, 501, /* Ŵ ŵ */
- 0x0176, 501, /* Ŷ ŷ */
- 0x0178, 379, /* Ÿ ÿ */
- 0x0179, 501, /* Ź ź */
- 0x017b, 501, /* Ż ż */
- 0x017d, 501, /* Ž ž */
- 0x0181, 710, /* Ɓ ɓ */
- 0x0182, 501, /* Ƃ ƃ */
- 0x0184, 501, /* Ƅ ƅ */
- 0x0186, 706, /* Ɔ ɔ */
- 0x0187, 501, /* Ƈ ƈ */
- 0x018b, 501, /* Ƌ ƌ */
- 0x0190, 703, /* Ɛ ɛ */
- 0x0191, 501, /* Ƒ ƒ */
- 0x0193, 705, /* Ɠ ɠ */
- 0x0194, 707, /* Ɣ ɣ */
- 0x0196, 711, /* Ɩ ɩ */
- 0x0197, 709, /* Ɨ ɨ */
- 0x0198, 501, /* Ƙ ƙ */
- 0x019c, 711, /* Ɯ ɯ */
- 0x019d, 713, /* Ɲ ɲ */
- 0x01a0, 501, /* Ơ ơ */
- 0x01a2, 501, /* Ƣ ƣ */
- 0x01a4, 501, /* Ƥ ƥ */
- 0x01a7, 501, /* Ƨ ƨ */
- 0x01a9, 718, /* Ʃ ʃ */
- 0x01ac, 501, /* Ƭ ƭ */
- 0x01ae, 718, /* Ʈ ʈ */
- 0x01af, 501, /* Ư ư */
- 0x01b3, 501, /* Ƴ ƴ */
- 0x01b5, 501, /* Ƶ ƶ */
- 0x01b7, 719, /* Ʒ ʒ */
- 0x01b8, 501, /* Ƹ ƹ */
- 0x01bc, 501, /* Ƽ ƽ */
- 0x01c4, 502, /* DŽ dž */
- 0x01c5, 501, /* Dž dž */
- 0x01c7, 502, /* LJ lj */
- 0x01c8, 501, /* Lj lj */
- 0x01ca, 502, /* NJ nj */
- 0x01cb, 501, /* Nj nj */
- 0x01cd, 501, /* Ǎ ǎ */
- 0x01cf, 501, /* Ǐ ǐ */
- 0x01d1, 501, /* Ǒ ǒ */
- 0x01d3, 501, /* Ǔ ǔ */
- 0x01d5, 501, /* Ǖ ǖ */
- 0x01d7, 501, /* Ǘ ǘ */
- 0x01d9, 501, /* Ǚ ǚ */
- 0x01db, 501, /* Ǜ ǜ */
- 0x01de, 501, /* Ǟ ǟ */
- 0x01e0, 501, /* Ǡ ǡ */
- 0x01e2, 501, /* Ǣ ǣ */
- 0x01e4, 501, /* Ǥ ǥ */
- 0x01e6, 501, /* Ǧ ǧ */
- 0x01e8, 501, /* Ǩ ǩ */
- 0x01ea, 501, /* Ǫ ǫ */
- 0x01ec, 501, /* Ǭ ǭ */
- 0x01ee, 501, /* Ǯ ǯ */
- 0x01f1, 502, /* DZ dz */
- 0x01f2, 501, /* Dz dz */
- 0x01f4, 501, /* Ǵ ǵ */
- 0x01fa, 501, /* Ǻ ǻ */
- 0x01fc, 501, /* Ǽ ǽ */
- 0x01fe, 501, /* Ǿ ǿ */
- 0x0200, 501, /* Ȁ ȁ */
- 0x0202, 501, /* Ȃ ȃ */
- 0x0204, 501, /* Ȅ ȅ */
- 0x0206, 501, /* Ȇ ȇ */
- 0x0208, 501, /* Ȉ ȉ */
- 0x020a, 501, /* Ȋ ȋ */
- 0x020c, 501, /* Ȍ ȍ */
- 0x020e, 501, /* Ȏ ȏ */
- 0x0210, 501, /* Ȑ ȑ */
- 0x0212, 501, /* Ȓ ȓ */
- 0x0214, 501, /* Ȕ ȕ */
- 0x0216, 501, /* Ȗ ȗ */
- 0x0386, 538, /* Ά ά */
- 0x038c, 564, /* Ό ό */
- 0x03e2, 501, /* Ϣ ϣ */
- 0x03e4, 501, /* Ϥ ϥ */
- 0x03e6, 501, /* Ϧ ϧ */
- 0x03e8, 501, /* Ϩ ϩ */
- 0x03ea, 501, /* Ϫ ϫ */
- 0x03ec, 501, /* Ϭ ϭ */
- 0x03ee, 501, /* Ϯ ϯ */
- 0x0460, 501, /* Ѡ ѡ */
- 0x0462, 501, /* Ѣ ѣ */
- 0x0464, 501, /* Ѥ ѥ */
- 0x0466, 501, /* Ѧ ѧ */
- 0x0468, 501, /* Ѩ ѩ */
- 0x046a, 501, /* Ѫ ѫ */
- 0x046c, 501, /* Ѭ ѭ */
- 0x046e, 501, /* Ѯ ѯ */
- 0x0470, 501, /* Ѱ ѱ */
- 0x0472, 501, /* Ѳ ѳ */
- 0x0474, 501, /* Ѵ ѵ */
- 0x0476, 501, /* Ѷ ѷ */
- 0x0478, 501, /* Ѹ ѹ */
- 0x047a, 501, /* Ѻ ѻ */
- 0x047c, 501, /* Ѽ ѽ */
- 0x047e, 501, /* Ѿ ѿ */
- 0x0480, 501, /* Ҁ ҁ */
- 0x0490, 501, /* Ґ ґ */
- 0x0492, 501, /* Ғ ғ */
- 0x0494, 501, /* Ҕ ҕ */
- 0x0496, 501, /* Җ җ */
- 0x0498, 501, /* Ҙ ҙ */
- 0x049a, 501, /* Қ қ */
- 0x049c, 501, /* Ҝ ҝ */
- 0x049e, 501, /* Ҟ ҟ */
- 0x04a0, 501, /* Ҡ ҡ */
- 0x04a2, 501, /* Ң ң */
- 0x04a4, 501, /* Ҥ ҥ */
- 0x04a6, 501, /* Ҧ ҧ */
- 0x04a8, 501, /* Ҩ ҩ */
- 0x04aa, 501, /* Ҫ ҫ */
- 0x04ac, 501, /* Ҭ ҭ */
- 0x04ae, 501, /* Ү ү */
- 0x04b0, 501, /* Ұ ұ */
- 0x04b2, 501, /* Ҳ ҳ */
- 0x04b4, 501, /* Ҵ ҵ */
- 0x04b6, 501, /* Ҷ ҷ */
- 0x04b8, 501, /* Ҹ ҹ */
- 0x04ba, 501, /* Һ һ */
- 0x04bc, 501, /* Ҽ ҽ */
- 0x04be, 501, /* Ҿ ҿ */
- 0x04c1, 501, /* Ӂ ӂ */
- 0x04c3, 501, /* Ӄ ӄ */
- 0x04c7, 501, /* Ӈ ӈ */
- 0x04cb, 501, /* Ӌ ӌ */
- 0x04d0, 501, /* Ӑ ӑ */
- 0x04d2, 501, /* Ӓ ӓ */
- 0x04d4, 501, /* Ӕ ӕ */
- 0x04d6, 501, /* Ӗ ӗ */
- 0x04d8, 501, /* Ә ә */
- 0x04da, 501, /* Ӛ ӛ */
- 0x04dc, 501, /* Ӝ ӝ */
- 0x04de, 501, /* Ӟ ӟ */
- 0x04e0, 501, /* Ӡ ӡ */
- 0x04e2, 501, /* Ӣ ӣ */
- 0x04e4, 501, /* Ӥ ӥ */
- 0x04e6, 501, /* Ӧ ӧ */
- 0x04e8, 501, /* Ө ө */
- 0x04ea, 501, /* Ӫ ӫ */
- 0x04ee, 501, /* Ӯ ӯ */
- 0x04f0, 501, /* Ӱ ӱ */
- 0x04f2, 501, /* Ӳ ӳ */
- 0x04f4, 501, /* Ӵ ӵ */
- 0x04f8, 501, /* Ӹ ӹ */
- 0x1e00, 501, /* Ḁ ḁ */
- 0x1e02, 501, /* Ḃ ḃ */
- 0x1e04, 501, /* Ḅ ḅ */
- 0x1e06, 501, /* Ḇ ḇ */
- 0x1e08, 501, /* Ḉ ḉ */
- 0x1e0a, 501, /* Ḋ ḋ */
- 0x1e0c, 501, /* Ḍ ḍ */
- 0x1e0e, 501, /* Ḏ ḏ */
- 0x1e10, 501, /* Ḑ ḑ */
- 0x1e12, 501, /* Ḓ ḓ */
- 0x1e14, 501, /* Ḕ ḕ */
- 0x1e16, 501, /* Ḗ ḗ */
- 0x1e18, 501, /* Ḙ ḙ */
- 0x1e1a, 501, /* Ḛ ḛ */
- 0x1e1c, 501, /* Ḝ ḝ */
- 0x1e1e, 501, /* Ḟ ḟ */
- 0x1e20, 501, /* Ḡ ḡ */
- 0x1e22, 501, /* Ḣ ḣ */
- 0x1e24, 501, /* Ḥ ḥ */
- 0x1e26, 501, /* Ḧ ḧ */
- 0x1e28, 501, /* Ḩ ḩ */
- 0x1e2a, 501, /* Ḫ ḫ */
- 0x1e2c, 501, /* Ḭ ḭ */
- 0x1e2e, 501, /* Ḯ ḯ */
- 0x1e30, 501, /* Ḱ ḱ */
- 0x1e32, 501, /* Ḳ ḳ */
- 0x1e34, 501, /* Ḵ ḵ */
- 0x1e36, 501, /* Ḷ ḷ */
- 0x1e38, 501, /* Ḹ ḹ */
- 0x1e3a, 501, /* Ḻ ḻ */
- 0x1e3c, 501, /* Ḽ ḽ */
- 0x1e3e, 501, /* Ḿ ḿ */
- 0x1e40, 501, /* Ṁ ṁ */
- 0x1e42, 501, /* Ṃ ṃ */
- 0x1e44, 501, /* Ṅ ṅ */
- 0x1e46, 501, /* Ṇ ṇ */
- 0x1e48, 501, /* Ṉ ṉ */
- 0x1e4a, 501, /* Ṋ ṋ */
- 0x1e4c, 501, /* Ṍ ṍ */
- 0x1e4e, 501, /* Ṏ ṏ */
- 0x1e50, 501, /* Ṑ ṑ */
- 0x1e52, 501, /* Ṓ ṓ */
- 0x1e54, 501, /* Ṕ ṕ */
- 0x1e56, 501, /* Ṗ ṗ */
- 0x1e58, 501, /* Ṙ ṙ */
- 0x1e5a, 501, /* Ṛ ṛ */
- 0x1e5c, 501, /* Ṝ ṝ */
- 0x1e5e, 501, /* Ṟ ṟ */
- 0x1e60, 501, /* Ṡ ṡ */
- 0x1e62, 501, /* Ṣ ṣ */
- 0x1e64, 501, /* Ṥ ṥ */
- 0x1e66, 501, /* Ṧ ṧ */
- 0x1e68, 501, /* Ṩ ṩ */
- 0x1e6a, 501, /* Ṫ ṫ */
- 0x1e6c, 501, /* Ṭ ṭ */
- 0x1e6e, 501, /* Ṯ ṯ */
- 0x1e70, 501, /* Ṱ ṱ */
- 0x1e72, 501, /* Ṳ ṳ */
- 0x1e74, 501, /* Ṵ ṵ */
- 0x1e76, 501, /* Ṷ ṷ */
- 0x1e78, 501, /* Ṹ ṹ */
- 0x1e7a, 501, /* Ṻ ṻ */
- 0x1e7c, 501, /* Ṽ ṽ */
- 0x1e7e, 501, /* Ṿ ṿ */
- 0x1e80, 501, /* Ẁ ẁ */
- 0x1e82, 501, /* Ẃ ẃ */
- 0x1e84, 501, /* Ẅ ẅ */
- 0x1e86, 501, /* Ẇ ẇ */
- 0x1e88, 501, /* Ẉ ẉ */
- 0x1e8a, 501, /* Ẋ ẋ */
- 0x1e8c, 501, /* Ẍ ẍ */
- 0x1e8e, 501, /* Ẏ ẏ */
- 0x1e90, 501, /* Ẑ ẑ */
- 0x1e92, 501, /* Ẓ ẓ */
- 0x1e94, 501, /* Ẕ ẕ */
- 0x1ea0, 501, /* Ạ ạ */
- 0x1ea2, 501, /* Ả ả */
- 0x1ea4, 501, /* Ấ ấ */
- 0x1ea6, 501, /* Ầ ầ */
- 0x1ea8, 501, /* Ẩ ẩ */
- 0x1eaa, 501, /* Ẫ ẫ */
- 0x1eac, 501, /* Ậ ậ */
- 0x1eae, 501, /* Ắ ắ */
- 0x1eb0, 501, /* Ằ ằ */
- 0x1eb2, 501, /* Ẳ ẳ */
- 0x1eb4, 501, /* Ẵ ẵ */
- 0x1eb6, 501, /* Ặ ặ */
- 0x1eb8, 501, /* Ẹ ẹ */
- 0x1eba, 501, /* Ẻ ẻ */
- 0x1ebc, 501, /* Ẽ ẽ */
- 0x1ebe, 501, /* Ế ế */
- 0x1ec0, 501, /* Ề ề */
- 0x1ec2, 501, /* Ể ể */
- 0x1ec4, 501, /* Ễ ễ */
- 0x1ec6, 501, /* Ệ ệ */
- 0x1ec8, 501, /* Ỉ ỉ */
- 0x1eca, 501, /* Ị ị */
- 0x1ecc, 501, /* Ọ ọ */
- 0x1ece, 501, /* Ỏ ỏ */
- 0x1ed0, 501, /* Ố ố */
- 0x1ed2, 501, /* Ồ ồ */
- 0x1ed4, 501, /* Ổ ổ */
- 0x1ed6, 501, /* Ỗ ỗ */
- 0x1ed8, 501, /* Ộ ộ */
- 0x1eda, 501, /* Ớ ớ */
- 0x1edc, 501, /* Ờ ờ */
- 0x1ede, 501, /* Ở ở */
- 0x1ee0, 501, /* Ỡ ỡ */
- 0x1ee2, 501, /* Ợ ợ */
- 0x1ee4, 501, /* Ụ ụ */
- 0x1ee6, 501, /* Ủ ủ */
- 0x1ee8, 501, /* Ứ ứ */
- 0x1eea, 501, /* Ừ ừ */
- 0x1eec, 501, /* Ử ử */
- 0x1eee, 501, /* Ữ ữ */
- 0x1ef0, 501, /* Ự ự */
- 0x1ef2, 501, /* Ỳ ỳ */
- 0x1ef4, 501, /* Ỵ ỵ */
- 0x1ef6, 501, /* Ỷ ỷ */
- 0x1ef8, 501, /* Ỹ ỹ */
- 0x1f59, 492, /* Ὑ ὑ */
- 0x1f5b, 492, /* Ὓ ὓ */
- 0x1f5d, 492, /* Ὕ ὕ */
- 0x1f5f, 492, /* Ὗ ὗ */
- 0x1fbc, 491, /* ᾼ ᾳ */
- 0x1fcc, 491, /* ῌ ῃ */
- 0x1fec, 493, /* Ῥ ῥ */
- 0x1ffc, 491, /* ῼ ῳ */
-};
-
-/*
- * title characters are those between
- * upper and lower case.  ie DZ Dz dz
- */
-static
-Rune _totitle1[] =
-{
- 0x01c4, 501, /* DŽ Dž */
- 0x01c6, 499, /* dž Dž */
- 0x01c7, 501, /* LJ Lj */
- 0x01c9, 499, /* lj Lj */
- 0x01ca, 501, /* NJ Nj */
- 0x01cc, 499, /* nj Nj */
- 0x01f1, 501, /* DZ Dz */
- 0x01f3, 499, /* dz Dz */
-};
-
-static
-Rune*
-bsearch(Rune c, Rune *t, int n, int ne)
-{
- Rune *p;
- int m;
-
- while(n > 1) {
- m = n/2;
- p = t + m*ne;
- if(c >= p[0]) {
- t = p;
- n = n-m;
- } else
- n = m;
- }
- if(n && c >= t[0])
- return t;
- return 0;
-}
-
-Rune
-tolowerrune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _tolower2, nelem(_tolower2)/3, 3);
- if(p && c >= p[0] && c <= p[1])
- return c + p[2] - 500;
- p = bsearch(c, _tolower1, nelem(_tolower1)/2, 2);
- if(p && c == p[0])
- return c + p[1] - 500;
- return c;
-}
-
-Rune
-toupperrune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _toupper2, nelem(_toupper2)/3, 3);
- if(p && c >= p[0] && c <= p[1])
- return c + p[2] - 500;
- p = bsearch(c, _toupper1, nelem(_toupper1)/2, 2);
- if(p && c == p[0])
- return c + p[1] - 500;
- return c;
-}
-
-Rune
-totitlerune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _totitle1, nelem(_totitle1)/2, 2);
- if(p && c == p[0])
- return c + p[1] - 500;
- return c;
-}
-
-int
-islowerrune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _toupper2, nelem(_toupper2)/3, 3);
- if(p && c >= p[0] && c <= p[1])
- return 1;
- p = bsearch(c, _toupper1, nelem(_toupper1)/2, 2);
- if(p && c == p[0])
- return 1;
- return 0;
-}
-
-int
-isupperrune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _tolower2, nelem(_tolower2)/3, 3);
- if(p && c >= p[0] && c <= p[1])
- return 1;
- p = bsearch(c, _tolower1, nelem(_tolower1)/2, 2);
- if(p && c == p[0])
- return 1;
- return 0;
-}
-
-int
-isalpharune(Rune c)
-{
- Rune *p;
-
- if(isupperrune(c) || islowerrune(c))
- return 1;
- p = bsearch(c, _alpha2, nelem(_alpha2)/2, 2);
- if(p && c >= p[0] && c <= p[1])
- return 1;
- p = bsearch(c, _alpha1, nelem(_alpha1), 1);
- if(p && c == p[0])
- return 1;
- return 0;
-}
-
-int
-istitlerune(Rune c)
-{
- return isupperrune(c) && islowerrune(c);
-}
-
-int
-isspacerune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, _space2, nelem(_space2)/2, 2);
- if(p && c >= p[0] && c <= p[1])
- return 1;
- return 0;
-}
-
-int
-isdigitrune(Rune c)
-{
- Rune *p;
-
- p = bsearch(c, __isdigitr, nelem(__isdigitr)/2, 2);
- if(p && c >= p[0] && c <= p[1])
- return 1;
- return 0;
-}
--- a//sys/src/libc/test/mkfile
+++ b//sys/src/libc/test/mkfile
@@ -3,6 +3,8 @@
 TEST=\
	date\
	pow\
+ runebreak\
+ runenorm\
	strchr\

 </sys/src/cmd/mktest
--- /dev/null
+++ b//sys/src/libc/test/runebreak.c
@@ -1,0 +1,93 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+static int
+estrtoul(char *s)
+{
+ char *epr;
+ Rune code;
+
+ code = strtoul(s, &epr, 16);
+ if(s == epr)
+ sysfatal("bad code point hex string");
+ return code;
+}
+
+void
+run(char *file, Rune* (*fn)(Rune*))
+{
+ Biobuf *b;
+ char *p, *dot;
+ char *pieces[16];
+ int i, j, n;
+ Rune stack[16], ops[16];
+ int nstack, nops;
+ Rune r, *rp, *rp2;
+ char *line;
+
+ b = Bopen(file, OREAD);
+ if(b == nil)
+ sysfatal("could not load composition exclusions: %r");
+
+ for(;(p = Brdline(b, '\n')) != nil; free(line)){
+ p[Blinelen(b)-1] = 0;
+ line = strdup(p);
+ if(p[0] == 0 || p[0] == '#')
+ continue;
+ if((dot = strstr(p, "#")) != nil)
+ *dot = 0;
+ n = getfields(p, pieces, nelem(pieces), 0, " ");
+ nstack = nops = 0;
+ for(i = 0; i < n; i++){
+ chartorune(&r, pieces[i]);
+ if(r != L'÷' && r != L'×'){
+ r = estrtoul(pieces[i]);
+ stack[nstack++] = r;
+ stack[nstack] = 0;
+ } else {
+ ops[nops++] = r;
+ ops[nops] = 0;
+ }
+ }
+
+ rp = stack;
+ for(i = 1; i < nops-1;){
+ rp2 = fn(rp);
+ switch(ops[i]){
+ case L'÷':
+ if(rp2 != rp+1){
+ print("break fail %X %X || %s\n", rp[0], rp[1], line);
+ goto Break;
+ }
+ rp++;
+ i++;
+ break;
+ case L'×':
+ if(rp2 - rp == 0){
+ for(j = i; j < nops - 1; j++)
+ if(ops[j] != L'×')
+ print("skipped %d %d %s\n", i, nops, line);
+ goto Break;
+ }
+ for(; rp < (rp2-1); rp++, i++){
+ if(ops[i] != L'×')
+ print("skipped %d %d %s\n", i, nops, line);
+ }
+ rp = rp2;
+ i++;
+ break;
+ }
+ }
+Break:
+ ;
+ }
+}
+
+void
+main(int, char)
+{
+ run("/lib/ucd/GraphemeBreakTest.txt", runegbreak);
+ run("/lib/ucd/WordBreakTest.txt", runewbreak);
+ exits(nil);
+}
--- /dev/null
+++ b//sys/src/libc/test/runenorm.c
@@ -1,0 +1,92 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+static int
+estrtoul(char *s)
+{
+ char *epr;
+ Rune code;
+
+ code = strtoul(s, &epr, 16);
+ if(s == epr)
+ sysfatal("bad code point hex string");
+ return code;
+}
+
+void
+main(int, char)
+{
+ Rune buffer1[64];
+ Rune buffer2[64];
+ char utfbuff1[128];
+ char utfbuff2[128];
+ char srctmp[128], tmp1[128], tmp2[128];
+ char *fields[10];
+ char *runes[32];
+ char *p;
+ int n, n2;
+ int i;
+ uint fail;
+ Biobuf *b;
+
+ b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD);
+ if(b == nil)
+ sysfatal("could not load composition exclusions: %r");
+
+ struct {
+ Rune src[32];
+ Rune nfc[32];
+ Rune nfd[32];
+ } test;
+ while((p = Brdline(b, '\n')) != nil){
+ p[Blinelen(b)-1] = 0;
+ if(p[0] == 0 || p[0] == '#' || p[0] == '@')
+ continue;
+ getfields(p, fields, 6 + 1, 0, ";");
+ n = getfields(fields[0], runes, nelem(runes), 0, " ");
+ for(i = 0; i < n; i++)
+ test.src[i] = estrtoul(runes[i]);
+ test.src[i] = 0;
+
+ n = getfields(fields[1], runes, nelem(runes), 0, " ");
+ for(i = 0; i < n; i++)
+ test.nfc[i] = estrtoul(runes[i]);
+ test.nfc[i] = 0;
+
+ n = getfields(fields[2], runes, nelem(runes), 0, " ");
+ for(i = 0; i < n; i++)
+ test.nfd[i] = estrtoul(runes[i]);
+ test.nfd[i] = 0;
+
+ n = runenorm(buffer1, test.src, nelem(buffer1), 1);
+ n2 = runenorm(buffer2, test.src, nelem(buffer2), 0);
+ fail = 0;
+
+ if(runestrcmp(buffer1, test.nfc) != 0)
+ fail |= 1<<0;
+ if(runestrcmp(buffer2, test.nfd) != 0)
+ fail |= 1<<1;
+ if(fail)
+ print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2,
buffer1);
+ assert(n == runestrlen(test.nfc));
+ assert(n2 == runestrlen(test.nfd));
+
+ snprint(srctmp, sizeof tmp1, "%S", test.src);
+ snprint(tmp1, sizeof tmp1, "%S", test.nfc);
+ snprint(tmp2, sizeof tmp2, "%S", test.nfd);
+
+ n = utfnorm(utfbuff1, srctmp, nelem(utfbuff1), 1);
+ n2 = utfnorm(utfbuff2, srctmp, nelem(utfbuff2), 0);
+
+ if(strcmp(utfbuff1, tmp1) != 0)
+ fail |= 1<<2;
+ if(strcmp(utfbuff2, tmp2) != 0)
+ fail |= 1<<3;
+ if(fail)
+ print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2,
utfbuff1);
+ assert(n == strlen(tmp1));
+ assert(n2 == strlen(tmp2));
+ }
+ exits(nil);
+}


next