Commit fc77e826 authored by Russ Cox's avatar Russ Cox

unicode: add case folding tables

R=r, r
CC=golang-dev
https://golang.org/cl/4571074
parent 6e9b1a78
...@@ -275,3 +275,52 @@ func (special SpecialCase) ToLower(rune int) int { ...@@ -275,3 +275,52 @@ func (special SpecialCase) ToLower(rune int) int {
} }
return r return r
} }
// caseOrbit is defined in tables.go as []foldPair. Right now all the
// entries fit in uint16, so use uint16. If that changes, compilation
// will fail (the constants in the composite literal will not fit in uint16)
// and the types here can change to uint32.
type foldPair struct {
From uint16
To uint16
}
// SimpleFold iterates over Unicode code points equivalent under
// the Unicode-defined simple case folding. Among the code points
// equivalent to rune (including rune itself), SimpleFold returns the
// smallest r >= rune if one exists, or else the smallest r >= 0.
//
// For example:
// SimpleFold('A') = 'a'
// SimpleFold('a') = 'A'
//
// SimpleFold('K') = 'k'
// SimpleFold('k') = '\u212A' (Kelvin symbol, K)
// SimpleFold('\u212A') = 'K'
//
// SimpleFold('1') = '1'
//
func SimpleFold(rune int) int {
// Consult caseOrbit table for special cases.
lo := 0
hi := len(caseOrbit)
for lo < hi {
m := lo + (hi-lo)/2
if int(caseOrbit[m].From) < rune {
lo = m + 1
} else {
hi = m
}
}
if lo < len(caseOrbit) && int(caseOrbit[lo].From) == rune {
return int(caseOrbit[lo].To)
}
// No folding specified. This is a one- or two-element
// equivalence class containing rune and ToLower(rune)
// and ToUpper(rune) if they are different from rune.
if l := ToLower(rune); l != rune {
return l
}
return ToUpper(rune)
}
...@@ -376,3 +376,49 @@ func TestTurkishCase(t *testing.T) { ...@@ -376,3 +376,49 @@ func TestTurkishCase(t *testing.T) {
} }
} }
} }
var simpleFoldTests = []string{
// SimpleFold could order its returned slices in any order it wants,
// but we know it orders them in increasing order starting at in
// and looping around from MaxRune to 0.
// Easy cases.
"Aa",
"aA",
"δΔ",
"Δδ",
// ASCII special cases.
"KkK",
"kKK",
"KKk",
"Ssſ",
"sſS",
"ſSs",
// Non-ASCII special cases.
"ρϱΡ",
"ϱΡρ",
"Ρρϱ",
"ͅΙιι",
"Ιιιͅ",
"ιιͅΙ",
"ιͅΙι",
// Extra special cases: has lower/upper but no case fold.
"İ",
"ı",
}
func TestSimpleFold(t *testing.T) {
for _, tt := range simpleFoldTests {
cycle := []int(tt)
rune := cycle[len(cycle)-1]
for _, out := range cycle {
if r := SimpleFold(rune); r != out {
t.Errorf("SimpleFold(%#U) = %#U, want %#U", rune, r, out)
}
rune = out
}
}
}
...@@ -24,15 +24,18 @@ import ( ...@@ -24,15 +24,18 @@ import (
func main() { func main() {
flag.Parse() flag.Parse()
loadChars() // always needed loadChars() // always needed
loadCasefold()
printCategories() printCategories()
printScriptOrProperty(false) printScriptOrProperty(false)
printScriptOrProperty(true) printScriptOrProperty(true)
printCases() printCases()
printLatinProperties() printLatinProperties()
printCasefold()
printSizes() printSizes()
} }
var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt") var dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")
var casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")
var url = flag.String("url", var url = flag.String("url",
"http://www.unicode.org/Public/6.0.0/ucd/", "http://www.unicode.org/Public/6.0.0/ucd/",
"URL of Unicode database directory") "URL of Unicode database directory")
...@@ -119,6 +122,8 @@ type Char struct { ...@@ -119,6 +122,8 @@ type Char struct {
upperCase int upperCase int
lowerCase int lowerCase int
titleCase int titleCase int
foldCase int // simple case folding
caseOrbit int // next in simple case folding orbit
} }
// Scripts.txt has form: // Scripts.txt has form:
...@@ -308,8 +313,53 @@ func loadChars() { ...@@ -308,8 +313,53 @@ func loadChars() {
resp.Body.Close() resp.Body.Close()
} }
func loadCasefold() {
if *casefoldingURL == "" {
flag.Set("casefolding", *url+"CaseFolding.txt")
}
resp, err := http.Get(*casefoldingURL)
if err != nil {
logger.Fatal(err)
}
if resp.StatusCode != 200 {
logger.Fatal("bad GET status for CaseFolding.txt", resp.Status)
}
input := bufio.NewReader(resp.Body)
for {
line, err := input.ReadString('\n')
if err != nil {
if err == os.EOF {
break
}
logger.Fatal(err)
}
if line[0] == '#' {
continue
}
field := strings.Split(line, "; ", -1)
if len(field) != 4 {
logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4)
}
kind := field[1]
if kind != "C" && kind != "S" {
// Only care about 'common' and 'simple' foldings.
continue
}
p1, err := strconv.Btoui64(field[0], 16)
if err != nil {
logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
}
p2, err := strconv.Btoui64(field[2], 16)
if err != nil {
logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
}
chars[p1].foldCase = int(p2)
}
resp.Body.Close()
}
const progHeader = `// Generated by running const progHeader = `// Generated by running
// maketables --tables=%s --data=%s // maketables --tables=%s --data=%s --casefolding=%s
// DO NOT EDIT // DO NOT EDIT
package unicode package unicode
...@@ -330,7 +380,7 @@ func printCategories() { ...@@ -330,7 +380,7 @@ func printCategories() {
fullCategoryTest(list) fullCategoryTest(list)
return return
} }
fmt.Printf(progHeader, *tablelist, *dataURL) fmt.Printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
fmt.Println("// Version is the Unicode edition from which the tables are derived.") fmt.Println("// Version is the Unicode edition from which the tables are derived.")
fmt.Printf("const Version = %q\n\n", version()) fmt.Printf("const Version = %q\n\n", version())
...@@ -837,13 +887,13 @@ func printCases() { ...@@ -837,13 +887,13 @@ func printCases() {
} }
fmt.Printf( fmt.Printf(
"// Generated by running\n"+ "// Generated by running\n"+
"// maketables --data=%s\n"+ "// maketables --data=%s --casefolding=%s\n"+
"// DO NOT EDIT\n\n"+ "// DO NOT EDIT\n\n"+
"// CaseRanges is the table describing case mappings for all letters with\n"+ "// CaseRanges is the table describing case mappings for all letters with\n"+
"// non-self mappings.\n"+ "// non-self mappings.\n"+
"var CaseRanges = _CaseRanges\n"+ "var CaseRanges = _CaseRanges\n"+
"var _CaseRanges = []CaseRange {\n", "var _CaseRanges = []CaseRange {\n",
*dataURL) *dataURL, *casefoldingURL)
var startState *caseState // the start of a run; nil for not active var startState *caseState // the start of a run; nil for not active
var prevState = &caseState{} // the state of the previous character var prevState = &caseState{} // the state of the previous character
...@@ -946,13 +996,246 @@ func printLatinProperties() { ...@@ -946,13 +996,246 @@ func printLatinProperties() {
if code == ' ' { if code == ' ' {
property = "pZ | pp" property = "pZ | pp"
} }
fmt.Printf("\t0x%.2X: %s, // %q\n", code, property, code) fmt.Printf("\t0x%02X: %s, // %q\n", code, property, code)
}
fmt.Printf("}\n\n")
}
func printCasefold() {
// Build list of case-folding groups attached to each canonical folded char (typically lower case).
var caseOrbit = make([][]int, MaxChar+1)
for i := range chars {
c := &chars[i]
if c.foldCase == 0 {
continue
}
orb := caseOrbit[c.foldCase]
if orb == nil {
orb = append(orb, c.foldCase)
}
caseOrbit[c.foldCase] = append(orb, i)
}
// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
for i := range chars {
c := &chars[i]
f := c.foldCase
if f == 0 {
f = i
}
orb := caseOrbit[f]
if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
// Default assumption of [upper, lower] is wrong.
caseOrbit[i] = []int{i}
}
}
// Delete the groups for which assuming [lower, upper] is right.
for i, orb := range caseOrbit {
if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
caseOrbit[i] = nil
}
}
// Record orbit information in chars.
for _, orb := range caseOrbit {
if orb == nil {
continue
}
sort.SortInts(orb)
c := orb[len(orb)-1]
for _, d := range orb {
chars[c].caseOrbit = d
c = d
}
}
printCaseOrbit()
// Tables of category and script folding exceptions: code points
// that must be added when interpreting a particular category/script
// in a case-folding context.
cat := make(map[string]map[int]bool)
for name := range category {
if x := foldExceptions(inCategory(name)); len(x) > 0 {
cat[name] = x
}
}
scr := make(map[string]map[int]bool)
for name := range scripts {
if x := foldExceptions(inScript(name)); len(x) > 0 {
cat[name] = x
}
}
printCatFold("FoldCategory", cat)
printCatFold("FoldScript", scr)
}
// inCategory returns a list of all the runes in the category.
func inCategory(name string) []int {
var x []int
for i := range chars {
c := &chars[i]
if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
x = append(x, i)
}
}
return x
}
// inScript returns a list of all the runes in the script.
func inScript(name string) []int {
var x []int
for _, s := range scripts[name] {
for c := s.lo; c <= s.hi; c++ {
x = append(x, int(c))
}
}
return x
}
// foldExceptions returns a list of all the runes fold-equivalent
// to runes in class but not in class themselves.
func foldExceptions(class []int) map[int]bool {
// Create map containing class and all fold-equivalent chars.
m := make(map[int]bool)
for _, r := range class {
c := &chars[r]
if c.caseOrbit == 0 {
// Just upper and lower.
if u := c.upperCase; u != 0 {
m[u] = true
}
if l := c.lowerCase; l != 0 {
m[l] = true
}
m[r] = true
continue
}
// Otherwise walk orbit.
r0 := r
for {
m[r] = true
r = chars[r].caseOrbit
if r == r0 {
break
}
}
}
// Remove class itself.
for _, r := range class {
m[r] = false, false
}
// What's left is the exceptions.
return m
}
var comment = map[string]string{
"FoldCategory": "// FoldCategory maps a category name to a table of\n" +
"// code points outside the category that are equivalent under\n" +
"// simple case folding to code points inside the category.\n" +
"// If there is no entry for a category name, there are no such points.\n",
"FoldScript": "// FoldScript maps a script name to a table of\n" +
"// code points outside the script that are equivalent under\n" +
"// simple case folding to code points inside the script.\n" +
"// If there is no entry for a script name, there are no such points.\n",
}
func printCaseOrbit() {
if *test {
for i := range chars {
c := &chars[i]
f := c.caseOrbit
if f == 0 {
if c.lowerCase != i && c.lowerCase != 0 {
f = c.lowerCase
} else if c.upperCase != i && c.upperCase != 0 {
f = c.upperCase
} else {
f = i
}
}
if g := unicode.SimpleFold(i); g != f {
fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
}
}
return
}
fmt.Printf("var caseOrbit = []foldPair{\n")
for i := range chars {
c := &chars[i]
if c.caseOrbit != 0 {
fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
foldPairCount++
}
}
fmt.Printf("}\n\n")
}
func printCatFold(name string, m map[string]map[int]bool) {
if *test {
var pkgMap map[string]*unicode.RangeTable
if name == "FoldCategory" {
pkgMap = unicode.FoldCategory
} else {
pkgMap = unicode.FoldScript
}
if len(pkgMap) != len(m) {
fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
return
}
for k, v := range m {
t, ok := pkgMap[k]
if !ok {
fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
continue
}
n := 0
for _, r := range t.R16 {
for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
if !v[c] {
fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
}
n++
}
}
for _, r := range t.R32 {
for c := int(r.Lo); c <= int(r.Hi); c += int(r.Stride) {
if !v[c] {
fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
}
n++
}
}
if n != len(v) {
fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
}
}
return
}
fmt.Print(comment[name])
fmt.Printf("var %s = map[string]*RangeTable{\n", name)
for name := range m {
fmt.Printf("\t%q: fold%s,\n", name, name)
}
fmt.Printf("}\n\n")
for name, class := range m {
dumpRange(
fmt.Sprintf("var fold%s = &RangeTable{\n", name),
func(code int) bool { return class[code] })
} }
fmt.Println("}")
} }
var range16Count = 0 // Number of entries in the 16-bit range tables. var range16Count = 0 // Number of entries in the 16-bit range tables.
var range32Count = 0 // Number of entries in the 32-bit range tables. var range32Count = 0 // Number of entries in the 32-bit range tables.
var foldPairCount = 0 // Number of fold pairs in the exception tables.
func printSizes() { func printSizes() {
if *test { if *test {
...@@ -963,4 +1246,6 @@ func printSizes() { ...@@ -963,4 +1246,6 @@ func printSizes() {
range16Bytes := range16Count * 3 * 2 range16Bytes := range16Count * 3 * 2
range32Bytes := range32Count * 3 * 4 range32Bytes := range32Count * 3 * 4
fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
fmt.Println()
fmt.Printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
} }
// Generated by running // Generated by running
// maketables --tables=all --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt // maketables --tables=all --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.0.0/ucd/CaseFolding.txt
// DO NOT EDIT // DO NOT EDIT
package unicode package unicode
...@@ -5150,7 +5150,7 @@ var ( ...@@ -5150,7 +5150,7 @@ var (
) )
// Generated by running // Generated by running
// maketables --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt // maketables --data=http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt --casefolding=http://www.unicode.org/Public/6.0.0/ucd/CaseFolding.txt
// DO NOT EDIT // DO NOT EDIT
// CaseRanges is the table describing case mappings for all letters with // CaseRanges is the table describing case mappings for all letters with
...@@ -5539,7 +5539,7 @@ var properties = [MaxLatin1 + 1]uint8{ ...@@ -5539,7 +5539,7 @@ var properties = [MaxLatin1 + 1]uint8{
0x7C: pS | pp, // '|' 0x7C: pS | pp, // '|'
0x7D: pP | pp, // '}' 0x7D: pP | pp, // '}'
0x7E: pS | pp, // '~' 0x7E: pS | pp, // '~'
0x7F: pC, // '\x7f' 0x7F: pC, // '\u007f'
0x80: pC, // '\u0080' 0x80: pC, // '\u0080'
0x81: pC, // '\u0081' 0x81: pC, // '\u0081'
0x82: pC, // '\u0082' 0x82: pC, // '\u0082'
...@@ -5573,102 +5573,534 @@ var properties = [MaxLatin1 + 1]uint8{ ...@@ -5573,102 +5573,534 @@ var properties = [MaxLatin1 + 1]uint8{
0x9E: pC, // '\u009e' 0x9E: pC, // '\u009e'
0x9F: pC, // '\u009f' 0x9F: pC, // '\u009f'
0xA0: pZ, // '\u00a0' 0xA0: pZ, // '\u00a0'
0xA1: pP | pp, // '\u00a1' 0xA1: pP | pp, // '¡'
0xA2: pS | pp, // '\u00a2' 0xA2: pS | pp, // '¢'
0xA3: pS | pp, // '\u00a3' 0xA3: pS | pp, // '£'
0xA4: pS | pp, // '\u00a4' 0xA4: pS | pp, // '¤'
0xA5: pS | pp, // '\u00a5' 0xA5: pS | pp, // '¥'
0xA6: pS | pp, // '\u00a6' 0xA6: pS | pp, // '¦'
0xA7: pS | pp, // '\u00a7' 0xA7: pS | pp, // '§'
0xA8: pS | pp, // '\u00a8' 0xA8: pS | pp, // '¨'
0xA9: pS | pp, // '\u00a9' 0xA9: pS | pp, // '©'
0xAA: pLl | pp, // '\u00aa' 0xAA: pLl | pp, // 'ª'
0xAB: pP | pp, // '\u00ab' 0xAB: pP | pp, // '«'
0xAC: pS | pp, // '\u00ac' 0xAC: pS | pp, // '¬'
0xAD: 0, // '\u00ad' 0xAD: 0, // '\u00ad'
0xAE: pS | pp, // '\u00ae' 0xAE: pS | pp, // '®'
0xAF: pS | pp, // '\u00af' 0xAF: pS | pp, // '¯'
0xB0: pS | pp, // '\u00b0' 0xB0: pS | pp, // '°'
0xB1: pS | pp, // '\u00b1' 0xB1: pS | pp, // '±'
0xB2: pN | pp, // '\u00b2' 0xB2: pN | pp, // '²'
0xB3: pN | pp, // '\u00b3' 0xB3: pN | pp, // '³'
0xB4: pS | pp, // '\u00b4' 0xB4: pS | pp, // '´'
0xB5: pLl | pp, // '\u00b5' 0xB5: pLl | pp, // 'µ'
0xB6: pS | pp, // '\u00b6' 0xB6: pS | pp, // '¶'
0xB7: pP | pp, // '\u00b7' 0xB7: pP | pp, // '·'
0xB8: pS | pp, // '\u00b8' 0xB8: pS | pp, // '¸'
0xB9: pN | pp, // '\u00b9' 0xB9: pN | pp, // '¹'
0xBA: pLl | pp, // '\u00ba' 0xBA: pLl | pp, // 'º'
0xBB: pP | pp, // '\u00bb' 0xBB: pP | pp, // '»'
0xBC: pN | pp, // '\u00bc' 0xBC: pN | pp, // '¼'
0xBD: pN | pp, // '\u00bd' 0xBD: pN | pp, // '½'
0xBE: pN | pp, // '\u00be' 0xBE: pN | pp, // '¾'
0xBF: pP | pp, // '\u00bf' 0xBF: pP | pp, // '¿'
0xC0: pLu | pp, // '\u00c0' 0xC0: pLu | pp, // 'À'
0xC1: pLu | pp, // '\u00c1' 0xC1: pLu | pp, // 'Á'
0xC2: pLu | pp, // '\u00c2' 0xC2: pLu | pp, // 'Â'
0xC3: pLu | pp, // '\u00c3' 0xC3: pLu | pp, // 'Ã'
0xC4: pLu | pp, // '\u00c4' 0xC4: pLu | pp, // 'Ä'
0xC5: pLu | pp, // '\u00c5' 0xC5: pLu | pp, // 'Å'
0xC6: pLu | pp, // '\u00c6' 0xC6: pLu | pp, // 'Æ'
0xC7: pLu | pp, // '\u00c7' 0xC7: pLu | pp, // 'Ç'
0xC8: pLu | pp, // '\u00c8' 0xC8: pLu | pp, // 'È'
0xC9: pLu | pp, // '\u00c9' 0xC9: pLu | pp, // 'É'
0xCA: pLu | pp, // '\u00ca' 0xCA: pLu | pp, // 'Ê'
0xCB: pLu | pp, // '\u00cb' 0xCB: pLu | pp, // 'Ë'
0xCC: pLu | pp, // '\u00cc' 0xCC: pLu | pp, // 'Ì'
0xCD: pLu | pp, // '\u00cd' 0xCD: pLu | pp, // 'Í'
0xCE: pLu | pp, // '\u00ce' 0xCE: pLu | pp, // 'Î'
0xCF: pLu | pp, // '\u00cf' 0xCF: pLu | pp, // 'Ï'
0xD0: pLu | pp, // '\u00d0' 0xD0: pLu | pp, // 'Ð'
0xD1: pLu | pp, // '\u00d1' 0xD1: pLu | pp, // 'Ñ'
0xD2: pLu | pp, // '\u00d2' 0xD2: pLu | pp, // 'Ò'
0xD3: pLu | pp, // '\u00d3' 0xD3: pLu | pp, // 'Ó'
0xD4: pLu | pp, // '\u00d4' 0xD4: pLu | pp, // 'Ô'
0xD5: pLu | pp, // '\u00d5' 0xD5: pLu | pp, // 'Õ'
0xD6: pLu | pp, // '\u00d6' 0xD6: pLu | pp, // 'Ö'
0xD7: pS | pp, // '\u00d7' 0xD7: pS | pp, // '×'
0xD8: pLu | pp, // '\u00d8' 0xD8: pLu | pp, // 'Ø'
0xD9: pLu | pp, // '\u00d9' 0xD9: pLu | pp, // 'Ù'
0xDA: pLu | pp, // '\u00da' 0xDA: pLu | pp, // 'Ú'
0xDB: pLu | pp, // '\u00db' 0xDB: pLu | pp, // 'Û'
0xDC: pLu | pp, // '\u00dc' 0xDC: pLu | pp, // 'Ü'
0xDD: pLu | pp, // '\u00dd' 0xDD: pLu | pp, // 'Ý'
0xDE: pLu | pp, // '\u00de' 0xDE: pLu | pp, // 'Þ'
0xDF: pLl | pp, // '\u00df' 0xDF: pLl | pp, // 'ß'
0xE0: pLl | pp, // '\u00e0' 0xE0: pLl | pp, // 'à'
0xE1: pLl | pp, // '\u00e1' 0xE1: pLl | pp, // 'á'
0xE2: pLl | pp, // '\u00e2' 0xE2: pLl | pp, // 'â'
0xE3: pLl | pp, // '\u00e3' 0xE3: pLl | pp, // 'ã'
0xE4: pLl | pp, // '\u00e4' 0xE4: pLl | pp, // 'ä'
0xE5: pLl | pp, // '\u00e5' 0xE5: pLl | pp, // 'å'
0xE6: pLl | pp, // '\u00e6' 0xE6: pLl | pp, // 'æ'
0xE7: pLl | pp, // '\u00e7' 0xE7: pLl | pp, // 'ç'
0xE8: pLl | pp, // '\u00e8' 0xE8: pLl | pp, // 'è'
0xE9: pLl | pp, // '\u00e9' 0xE9: pLl | pp, // 'é'
0xEA: pLl | pp, // '\u00ea' 0xEA: pLl | pp, // 'ê'
0xEB: pLl | pp, // '\u00eb' 0xEB: pLl | pp, // 'ë'
0xEC: pLl | pp, // '\u00ec' 0xEC: pLl | pp, // 'ì'
0xED: pLl | pp, // '\u00ed' 0xED: pLl | pp, // 'í'
0xEE: pLl | pp, // '\u00ee' 0xEE: pLl | pp, // 'î'
0xEF: pLl | pp, // '\u00ef' 0xEF: pLl | pp, // 'ï'
0xF0: pLl | pp, // '\u00f0' 0xF0: pLl | pp, // 'ð'
0xF1: pLl | pp, // '\u00f1' 0xF1: pLl | pp, // 'ñ'
0xF2: pLl | pp, // '\u00f2' 0xF2: pLl | pp, // 'ò'
0xF3: pLl | pp, // '\u00f3' 0xF3: pLl | pp, // 'ó'
0xF4: pLl | pp, // '\u00f4' 0xF4: pLl | pp, // 'ô'
0xF5: pLl | pp, // '\u00f5' 0xF5: pLl | pp, // 'õ'
0xF6: pLl | pp, // '\u00f6' 0xF6: pLl | pp, // 'ö'
0xF7: pS | pp, // '\u00f7' 0xF7: pS | pp, // '÷'
0xF8: pLl | pp, // '\u00f8' 0xF8: pLl | pp, // 'ø'
0xF9: pLl | pp, // '\u00f9' 0xF9: pLl | pp, // 'ù'
0xFA: pLl | pp, // '\u00fa' 0xFA: pLl | pp, // 'ú'
0xFB: pLl | pp, // '\u00fb' 0xFB: pLl | pp, // 'û'
0xFC: pLl | pp, // '\u00fc' 0xFC: pLl | pp, // 'ü'
0xFD: pLl | pp, // '\u00fd' 0xFD: pLl | pp, // 'ý'
0xFE: pLl | pp, // '\u00fe' 0xFE: pLl | pp, // 'þ'
0xFF: pLl | pp, // '\u00ff' 0xFF: pLl | pp, // 'ÿ'
} }
// Range entries: 3190 16-bit, 657 32-bit, 3847 total. var caseOrbit = []foldPair{
// Range bytes: 19140 16-bit, 7884 32-bit, 27024 total. {0x004B, 0x006B},
{0x0053, 0x0073},
{0x006B, 0x212A},
{0x0073, 0x017F},
{0x00B5, 0x039C},
{0x00C5, 0x00E5},
{0x00DF, 0x1E9E},
{0x00E5, 0x212B},
{0x0130, 0x0130},
{0x0131, 0x0131},
{0x017F, 0x0053},
{0x01C4, 0x01C5},
{0x01C5, 0x01C6},
{0x01C6, 0x01C4},
{0x01C7, 0x01C8},
{0x01C8, 0x01C9},
{0x01C9, 0x01C7},
{0x01CA, 0x01CB},
{0x01CB, 0x01CC},
{0x01CC, 0x01CA},
{0x01F1, 0x01F2},
{0x01F2, 0x01F3},
{0x01F3, 0x01F1},
{0x0345, 0x0399},
{0x0392, 0x03B2},
{0x0395, 0x03B5},
{0x0398, 0x03B8},
{0x0399, 0x03B9},
{0x039A, 0x03BA},
{0x039C, 0x03BC},
{0x03A0, 0x03C0},
{0x03A1, 0x03C1},
{0x03A3, 0x03C2},
{0x03A6, 0x03C6},
{0x03A9, 0x03C9},
{0x03B2, 0x03D0},
{0x03B5, 0x03F5},
{0x03B8, 0x03D1},
{0x03B9, 0x1FBE},
{0x03BA, 0x03F0},
{0x03BC, 0x00B5},
{0x03C0, 0x03D6},
{0x03C1, 0x03F1},
{0x03C2, 0x03C3},
{0x03C3, 0x03A3},
{0x03C6, 0x03D5},
{0x03C9, 0x2126},
{0x03D0, 0x0392},
{0x03D1, 0x03F4},
{0x03D5, 0x03A6},
{0x03D6, 0x03A0},
{0x03F0, 0x039A},
{0x03F1, 0x03A1},
{0x03F4, 0x0398},
{0x03F5, 0x0395},
{0x1E60, 0x1E61},
{0x1E61, 0x1E9B},
{0x1E9B, 0x1E60},
{0x1E9E, 0x00DF},
{0x1FBE, 0x0345},
{0x2126, 0x03A9},
{0x212A, 0x004B},
{0x212B, 0x00C5},
{0x2160, 0x2170},
{0x2161, 0x2171},
{0x2162, 0x2172},
{0x2163, 0x2173},
{0x2164, 0x2174},
{0x2165, 0x2175},
{0x2166, 0x2176},
{0x2167, 0x2177},
{0x2168, 0x2178},
{0x2169, 0x2179},
{0x216A, 0x217A},
{0x216B, 0x217B},
{0x216C, 0x217C},
{0x216D, 0x217D},
{0x216E, 0x217E},
{0x216F, 0x217F},
{0x2170, 0x2160},
{0x2171, 0x2161},
{0x2172, 0x2162},
{0x2173, 0x2163},
{0x2174, 0x2164},
{0x2175, 0x2165},
{0x2176, 0x2166},
{0x2177, 0x2167},
{0x2178, 0x2168},
{0x2179, 0x2169},
{0x217A, 0x216A},
{0x217B, 0x216B},
{0x217C, 0x216C},
{0x217D, 0x216D},
{0x217E, 0x216E},
{0x217F, 0x216F},
{0x24B6, 0x24D0},
{0x24B7, 0x24D1},
{0x24B8, 0x24D2},
{0x24B9, 0x24D3},
{0x24BA, 0x24D4},
{0x24BB, 0x24D5},
{0x24BC, 0x24D6},
{0x24BD, 0x24D7},
{0x24BE, 0x24D8},
{0x24BF, 0x24D9},
{0x24C0, 0x24DA},
{0x24C1, 0x24DB},
{0x24C2, 0x24DC},
{0x24C3, 0x24DD},
{0x24C4, 0x24DE},
{0x24C5, 0x24DF},
{0x24C6, 0x24E0},
{0x24C7, 0x24E1},
{0x24C8, 0x24E2},
{0x24C9, 0x24E3},
{0x24CA, 0x24E4},
{0x24CB, 0x24E5},
{0x24CC, 0x24E6},
{0x24CD, 0x24E7},
{0x24CE, 0x24E8},
{0x24CF, 0x24E9},
{0x24D0, 0x24B6},
{0x24D1, 0x24B7},
{0x24D2, 0x24B8},
{0x24D3, 0x24B9},
{0x24D4, 0x24BA},
{0x24D5, 0x24BB},
{0x24D6, 0x24BC},
{0x24D7, 0x24BD},
{0x24D8, 0x24BE},
{0x24D9, 0x24BF},
{0x24DA, 0x24C0},
{0x24DB, 0x24C1},
{0x24DC, 0x24C2},
{0x24DD, 0x24C3},
{0x24DE, 0x24C4},
{0x24DF, 0x24C5},
{0x24E0, 0x24C6},
{0x24E1, 0x24C7},
{0x24E2, 0x24C8},
{0x24E3, 0x24C9},
{0x24E4, 0x24CA},
{0x24E5, 0x24CB},
{0x24E6, 0x24CC},
{0x24E7, 0x24CD},
{0x24E8, 0x24CE},
{0x24E9, 0x24CF},
}
// FoldCategory maps a category name to a table of
// code points outside the category that are equivalent under
// simple case folding to code points inside the category.
// If there is no entry for a category name, there are no such points.
var FoldCategory = map[string]*RangeTable{
"Ll": foldLl,
"Inherited": foldInherited,
"M": foldM,
"L": foldL,
"Mn": foldMn,
"Common": foldCommon,
"Greek": foldGreek,
"Lu": foldLu,
"Lt": foldLt,
}
var foldLl = &RangeTable{
R16: []Range16{
{0x0041, 0x005a, 1},
{0x00c0, 0x00d6, 1},
{0x00d8, 0x00de, 1},
{0x0100, 0x012e, 2},
{0x0132, 0x0136, 2},
{0x0139, 0x0147, 2},
{0x014a, 0x0178, 2},
{0x0179, 0x017d, 2},
{0x0181, 0x0182, 1},
{0x0184, 0x0186, 2},
{0x0187, 0x0189, 2},
{0x018a, 0x018b, 1},
{0x018e, 0x0191, 1},
{0x0193, 0x0194, 1},
{0x0196, 0x0198, 1},
{0x019c, 0x019d, 1},
{0x019f, 0x01a0, 1},
{0x01a2, 0x01a6, 2},
{0x01a7, 0x01a9, 2},
{0x01ac, 0x01ae, 2},
{0x01af, 0x01b1, 2},
{0x01b2, 0x01b3, 1},
{0x01b5, 0x01b7, 2},
{0x01b8, 0x01bc, 4},
{0x01c4, 0x01c5, 1},
{0x01c7, 0x01c8, 1},
{0x01ca, 0x01cb, 1},
{0x01cd, 0x01db, 2},
{0x01de, 0x01ee, 2},
{0x01f1, 0x01f2, 1},
{0x01f4, 0x01f6, 2},
{0x01f7, 0x01f8, 1},
{0x01fa, 0x0232, 2},
{0x023a, 0x023b, 1},
{0x023d, 0x023e, 1},
{0x0241, 0x0243, 2},
{0x0244, 0x0246, 1},
{0x0248, 0x024e, 2},
{0x0345, 0x0370, 43},
{0x0372, 0x0376, 4},
{0x0386, 0x0388, 2},
{0x0389, 0x038a, 1},
{0x038c, 0x038e, 2},
{0x038f, 0x0391, 2},
{0x0392, 0x03a1, 1},
{0x03a3, 0x03ab, 1},
{0x03cf, 0x03d8, 9},
{0x03da, 0x03ee, 2},
{0x03f4, 0x03f7, 3},
{0x03f9, 0x03fa, 1},
{0x03fd, 0x042f, 1},
{0x0460, 0x0480, 2},
{0x048a, 0x04c0, 2},
{0x04c1, 0x04cd, 2},
{0x04d0, 0x0526, 2},
{0x0531, 0x0556, 1},
{0x10a0, 0x10c5, 1},
{0x1e00, 0x1e94, 2},
{0x1e9e, 0x1efe, 2},
{0x1f08, 0x1f0f, 1},
{0x1f18, 0x1f1d, 1},
{0x1f28, 0x1f2f, 1},
{0x1f38, 0x1f3f, 1},
{0x1f48, 0x1f4d, 1},
{0x1f59, 0x1f5f, 2},
{0x1f68, 0x1f6f, 1},
{0x1f88, 0x1f8f, 1},
{0x1f98, 0x1f9f, 1},
{0x1fa8, 0x1faf, 1},
{0x1fb8, 0x1fbc, 1},
{0x1fc8, 0x1fcc, 1},
{0x1fd8, 0x1fdb, 1},
{0x1fe8, 0x1fec, 1},
{0x1ff8, 0x1ffc, 1},
{0x2126, 0x212a, 4},
{0x212b, 0x2132, 7},
{0x2183, 0x2c00, 2685},
{0x2c01, 0x2c2e, 1},
{0x2c60, 0x2c62, 2},
{0x2c63, 0x2c64, 1},
{0x2c67, 0x2c6d, 2},
{0x2c6e, 0x2c70, 1},
{0x2c72, 0x2c75, 3},
{0x2c7e, 0x2c80, 1},
{0x2c82, 0x2ce2, 2},
{0x2ceb, 0x2ced, 2},
{0xa640, 0xa66c, 2},
{0xa680, 0xa696, 2},
{0xa722, 0xa72e, 2},
{0xa732, 0xa76e, 2},
{0xa779, 0xa77d, 2},
{0xa77e, 0xa786, 2},
{0xa78b, 0xa78d, 2},
{0xa790, 0xa7a0, 16},
{0xa7a2, 0xa7a8, 2},
{0xff21, 0xff3a, 1},
},
R32: []Range32{
{0x10400, 0x10427, 1},
},
}
var foldInherited = &RangeTable{
R16: []Range16{
{0x0399, 0x03b9, 32},
{0x1fbe, 0x1fbe, 1},
},
}
var foldM = &RangeTable{
R16: []Range16{
{0x0399, 0x03b9, 32},
{0x1fbe, 0x1fbe, 1},
},
}
var foldL = &RangeTable{
R16: []Range16{
{0x0345, 0x0345, 1},
},
}
var foldMn = &RangeTable{
R16: []Range16{
{0x0399, 0x03b9, 32},
{0x1fbe, 0x1fbe, 1},
},
}
var foldCommon = &RangeTable{
R16: []Range16{
{0x039c, 0x03bc, 32},
},
}
var foldGreek = &RangeTable{
R16: []Range16{
{0x00b5, 0x0345, 656},
},
}
var foldLu = &RangeTable{
R16: []Range16{
{0x0061, 0x007a, 1},
{0x00b5, 0x00df, 42},
{0x00e0, 0x00f6, 1},
{0x00f8, 0x00ff, 1},
{0x0101, 0x012f, 2},
{0x0133, 0x0137, 2},
{0x013a, 0x0148, 2},
{0x014b, 0x0177, 2},
{0x017a, 0x017e, 2},
{0x017f, 0x0180, 1},
{0x0183, 0x0185, 2},
{0x0188, 0x018c, 4},
{0x0192, 0x0195, 3},
{0x0199, 0x019a, 1},
{0x019e, 0x01a1, 3},
{0x01a3, 0x01a5, 2},
{0x01a8, 0x01ad, 5},
{0x01b0, 0x01b4, 4},
{0x01b6, 0x01b9, 3},
{0x01bd, 0x01bf, 2},
{0x01c5, 0x01c6, 1},
{0x01c8, 0x01c9, 1},
{0x01cb, 0x01cc, 1},
{0x01ce, 0x01dc, 2},
{0x01dd, 0x01ef, 2},
{0x01f2, 0x01f3, 1},
{0x01f5, 0x01f9, 4},
{0x01fb, 0x021f, 2},
{0x0223, 0x0233, 2},
{0x023c, 0x023f, 3},
{0x0240, 0x0242, 2},
{0x0247, 0x024f, 2},
{0x0250, 0x0254, 1},
{0x0256, 0x0257, 1},
{0x0259, 0x025b, 2},
{0x0260, 0x0263, 3},
{0x0265, 0x0268, 3},
{0x0269, 0x026b, 2},
{0x026f, 0x0271, 2},
{0x0272, 0x0275, 3},
{0x027d, 0x0283, 3},
{0x0288, 0x028c, 1},
{0x0292, 0x0345, 179},
{0x0371, 0x0373, 2},
{0x0377, 0x037b, 4},
{0x037c, 0x037d, 1},
{0x03ac, 0x03af, 1},
{0x03b1, 0x03ce, 1},
{0x03d0, 0x03d1, 1},
{0x03d5, 0x03d7, 1},
{0x03d9, 0x03ef, 2},
{0x03f0, 0x03f2, 1},
{0x03f5, 0x03fb, 3},
{0x0430, 0x045f, 1},
{0x0461, 0x0481, 2},
{0x048b, 0x04bf, 2},
{0x04c2, 0x04ce, 2},
{0x04cf, 0x0527, 2},
{0x0561, 0x0586, 1},
{0x1d79, 0x1d7d, 4},
{0x1e01, 0x1e95, 2},
{0x1e9b, 0x1ea1, 6},
{0x1ea3, 0x1eff, 2},
{0x1f00, 0x1f07, 1},
{0x1f10, 0x1f15, 1},
{0x1f20, 0x1f27, 1},
{0x1f30, 0x1f37, 1},
{0x1f40, 0x1f45, 1},
{0x1f51, 0x1f57, 2},
{0x1f60, 0x1f67, 1},
{0x1f70, 0x1f7d, 1},
{0x1fb0, 0x1fb1, 1},
{0x1fbe, 0x1fd0, 18},
{0x1fd1, 0x1fe0, 15},
{0x1fe1, 0x1fe5, 4},
{0x214e, 0x2184, 54},
{0x2c30, 0x2c5e, 1},
{0x2c61, 0x2c65, 4},
{0x2c66, 0x2c6c, 2},
{0x2c73, 0x2c76, 3},
{0x2c81, 0x2ce3, 2},
{0x2cec, 0x2cee, 2},
{0x2d00, 0x2d25, 1},
{0xa641, 0xa66d, 2},
{0xa681, 0xa697, 2},
{0xa723, 0xa72f, 2},
{0xa733, 0xa76f, 2},
{0xa77a, 0xa77c, 2},
{0xa77f, 0xa787, 2},
{0xa78c, 0xa791, 5},
{0xa7a1, 0xa7a9, 2},
{0xff41, 0xff5a, 1},
},
R32: []Range32{
{0x10428, 0x1044f, 1},
},
}
var foldLt = &RangeTable{
R16: []Range16{
{0x01c4, 0x01c6, 2},
{0x01c7, 0x01c9, 2},
{0x01ca, 0x01cc, 2},
{0x01f1, 0x01f3, 2},
{0x1f80, 0x1f87, 1},
{0x1f90, 0x1f97, 1},
{0x1fa0, 0x1fa7, 1},
{0x1fb3, 0x1fc3, 16},
{0x1ff3, 0x1ff3, 1},
},
}
// FoldScript maps a script name to a table of
// code points outside the script that are equivalent under
// simple case folding to code points inside the script.
// If there is no entry for a script name, there are no such points.
var FoldScript = map[string]*RangeTable{}
// Range entries: 3391 16-bit, 659 32-bit, 4050 total.
// Range bytes: 20346 16-bit, 7908 32-bit, 28254 total.
// Fold orbit bytes: 147 pairs, 588 bytes
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment