maketables.go 32.8 KB
Newer Older
1 2 3 4
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

Russ Cox's avatar
Russ Cox committed
5 6
// +build ignore

7 8 9 10 11 12
// Unicode table generator.
// Data read from the web.

package main

import (
13 14 15 16
	"bufio"
	"flag"
	"fmt"
	"log"
17
	"net/http"
18
	"os"
19
	"path/filepath"
Russ Cox's avatar
Russ Cox committed
20
	"regexp"
21 22 23 24
	"sort"
	"strconv"
	"strings"
	"unicode"
25 26
)

27
func main() {
28 29
	flag.Parse()
	loadChars() // always needed
Russ Cox's avatar
Russ Cox committed
30
	loadCasefold()
31 32 33 34
	printCategories()
	printScriptOrProperty(false)
	printScriptOrProperty(true)
	printCases()
35
	printLatinProperties()
Russ Cox's avatar
Russ Cox committed
36
	printCasefold()
Rob Pike's avatar
Rob Pike committed
37
	printSizes()
38 39
}

40
// Command-line flags.
var (
	// --data: explicit location of UnicodeData.txt.
	dataURL = flag.String("data", "", "full URL for UnicodeData.txt; defaults to --url/UnicodeData.txt")

	// --casefolding: explicit location of CaseFolding.txt.
	casefoldingURL = flag.String("casefolding", "", "full URL for CaseFolding.txt; defaults to --url/CaseFolding.txt")

	// --url: directory from which the data files are fetched.
	url = flag.String("url", "http://www.unicode.org/Public/6.2.0/ucd/", "URL of Unicode database directory")

	// --tables: which category tables to generate.
	tablelist = flag.String("tables", "all", "comma-separated list of which tables to generate; can be letter")

	// --scripts: which script tables to generate.
	scriptlist = flag.String("scripts", "all", "comma-separated list of which script tables to generate")

	// --props: which property tables to generate.
	proplist = flag.String("props", "all", "comma-separated list of which property tables to generate")

	// --cases: whether to generate the case tables.
	cases = flag.Bool("cases", true, "generate case tables")

	// --test: check the installed tables instead of printing new ones.
	test = flag.Bool("test", false, "test existing tables; can be used to compare web data with package data")

	// --local: read the data files from the current directory.
	localFiles = flag.Bool("local", false, "data files have been copied to current directory; for debugging only")
)
63

64 65
var (
	// scriptRe matches a comment-stripped line of Scripts.txt or
	// PropList.txt: a hex code point, an optional "..hi" range end, and
	// a name after the semicolon.
	scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`)

	// logger reports errors on standard error, tagged with file:line.
	logger = log.New(os.Stderr, "", log.Lshortfile)
)
66

67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
// reader is a buffered input source backed by either a local file or an
// HTTP response. open fills in exactly one of fd and resp, and close
// releases whichever one is set.
type reader struct {
	*bufio.Reader
	fd   *os.File       // non-nil when reading a local file
	resp *http.Response // non-nil when reading over HTTP
}

func open(url string) *reader {
	file := filepath.Base(url)
	if *localFiles {
		fd, err := os.Open(file)
		if err != nil {
			logger.Fatal(err)
		}
		return &reader{bufio.NewReader(fd), fd, nil}
	}
Russ Cox's avatar
Russ Cox committed
82
	resp, err := http.Get(url)
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
	if err != nil {
		logger.Fatal(err)
	}
	if resp.StatusCode != 200 {
		logger.Fatalf("bad GET status for %s: %d", file, resp.Status)
	}
	return &reader{bufio.NewReader(resp.Body), nil, resp}

}

// close releases the underlying source: the local file when there is one,
// otherwise the HTTP response body.
func (r *reader) close() {
	if r.fd == nil {
		r.resp.Body.Close()
		return
	}
	r.fd.Close()
}

101 102 103 104 105 106 107 108 109 110 111
// category is the set of general-category names seen so far. It starts with
// the one-letter merged categories; parseCategory adds each two-letter
// category it reads from UnicodeData.txt.
var category = map[string]bool{
	// Nd Lu etc.
	// We use one-character names to identify merged categories
	"L": true, // Lu Ll Lt Lm Lo
	"P": true, // Pc Pd Ps Pe Pi Pf Po
	"M": true, // Mn Mc Me
	"N": true, // Nd Nl No
	"S": true, // Sm Sc Sk So
	"Z": true, // Zs Zl Zp
	"C": true, // Cc Cf Cs Co Cn
}
Rob Pike's avatar
Rob Pike committed
112

113
// UnicodeData.txt has form:
//	0037;DIGIT SEVEN;Nd;0;EN;;7;7;7;N;;;;;
//	007A;LATIN SMALL LETTER Z;Ll;0;L;;;;;N;;;005A;;005A
// See http://www.unicode.org/reports/tr44/ for a full explanation
// The fields, in the order they appear on each semicolon-separated line:
const (
	FCodePoint = iota
	FName
	FGeneralCategory
	FCanonicalCombiningClass
	FBidiClass
	FDecompositionTypeAndMapping
	FNumericType
	FNumericDigit // If a decimal digit.
	FNumericValue // Includes non-decimal, e.g. U+2155=1/5
	FBidiMirrored
	FUnicode1Name
	FISOComment
	FSimpleUppercaseMapping
	FSimpleLowercaseMapping
	FSimpleTitlecaseMapping
	NumField // number of fields expected on every line

	MaxChar = 0x10FFFF // anything above this shouldn't exist
)

// fieldName gives a printable name for each UnicodeData.txt field index.
// Char.dump uses it to label the fields it prints.
var fieldName = []string{
	FCodePoint:                   "CodePoint",
	FName:                        "Name",
	FGeneralCategory:             "GeneralCategory",
	FCanonicalCombiningClass:     "CanonicalCombiningClass",
	FBidiClass:                   "BidiClass",
	FDecompositionTypeAndMapping: "DecompositionTypeAndMapping",
	FNumericType:                 "NumericType",
	FNumericDigit:                "NumericDigit",
	FNumericValue:                "NumericValue",
	FBidiMirrored:                "BidiMirrored",
	FUnicode1Name:                "Unicode1Name",
	FISOComment:                  "ISOComment",
	FSimpleUppercaseMapping:      "SimpleUppercaseMapping",
	FSimpleLowercaseMapping:      "SimpleLowercaseMapping",
	FSimpleTitlecaseMapping:      "SimpleTitlecaseMapping",
}

// Char holds what the generator records about one code point.
// This contains only the properties we're interested in.
type Char struct {
	field     []string // debugging only; could be deleted if we take out char.dump()
	codePoint rune     // if zero, this index is not a valid code point.
	category  string   // general category as read from UnicodeData.txt, e.g. "Lu"
	upperCase rune     // simple upper case mapping; zero if none was given
	lowerCase rune     // simple lower case mapping; zero if none was given
	titleCase rune     // simple title case mapping; zero if none was given
	foldCase  rune     // simple case folding
	caseOrbit rune     // next in simple case folding orbit
}

169 170 171 172 173 174
// Scripts.txt has form:
//	A673          ; Cyrillic # Po       SLAVONIC ASTERISK
//	A67C..A67D    ; Cyrillic # Mn   [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK
// See http://www.unicode.org/Public/5.1.0/ucd/UCD.html for full explanation

// Script is one entry parsed from such a line: an inclusive range of code
// points and the name it was listed under.
type Script struct {
	lo, hi uint32 // range of code points
	script string
}

179
var chars = make([]Char, MaxChar+1)
Robert Griesemer's avatar
Robert Griesemer committed
180
var scripts = make(map[string][]Script)
181
var props = make(map[string][]Script) // a property looks like a script; can share the format
Rob Pike's avatar
Rob Pike committed
182

Russ Cox's avatar
Russ Cox committed
183
var lastChar rune = 0
184

185
// In UnicodeData.txt, some ranges are marked like this:
//	3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
//	4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
// parseCategory returns a state variable indicating the weirdness.
type State int

const (
	SNormal State = iota // known to be zero for the type
	SFirst               // the line opens a First/Last range
	SLast                // the line closes a First/Last range
	SMissing
)

func parseCategory(line string) (state State) {
199
	field := strings.Split(line, ";")
200
	if len(field) != NumField {
Rob Pike's avatar
Rob Pike committed
201
		logger.Fatalf("%5s: %d fields (expected %d)\n", line, len(field), NumField)
202
	}
Russ Cox's avatar
Russ Cox committed
203
	point, err := strconv.ParseUint(field[FCodePoint], 16, 64)
204
	if err != nil {
Rob Pike's avatar
Rob Pike committed
205
		logger.Fatalf("%.5s...: %s", line, err)
206
	}
Russ Cox's avatar
Russ Cox committed
207
	lastChar = rune(point)
208
	if point == 0 {
209
		return // not interesting and we use 0 as unset
210
	}
211
	if point > MaxChar {
212
		return
213
	}
214 215
	char := &chars[point]
	char.field = field
216
	if char.codePoint != 0 {
Rob Pike's avatar
Rob Pike committed
217
		logger.Fatalf("point %U reused", point)
218
	}
219 220 221
	char.codePoint = lastChar
	char.category = field[FGeneralCategory]
	category[char.category] = true
222 223 224
	switch char.category {
	case "Nd":
		// Decimal digit
225
		_, err := strconv.Atoi(field[FNumericValue])
226
		if err != nil {
Rob Pike's avatar
Rob Pike committed
227
			logger.Fatalf("%U: bad numeric field: %s", point, err)
228 229
		}
	case "Lu":
230
		char.letter(field[FCodePoint], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
231
	case "Ll":
232
		char.letter(field[FSimpleUppercaseMapping], field[FCodePoint], field[FSimpleTitlecaseMapping])
233
	case "Lt":
234
		char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FCodePoint])
235
	default:
236
		char.letter(field[FSimpleUppercaseMapping], field[FSimpleLowercaseMapping], field[FSimpleTitlecaseMapping])
237
	}
238 239
	switch {
	case strings.Index(field[FName], ", First>") > 0:
240
		state = SFirst
241
	case strings.Index(field[FName], ", Last>") > 0:
242
		state = SLast
243
	}
244
	return
245 246 247
}

func (char *Char) dump(s string) {
248
	fmt.Print(s, " ")
Robert Griesemer's avatar
Robert Griesemer committed
249
	for i := 0; i < len(char.field); i++ {
250
		fmt.Printf("%s:%q ", fieldName[i], char.field[i])
251
	}
252
	fmt.Print("\n")
253 254 255
}

func (char *Char) letter(u, l, t string) {
256 257 258
	char.upperCase = char.letterValue(u, "U")
	char.lowerCase = char.letterValue(l, "L")
	char.titleCase = char.letterValue(t, "T")
259 260
}

Russ Cox's avatar
Russ Cox committed
261
func (char *Char) letterValue(s string, cas string) rune {
262
	if s == "" {
263
		return 0
264
	}
Russ Cox's avatar
Russ Cox committed
265
	v, err := strconv.ParseUint(s, 16, 64)
266
	if err != nil {
267
		char.dump(cas)
Rob Pike's avatar
Rob Pike committed
268
		logger.Fatalf("%U: bad letter(%s): %s", char.codePoint, s, err)
269
	}
Russ Cox's avatar
Russ Cox committed
270
	return rune(v)
271 272
}

Rob Pike's avatar
Rob Pike committed
273
func allCategories() []string {
Russ Cox's avatar
Russ Cox committed
274
	a := make([]string, 0, len(category))
Rob Pike's avatar
Rob Pike committed
275
	for k := range category {
Russ Cox's avatar
Russ Cox committed
276
		a = append(a, k)
Rob Pike's avatar
Rob Pike committed
277
	}
Russ Cox's avatar
Russ Cox committed
278
	sort.Strings(a)
279
	return a
Rob Pike's avatar
Rob Pike committed
280 281
}

Robert Griesemer's avatar
Robert Griesemer committed
282
func all(scripts map[string][]Script) []string {
Russ Cox's avatar
Russ Cox committed
283
	a := make([]string, 0, len(scripts))
284
	for k := range scripts {
Russ Cox's avatar
Russ Cox committed
285
		a = append(a, k)
286
	}
Russ Cox's avatar
Russ Cox committed
287 288 289 290
	sort.Strings(a)
	return a
}

Russ Cox's avatar
Russ Cox committed
291
// allCatFold returns the keys of m in sorted order.
func allCatFold(m map[string]map[rune]bool) []string {
	keys := make([]string, len(m))
	next := 0
	for key := range m {
		keys[next] = key
		next++
	}
	sort.Strings(keys)
	return keys
}

Rob Pike's avatar
Rob Pike committed
300 301 302
// Extract the version number from the URL
func version() string {
	// Break on slashes and look for the first numeric field
303
	fields := strings.Split(*url, "/")
Rob Pike's avatar
Rob Pike committed
304 305
	for _, f := range fields {
		if len(f) > 0 && '0' <= f[0] && f[0] <= '9' {
306
			return f
Rob Pike's avatar
Rob Pike committed
307 308
		}
	}
Rob Pike's avatar
Rob Pike committed
309
	logger.Fatal("unknown version")
310
	return "Unknown"
Rob Pike's avatar
Rob Pike committed
311 312
}

Russ Cox's avatar
Russ Cox committed
313
func categoryOp(code rune, class uint8) bool {
314 315
	category := chars[code].category
	return len(category) > 0 && category[0] == class
Rob Pike's avatar
Rob Pike committed
316 317
}

318
func loadChars() {
319
	if *dataURL == "" {
320
		flag.Set("data", *url+"UnicodeData.txt")
321
	}
322
	input := open(*dataURL)
323 324
	defer input.close()
	scanner := bufio.NewScanner(input)
Russ Cox's avatar
Russ Cox committed
325
	var first rune = 0
326 327
	for scanner.Scan() {
		switch parseCategory(scanner.Text()) {
328 329
		case SNormal:
			if first != 0 {
Rob Pike's avatar
Rob Pike committed
330
				logger.Fatalf("bad state normal at %U", lastChar)
331 332 333
			}
		case SFirst:
			if first != 0 {
Rob Pike's avatar
Rob Pike committed
334
				logger.Fatalf("bad state first at %U", lastChar)
335
			}
336
			first = lastChar
337 338
		case SLast:
			if first == 0 {
Rob Pike's avatar
Rob Pike committed
339
				logger.Fatalf("bad state last at %U", lastChar)
340
			}
341
			for i := first + 1; i <= lastChar; i++ {
342 343
				chars[i] = chars[first]
				chars[i].codePoint = i
344
			}
345
			first = 0
346
		}
347
	}
348 349 350
	if scanner.Err() != nil {
		logger.Fatal(scanner.Err())
	}
351 352
}

Russ Cox's avatar
Russ Cox committed
353 354 355 356
func loadCasefold() {
	if *casefoldingURL == "" {
		flag.Set("casefolding", *url+"CaseFolding.txt")
	}
357
	input := open(*casefoldingURL)
358 359 360 361 362
	defer input.close()
	scanner := bufio.NewScanner(input)
	for scanner.Scan() {
		line := scanner.Text()
		if len(line) == 0 || line[0] == '#' || len(strings.TrimSpace(line)) == 0 {
Russ Cox's avatar
Russ Cox committed
363 364
			continue
		}
365
		field := strings.Split(line, "; ")
Russ Cox's avatar
Russ Cox committed
366 367 368 369 370 371 372 373
		if len(field) != 4 {
			logger.Fatalf("CaseFolding.txt %.5s...: %d fields (expected %d)\n", line, len(field), 4)
		}
		kind := field[1]
		if kind != "C" && kind != "S" {
			// Only care about 'common' and 'simple' foldings.
			continue
		}
Russ Cox's avatar
Russ Cox committed
374
		p1, err := strconv.ParseUint(field[0], 16, 64)
Russ Cox's avatar
Russ Cox committed
375 376 377
		if err != nil {
			logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
		}
Russ Cox's avatar
Russ Cox committed
378
		p2, err := strconv.ParseUint(field[2], 16, 64)
Russ Cox's avatar
Russ Cox committed
379 380 381
		if err != nil {
			logger.Fatalf("CaseFolding.txt %.5s...: %s", line, err)
		}
Russ Cox's avatar
Russ Cox committed
382
		chars[p1].foldCase = rune(p2)
Russ Cox's avatar
Russ Cox committed
383
	}
384 385 386
	if scanner.Err() != nil {
		logger.Fatal(scanner.Err())
	}
Russ Cox's avatar
Russ Cox committed
387 388
}

Rob Pike's avatar
Rob Pike committed
389
// progHeader is the Printf template for the top of the generated file. Its
// three %s verbs take the --tables, --data and --casefolding values.
const progHeader = `// Generated by running
//	maketables --tables=%s --data=%s --casefolding=%s
// DO NOT EDIT

package unicode

`

397 398
func printCategories() {
	if *tablelist == "" {
399
		return
400
	}
Rob Pike's avatar
Rob Pike committed
401
	// Find out which categories to dump
402
	list := strings.Split(*tablelist, ",")
403
	if *tablelist == "all" {
404
		list = allCategories()
Rob Pike's avatar
Rob Pike committed
405 406
	}
	if *test {
407 408
		fullCategoryTest(list)
		return
Rob Pike's avatar
Rob Pike committed
409
	}
Russ Cox's avatar
Russ Cox committed
410
	fmt.Printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
Rob Pike's avatar
Rob Pike committed
411

412 413
	fmt.Println("// Version is the Unicode edition from which the tables are derived.")
	fmt.Printf("const Version = %q\n\n", version())
Rob Pike's avatar
Rob Pike committed
414

415
	if *tablelist == "all" {
416
		fmt.Println("// Categories is the set of Unicode category tables.")
Rob Pike's avatar
Rob Pike committed
417
		fmt.Println("var Categories = map[string] *RangeTable {")
Russ Cox's avatar
Russ Cox committed
418
		for _, k := range allCategories() {
419
			fmt.Printf("\t%q: %s,\n", k, k)
Rob Pike's avatar
Rob Pike committed
420
		}
421
		fmt.Print("}\n\n")
Rob Pike's avatar
Rob Pike committed
422 423
	}

424
	decl := make(sort.StringSlice, len(list))
425
	ndecl := 0
Rob Pike's avatar
Rob Pike committed
426 427
	for _, name := range list {
		if _, ok := category[name]; !ok {
Rob Pike's avatar
Rob Pike committed
428
			logger.Fatal("unknown category", name)
Rob Pike's avatar
Rob Pike committed
429 430 431 432 433
		}
		// We generate an UpperCase name to serve as concise documentation and an _UnderScored
		// name to store the data.  This stops godoc dumping all the tables but keeps them
		// available to clients.
		// Cases deserving special comments
434
		varDecl := ""
Rob Pike's avatar
Rob Pike committed
435
		switch name {
436 437 438 439 440 441 442
		case "C":
			varDecl = "\tOther = _C;	// Other/C is the set of Unicode control and special characters, category C.\n"
			varDecl += "\tC = _C\n"
		case "L":
			varDecl = "\tLetter = _L;	// Letter/L is the set of Unicode letters, category L.\n"
			varDecl += "\tL = _L\n"
		case "M":
Oling Cat's avatar
Oling Cat committed
443
			varDecl = "\tMark = _M;	// Mark/M is the set of Unicode mark characters, category M.\n"
444 445 446 447 448 449 450 451 452 453 454 455 456
			varDecl += "\tM = _M\n"
		case "N":
			varDecl = "\tNumber = _N;	// Number/N is the set of Unicode number characters, category N.\n"
			varDecl += "\tN = _N\n"
		case "P":
			varDecl = "\tPunct = _P;	// Punct/P is the set of Unicode punctuation characters, category P.\n"
			varDecl += "\tP = _P\n"
		case "S":
			varDecl = "\tSymbol = _S;	// Symbol/S is the set of Unicode symbol characters, category S.\n"
			varDecl += "\tS = _S\n"
		case "Z":
			varDecl = "\tSpace = _Z;	// Space/Z is the set of Unicode space characters, category Z.\n"
			varDecl += "\tZ = _Z\n"
Rob Pike's avatar
Rob Pike committed
457
		case "Nd":
458
			varDecl = "\tDigit = _Nd;	// Digit is the set of Unicode characters with the \"decimal digit\" property.\n"
Rob Pike's avatar
Rob Pike committed
459
		case "Lu":
460
			varDecl = "\tUpper = _Lu;	// Upper is the set of Unicode upper case letters.\n"
Rob Pike's avatar
Rob Pike committed
461
		case "Ll":
462
			varDecl = "\tLower = _Ll;	// Lower is the set of Unicode lower case letters.\n"
Rob Pike's avatar
Rob Pike committed
463
		case "Lt":
464
			varDecl = "\tTitle = _Lt;	// Title is the set of Unicode title case letters.\n"
465
		}
466
		if len(name) > 1 {
467 468
			varDecl += fmt.Sprintf(
				"\t%s = _%s;	// %s is the set of Unicode characters in category %s.\n",
469
				name, name, name, name)
470
		}
471 472
		decl[ndecl] = varDecl
		ndecl++
473 474
		if len(name) == 1 { // unified categories
			decl := fmt.Sprintf("var _%s = &RangeTable{\n", name)
475
			dumpRange(
476
				decl,
Russ Cox's avatar
Russ Cox committed
477
				func(code rune) bool { return categoryOp(code, name[0]) })
478
			continue
Rob Pike's avatar
Rob Pike committed
479
		}
480
		dumpRange(
Rob Pike's avatar
Rob Pike committed
481
			fmt.Sprintf("var _%s = &RangeTable{\n", name),
Russ Cox's avatar
Russ Cox committed
482
			func(code rune) bool { return chars[code].category == name })
483
	}
484
	decl.Sort()
485
	fmt.Println("// These variables have type *RangeTable.")
486
	fmt.Println("var (")
487
	for _, d := range decl {
488
		fmt.Print(d)
489
	}
490
	fmt.Print(")\n\n")
491 492
}

Russ Cox's avatar
Russ Cox committed
493
// Op reports whether code belongs to the set being dumped or verified.
type Op func(code rune) bool

// format is the Printf template for one table entry: lo, hi, stride.
const format = "\t\t{0x%04x, 0x%04x, %d},\n"
496

497
// dumpRange prints header and then a RangeTable body covering every code
// point in chars for which inCategory returns true.
//
// Members are grouped greedily into (lo, hi, stride) runs: the gap between
// the first two members found sets the stride, and the run extends for as
// long as membership matches that stride exactly. Each run is handed to
// printRange, which switches from the R16 list to the R32 list when needed.
// Runs ending at or below unicode.MaxLatin1 are counted and reported as
// LatinOffset when there are any.
func dumpRange(header string, inCategory Op) {
	fmt.Print(header)
	next := rune(0)
	latinOffset := 0
	fmt.Print("\tR16: []Range16{\n")
	// one Range for each iteration
	count := &range16Count
	size := 16
	for {
		// look for start of range
		for next < rune(len(chars)) && !inCategory(next) {
			next++
		}
		if next >= rune(len(chars)) {
			// no characters remain
			break
		}

		// start of range
		lo := next
		hi := next
		stride := rune(1)
		// accept lo
		next++
		// look for another character to set the stride
		for next < rune(len(chars)) && !inCategory(next) {
			next++
		}
		if next >= rune(len(chars)) {
			// no more characters
			// NOTE(review): this final single-character entry is printed
			// directly rather than through printRange, so it is not counted
			// and cannot trigger the switch to R32 - confirm that is intended.
			fmt.Printf(format, lo, hi, stride)
			break
		}
		// set stride
		stride = next - lo
		// check for length of run. next points to first jump in stride
		for i := next; i < rune(len(chars)); i++ {
			if inCategory(i) == (((i - lo) % stride) == 0) {
				// accept
				if inCategory(i) {
					hi = i
				}
			} else {
				// no more characters in this run
				break
			}
		}
		if uint32(hi) <= unicode.MaxLatin1 {
			latinOffset++
		}
		size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count)
		// next range: start looking where this range ends
		next = hi + 1
	}
	fmt.Print("\t},\n")
	if latinOffset > 0 {
		fmt.Printf("\tLatinOffset: %d,\n", latinOffset)
	}
	fmt.Print("}\n\n")
}
Rob Pike's avatar
Rob Pike committed
557

558 559 560 561
// printRange prints one (lo, hi, stride) entry and increments *count.
//
// size is the width of the list currently open, 16 or 32. When size is 16
// and hi no longer fits in 16 bits, the R16 list is closed, the R32 list is
// opened, and counting moves to range32Count. A range that straddles the
// boundary must have exactly two elements (anything else is fatal); its low
// element is printed into R16 before the switch.
//
// It returns the size and counter to pass to the next call.
func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) {
	if size == 16 && hi >= 1<<16 {
		if lo < 1<<16 {
			if lo+stride != hi {
				logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride)
			}
			// No range contains U+FFFF as an instance, so split
			// the range into two entries. That way we can maintain
			// the invariant that R32 contains only >= 1<<16.
			fmt.Printf(format, lo, lo, 1)
			lo = hi
			stride = 1
			*count++
		}
		fmt.Print("\t},\n")
		fmt.Print("\tR32: []Range32{\n")
		size = 32
		count = &range32Count
	}
	fmt.Printf(format, lo, hi, stride)
	*count++
	return size, count
}

582
func fullCategoryTest(list []string) {
Rob Pike's avatar
Rob Pike committed
583 584
	for _, name := range list {
		if _, ok := category[name]; !ok {
Rob Pike's avatar
Rob Pike committed
585
			logger.Fatal("unknown category", name)
Rob Pike's avatar
Rob Pike committed
586
		}
587
		r, ok := unicode.Categories[name]
588 589
		if !ok && len(name) > 1 {
			logger.Fatalf("unknown table %q", name)
Rob Pike's avatar
Rob Pike committed
590
		}
591
		if len(name) == 1 {
Russ Cox's avatar
Russ Cox committed
592
			verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r)
Rob Pike's avatar
Rob Pike committed
593 594 595
		} else {
			verifyRange(
				name,
Russ Cox's avatar
Russ Cox committed
596
				func(code rune) bool { return chars[code].category == name },
597
				r)
Rob Pike's avatar
Rob Pike committed
598 599 600 601
		}
	}
}

Rob Pike's avatar
Rob Pike committed
602
func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
603
	count := 0
Russ Cox's avatar
Russ Cox committed
604 605
	for j := range chars {
		i := rune(j)
606 607
		web := inCategory(i)
		pkg := unicode.Is(table, i)
Rob Pike's avatar
Rob Pike committed
608
		if web != pkg {
Rob Pike's avatar
Rob Pike committed
609
			fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
610 611 612 613
			count++
			if count > 10 {
				break
			}
Rob Pike's avatar
Rob Pike committed
614 615 616
		}
	}
}
617

Robert Griesemer's avatar
Robert Griesemer committed
618
func parseScript(line string, scripts map[string][]Script) {
619
	comment := strings.IndexByte(line, '#')
620
	if comment >= 0 {
621
		line = line[0:comment]
622
	}
623
	line = strings.TrimSpace(line)
624
	if len(line) == 0 {
625
		return
626
	}
627
	field := strings.Split(line, ";")
628
	if len(field) != 2 {
Rob Pike's avatar
Rob Pike committed
629
		logger.Fatalf("%s: %d fields (expected 2)\n", line, len(field))
630
	}
631
	matches := scriptRe.FindStringSubmatch(line)
632
	if len(matches) != 4 {
Rob Pike's avatar
Rob Pike committed
633
		logger.Fatalf("%s: %d matches (expected 3)\n", line, len(matches))
634
	}
Russ Cox's avatar
Russ Cox committed
635
	lo, err := strconv.ParseUint(matches[1], 16, 64)
636
	if err != nil {
Rob Pike's avatar
Rob Pike committed
637
		logger.Fatalf("%.5s...: %s", line, err)
638
	}
639 640
	hi := lo
	if len(matches[2]) > 2 { // ignore leading ..
Russ Cox's avatar
Russ Cox committed
641
		hi, err = strconv.ParseUint(matches[2][2:], 16, 64)
642
		if err != nil {
Rob Pike's avatar
Rob Pike committed
643
			logger.Fatalf("%.5s...: %s", line, err)
644 645
		}
	}
646
	name := matches[3]
Russ Cox's avatar
Russ Cox committed
647
	scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name})
648 649
}

Rob Pike's avatar
Rob Pike committed
650
// The script tables have a lot of adjacent elements. Fold them together.
Rob Pike's avatar
Rob Pike committed
651 652
func foldAdjacent(r []Script) []unicode.Range32 {
	s := make([]unicode.Range32, 0, len(r))
653
	j := 0
Rob Pike's avatar
Rob Pike committed
654
	for i := 0; i < len(r); i++ {
Rob Pike's avatar
Rob Pike committed
655 656
		if j > 0 && r[i].lo == s[j-1].Hi+1 {
			s[j-1].Hi = r[i].hi
Rob Pike's avatar
Rob Pike committed
657
		} else {
658
			s = s[0 : j+1]
659 660 661 662 663
			s[j] = unicode.Range32{
				Lo:     uint32(r[i].lo),
				Hi:     uint32(r[i].hi),
				Stride: 1,
			}
664
			j++
Rob Pike's avatar
Rob Pike committed
665 666
		}
	}
667
	return s
Rob Pike's avatar
Rob Pike committed
668 669
}

Rob Pike's avatar
Rob Pike committed
670
func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
Rob Pike's avatar
Rob Pike committed
671 672
	for _, name := range list {
		if _, ok := scripts[name]; !ok {
Rob Pike's avatar
Rob Pike committed
673
			logger.Fatal("unknown script", name)
Rob Pike's avatar
Rob Pike committed
674
		}
675
		_, ok := installed[name]
Rob Pike's avatar
Rob Pike committed
676
		if !ok {
Rob Pike's avatar
Rob Pike committed
677
			logger.Fatal("unknown table", name)
Rob Pike's avatar
Rob Pike committed
678 679 680
		}
		for _, script := range scripts[name] {
			for r := script.lo; r <= script.hi; r++ {
Russ Cox's avatar
Russ Cox committed
681
				if !unicode.Is(installed[name], rune(r)) {
Rob Pike's avatar
Rob Pike committed
682
					fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
Rob Pike's avatar
Rob Pike committed
683 684 685 686 687 688 689 690
				}
			}
		}
	}
}

// PropList.txt has the same format as Scripts.txt so we can share its parser.
//
// printScriptOrProperty prints the script tables, or the property tables
// when doProps is true. It reads Scripts.txt or PropList.txt from --url and
// honours --scripts or --props respectively; an empty list prints nothing.
// With --test set it checks the installed tables instead of printing.
func printScriptOrProperty(doProps bool) {
	// Defaults are for scripts; the block below swaps in the property values.
	flag := "scripts"
	flaglist := *scriptlist
	file := "Scripts.txt"
	table := scripts
	installed := unicode.Scripts
	if doProps {
		flag = "props"
		flaglist = *proplist
		file = "PropList.txt"
		table = props
		installed = unicode.Properties
	}
	if flaglist == "" {
		return
	}
	input := open(*url + file)
	scanner := bufio.NewScanner(input)
	for scanner.Scan() {
		parseScript(scanner.Text(), table)
	}
	if scanner.Err() != nil {
		logger.Fatal(scanner.Err())
	}
	input.close()

	// Find out which scripts to dump
	list := strings.Split(flaglist, ",")
	if flaglist == "all" {
		list = all(table)
	}
	if *test {
		fullScriptTest(list, installed, table)
		return
	}

	fmt.Printf(
		"// Generated by running\n"+
			"//	maketables --%s=%s --url=%s\n"+
			"// DO NOT EDIT\n\n",
		flag,
		flaglist,
		*url)
	// The name-to-table map is only emitted when every table is generated.
	if flaglist == "all" {
		if doProps {
			fmt.Println("// Properties is the set of Unicode property tables.")
			fmt.Println("var Properties = map[string] *RangeTable{")
		} else {
			fmt.Println("// Scripts is the set of Unicode script tables.")
			fmt.Println("var Scripts = map[string] *RangeTable{")
		}
		for _, k := range all(table) {
			fmt.Printf("\t%q: %s,\n", k, k)
		}
		fmt.Print("}\n\n")
	}

	// decl collects one public declaration per table; they are printed,
	// sorted, after all the tables themselves.
	decl := make(sort.StringSlice, len(list))
	ndecl := 0
	for _, name := range list {
		if doProps {
			decl[ndecl] = fmt.Sprintf(
				"\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n",
				name, name, name, name)
		} else {
			decl[ndecl] = fmt.Sprintf(
				"\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n",
				name, name, name, name)
		}
		ndecl++
		fmt.Printf("var _%s = &RangeTable {\n", name)
		ranges := foldAdjacent(table[name])
		fmt.Print("\tR16: []Range16{\n")
		// printRange switches size and count over to the R32 list itself.
		size := 16
		count := &range16Count
		for _, s := range ranges {
			size, count = printRange(s.Lo, s.Hi, s.Stride, size, count)
		}
		fmt.Print("\t},\n")
		if off := findLatinOffset(ranges); off > 0 {
			fmt.Printf("\tLatinOffset: %d,\n", off)
		}
		fmt.Print("}\n\n")
	}
	decl.Sort()
	fmt.Println("// These variables have type *RangeTable.")
	fmt.Println("var (")
	for _, d := range decl {
		fmt.Print(d)
	}
	fmt.Print(")\n\n")
}

783 784 785 786 787 788 789 790
// findLatinOffset returns the number of leading entries of ranges whose Hi
// is no greater than unicode.MaxLatin1.
func findLatinOffset(ranges []unicode.Range32) int {
	for i, r := range ranges {
		if r.Hi > unicode.MaxLatin1 {
			return i
		}
	}
	return len(ranges)
}

791
// Case states stored in caseState._case. The first three are distinct bits
// (1, 2 and 4).
const (
	CaseUpper = 1 << iota
	CaseLower
	CaseTitle
	CaseNone    = 0  // must be zero
	CaseMissing = -1 // character not present; not a valid case state
)

// caseState describes the case behaviour of one code point: its case
// classification and the distance to each of its case mappings. A delta of
// zero means no mapping was recorded or the mapping is the point itself.
type caseState struct {
	point        rune
	_case        int  // one of the Case* constants
	deltaToUpper rune // upper case mapping minus point
	deltaToLower rune // lower case mapping minus point
	deltaToTitle rune // title case mapping minus point
}

// Is d a continuation of the state of c?
func (c *caseState) adjacent(d *caseState) bool {
	if d.point < c.point {
810
		c, d = d, c
811 812
	}
	switch {
813
	case d.point != c.point+1: // code points not adjacent (shouldn't happen)
814
		return false
815
	case d._case != c._case: // different cases
816
		return c.upperLowerAdjacent(d)
817
	case c._case == CaseNone:
818
		return false
819
	case c._case == CaseMissing:
820
		return false
821
	case d.deltaToUpper != c.deltaToUpper:
822
		return false
823
	case d.deltaToLower != c.deltaToLower:
824
		return false
825
	case d.deltaToTitle != c.deltaToTitle:
826
		return false
827
	}
828
	return true
829 830
}

831 832 833 834 835 836
// Is d the same as c, but opposite in upper/lower case? this would make it
// an element of an UpperLower sequence.
func (c *caseState) upperLowerAdjacent(d *caseState) bool {
	// check they're a matched case pair.  we know they have adjacent values
	switch {
	case c._case == CaseUpper && d._case != CaseLower:
837
		return false
838
	case c._case == CaseLower && d._case != CaseUpper:
839
		return false
840 841 842
	}
	// matched pair (at least in upper/lower).  make the order Upper Lower
	if c._case == CaseLower {
843
		c, d = d, c
844 845 846 847 848 849
	}
	// for an Upper Lower sequence the deltas have to be in order
	//	c: 0 1 0
	//	d: -1 0 -1
	switch {
	case c.deltaToUpper != 0:
850
		return false
851
	case c.deltaToLower != 1:
852
		return false
853
	case c.deltaToTitle != 0:
854
		return false
855
	case d.deltaToUpper != -1:
856
		return false
857
	case d.deltaToLower != 0:
858
		return false
859
	case d.deltaToTitle != -1:
860
		return false
861
	}
862
	return true
863 864 865 866 867 868 869 870
}

// Does this character start an UpperLower sequence?
func (c *caseState) isUpperLower() bool {
	// for an Upper Lower sequence the deltas have to be in order
	//	c: 0 1 0
	switch {
	case c.deltaToUpper != 0:
871
		return false
872
	case c.deltaToLower != 1:
873
		return false
874
	case c.deltaToTitle != 0:
875
		return false
876
	}
877
	return true
878 879 880 881 882 883 884 885
}

// Does this character start a LowerUpper sequence?
func (c *caseState) isLowerUpper() bool {
	// for an Upper Lower sequence the deltas have to be in order
	//	c: -1 0 -1
	switch {
	case c.deltaToUpper != -1:
886
		return false
887
	case c.deltaToLower != 0:
888
		return false
889
	case c.deltaToTitle != -1:
890
		return false
891
	}
892
	return true
893 894
}

Russ Cox's avatar
Russ Cox committed
895
func getCaseState(i rune) (c *caseState) {
896 897
	c = &caseState{point: i, _case: CaseNone}
	ch := &chars[i]
Russ Cox's avatar
Russ Cox committed
898
	switch ch.codePoint {
899
	case 0:
900 901
		c._case = CaseMissing // Will get NUL wrong but that doesn't matter
		return
902
	case ch.upperCase:
903
		c._case = CaseUpper
904
	case ch.lowerCase:
905
		c._case = CaseLower
906
	case ch.titleCase:
907
		c._case = CaseTitle
908
	}
909 910 911 912 913 914 915 916 917 918
	// Some things such as roman numeral U+2161 don't describe themselves
	// as upper case, but have a lower case.  Second-guess them.
	if c._case == CaseNone && ch.lowerCase != 0 {
		c._case = CaseUpper
	}
	// Same in the other direction.
	if c._case == CaseNone && ch.upperCase != 0 {
		c._case = CaseLower
	}

919
	if ch.upperCase != 0 {
920
		c.deltaToUpper = ch.upperCase - i
921 922
	}
	if ch.lowerCase != 0 {
923
		c.deltaToLower = ch.lowerCase - i
924 925
	}
	if ch.titleCase != 0 {
926
		c.deltaToTitle = ch.titleCase - i
927
	}
928
	return
929 930 931 932
}

func printCases() {
	if !*cases {
933
		return
934 935
	}
	if *test {
936 937
		fullCaseTest()
		return
938 939
	}
	fmt.Printf(
940
		"// Generated by running\n"+
Russ Cox's avatar
Russ Cox committed
941
			"//	maketables --data=%s --casefolding=%s\n"+
942 943 944 945
			"// DO NOT EDIT\n\n"+
			"// CaseRanges is the table describing case mappings for all letters with\n"+
			"// non-self mappings.\n"+
			"var CaseRanges = _CaseRanges\n"+
Robert Griesemer's avatar
Robert Griesemer committed
946
			"var _CaseRanges = []CaseRange {\n",
Russ Cox's avatar
Russ Cox committed
947
		*dataURL, *casefoldingURL)
948

949 950
	var startState *caseState    // the start of a run; nil for not active
	var prevState = &caseState{} // the state of the previous character
951
	for i := range chars {
Russ Cox's avatar
Russ Cox committed
952
		state := getCaseState(rune(i))
953
		if state.adjacent(prevState) {
954 955
			prevState = state
			continue
956 957
		}
		// end of run (possibly)
958 959
		printCaseRange(startState, prevState)
		startState = nil
960
		if state._case != CaseMissing && state._case != CaseNone {
961
			startState = state
962
		}
963
		prevState = state
964
	}
965
	fmt.Print("}\n")
966 967 968 969
}

func printCaseRange(lo, hi *caseState) {
	if lo == nil {
970
		return
971 972 973
	}
	if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
		// character represents itself in all cases - no need to mention it
974
		return
975
	}
976 977
	switch {
	case hi.point > lo.point && lo.isUpperLower():
Rob Pike's avatar
Rob Pike committed
978
		fmt.Printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
979
			lo.point, hi.point)
980
	case hi.point > lo.point && lo.isLowerUpper():
Rob Pike's avatar
Rob Pike committed
981
		logger.Fatalf("LowerUpper sequence: should not happen: %U.  If it's real, need to fix To()", lo.point)
Rob Pike's avatar
Rob Pike committed
982
		fmt.Printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
983
			lo.point, hi.point)
984
	default:
Rob Pike's avatar
Rob Pike committed
985
		fmt.Printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
986
			lo.point, hi.point,
987
			lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
988
	}
989 990 991
}

// caseIt returns cased, unless cased is 0, which stands for "no mapping
// recorded"; in that case the rune r itself is returned.
func caseIt(r, cased rune) rune {
	if cased != 0 {
		return cased
	}
	return r
}

func fullCaseTest() {
Russ Cox's avatar
Russ Cox committed
1000 1001
	for j, c := range chars {
		i := rune(j)
1002 1003
		lower := unicode.ToLower(i)
		want := caseIt(i, c.lowerCase)
1004
		if lower != want {
Rob Pike's avatar
Rob Pike committed
1005
			fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
1006
		}
1007 1008
		upper := unicode.ToUpper(i)
		want = caseIt(i, c.upperCase)
1009
		if upper != want {
Rob Pike's avatar
Rob Pike committed
1010
			fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
1011
		}
1012 1013
		title := unicode.ToTitle(i)
		want = caseIt(i, c.titleCase)
1014
		if title != want {
Rob Pike's avatar
Rob Pike committed
1015
			fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
1016 1017 1018
		}
	}
}
Rob Pike's avatar
Rob Pike committed
1019

1020 1021 1022 1023
func printLatinProperties() {
	if *test {
		return
	}
1024 1025
	fmt.Println("var properties = [MaxLatin1+1]uint8{")
	for code := 0; code <= unicode.MaxLatin1; code++ {
1026 1027 1028 1029 1030 1031 1032 1033
		var property string
		switch chars[code].category {
		case "Cc", "": // NUL has no category.
			property = "pC"
		case "Cf": // soft hyphen, unique category, not printable.
			property = "0"
		case "Ll":
			property = "pLl | pp"
1034 1035
		case "Lo":
			property = "pLo | pp"
1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052
		case "Lu":
			property = "pLu | pp"
		case "Nd", "No":
			property = "pN | pp"
		case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
			property = "pP | pp"
		case "Sc", "Sk", "Sm", "So":
			property = "pS | pp"
		case "Zs":
			property = "pZ"
		default:
			logger.Fatalf("%U has unknown category %q", code, chars[code].category)
		}
		// Special case
		if code == ' ' {
			property = "pZ | pp"
		}
Russ Cox's avatar
Russ Cox committed
1053 1054 1055 1056 1057
		fmt.Printf("\t0x%02X: %s, // %q\n", code, property, code)
	}
	fmt.Printf("}\n\n")
}

Russ Cox's avatar
Russ Cox committed
1058 1059 1060 1061 1062 1063
// runeSlice is a slice of runes that can be ordered by ascending code point
// through its Len, Less and Swap methods.
type runeSlice []rune

// Len returns the number of runes in the slice.
func (rs runeSlice) Len() int {
	return len(rs)
}

// Less reports whether the rune at index i is smaller than the one at index j.
func (rs runeSlice) Less(i, j int) bool {
	return rs[i] < rs[j]
}

// Swap exchanges the runes at indexes i and j.
func (rs runeSlice) Swap(i, j int) {
	rs[j], rs[i] = rs[i], rs[j]
}

Russ Cox's avatar
Russ Cox committed
1064 1065
func printCasefold() {
	// Build list of case-folding groups attached to each canonical folded char (typically lower case).
Russ Cox's avatar
Russ Cox committed
1066 1067 1068
	var caseOrbit = make([][]rune, MaxChar+1)
	for j := range chars {
		i := rune(j)
Russ Cox's avatar
Russ Cox committed
1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080
		c := &chars[i]
		if c.foldCase == 0 {
			continue
		}
		orb := caseOrbit[c.foldCase]
		if orb == nil {
			orb = append(orb, c.foldCase)
		}
		caseOrbit[c.foldCase] = append(orb, i)
	}

	// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
Russ Cox's avatar
Russ Cox committed
1081 1082
	for j := range chars {
		i := rune(j)
Russ Cox's avatar
Russ Cox committed
1083 1084 1085 1086 1087 1088 1089 1090
		c := &chars[i]
		f := c.foldCase
		if f == 0 {
			f = i
		}
		orb := caseOrbit[f]
		if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
			// Default assumption of [upper, lower] is wrong.
Russ Cox's avatar
Russ Cox committed
1091
			caseOrbit[i] = []rune{i}
Russ Cox's avatar
Russ Cox committed
1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106
		}
	}

	// Delete the groups for which assuming [lower, upper] is right.
	for i, orb := range caseOrbit {
		if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
			caseOrbit[i] = nil
		}
	}

	// Record orbit information in chars.
	for _, orb := range caseOrbit {
		if orb == nil {
			continue
		}
Russ Cox's avatar
Russ Cox committed
1107
		sort.Sort(runeSlice(orb))
Russ Cox's avatar
Russ Cox committed
1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119
		c := orb[len(orb)-1]
		for _, d := range orb {
			chars[c].caseOrbit = d
			c = d
		}
	}

	printCaseOrbit()

	// Tables of category and script folding exceptions: code points
	// that must be added when interpreting a particular category/script
	// in a case-folding context.
Russ Cox's avatar
Russ Cox committed
1120
	cat := make(map[string]map[rune]bool)
Russ Cox's avatar
Russ Cox committed
1121 1122 1123 1124 1125 1126
	for name := range category {
		if x := foldExceptions(inCategory(name)); len(x) > 0 {
			cat[name] = x
		}
	}

Russ Cox's avatar
Russ Cox committed
1127
	scr := make(map[string]map[rune]bool)
Russ Cox's avatar
Russ Cox committed
1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138
	for name := range scripts {
		if x := foldExceptions(inScript(name)); len(x) > 0 {
			cat[name] = x
		}
	}

	printCatFold("FoldCategory", cat)
	printCatFold("FoldScript", scr)
}

// inCategory returns a list of all the runes in the category.
Russ Cox's avatar
Russ Cox committed
1139 1140 1141 1142
func inCategory(name string) []rune {
	var x []rune
	for j := range chars {
		i := rune(j)
Russ Cox's avatar
Russ Cox committed
1143 1144 1145 1146
		c := &chars[i]
		if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
			x = append(x, i)
		}
1147
	}
Russ Cox's avatar
Russ Cox committed
1148
	return x
1149 1150
}

Russ Cox's avatar
Russ Cox committed
1151
// inScript returns a list of all the runes in the script.
Russ Cox's avatar
Russ Cox committed
1152 1153
func inScript(name string) []rune {
	var x []rune
Russ Cox's avatar
Russ Cox committed
1154 1155
	for _, s := range scripts[name] {
		for c := s.lo; c <= s.hi; c++ {
Russ Cox's avatar
Russ Cox committed
1156
			x = append(x, rune(c))
Russ Cox's avatar
Russ Cox committed
1157 1158 1159 1160 1161 1162 1163
		}
	}
	return x
}

// foldExceptions returns a list of all the runes fold-equivalent
// to runes in class but not in class themselves.
Russ Cox's avatar
Russ Cox committed
1164
func foldExceptions(class []rune) map[rune]bool {
Russ Cox's avatar
Russ Cox committed
1165
	// Create map containing class and all fold-equivalent chars.
Russ Cox's avatar
Russ Cox committed
1166
	m := make(map[rune]bool)
Russ Cox's avatar
Russ Cox committed
1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192
	for _, r := range class {
		c := &chars[r]
		if c.caseOrbit == 0 {
			// Just upper and lower.
			if u := c.upperCase; u != 0 {
				m[u] = true
			}
			if l := c.lowerCase; l != 0 {
				m[l] = true
			}
			m[r] = true
			continue
		}
		// Otherwise walk orbit.
		r0 := r
		for {
			m[r] = true
			r = chars[r].caseOrbit
			if r == r0 {
				break
			}
		}
	}

	// Remove class itself.
	for _, r := range class {
Russ Cox's avatar
Russ Cox committed
1193
		delete(m, r)
Russ Cox's avatar
Russ Cox committed
1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213
	}

	// What's left is the exceptions.
	return m
}

// comment holds the doc comment emitted ahead of each generated fold map,
// keyed by the name of the generated variable.
var comment = map[string]string{
	"FoldCategory": `// FoldCategory maps a category name to a table of
// code points outside the category that are equivalent under
// simple case folding to code points inside the category.
// If there is no entry for a category name, there are no such points.
`,

	"FoldScript": `// FoldScript maps a script name to a table of
// code points outside the script that are equivalent under
// simple case folding to code points inside the script.
// If there is no entry for a script name, there are no such points.
`,
}

func printCaseOrbit() {
	if *test {
Russ Cox's avatar
Russ Cox committed
1214 1215
		for j := range chars {
			i := rune(j)
Russ Cox's avatar
Russ Cox committed
1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244
			c := &chars[i]
			f := c.caseOrbit
			if f == 0 {
				if c.lowerCase != i && c.lowerCase != 0 {
					f = c.lowerCase
				} else if c.upperCase != i && c.upperCase != 0 {
					f = c.upperCase
				} else {
					f = i
				}
			}
			if g := unicode.SimpleFold(i); g != f {
				fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
			}
		}
		return
	}

	fmt.Printf("var caseOrbit = []foldPair{\n")
	for i := range chars {
		c := &chars[i]
		if c.caseOrbit != 0 {
			fmt.Printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
			foldPairCount++
		}
	}
	fmt.Printf("}\n\n")
}

Russ Cox's avatar
Russ Cox committed
1245
func printCatFold(name string, m map[string]map[rune]bool) {
Russ Cox's avatar
Russ Cox committed
1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264
	if *test {
		var pkgMap map[string]*unicode.RangeTable
		if name == "FoldCategory" {
			pkgMap = unicode.FoldCategory
		} else {
			pkgMap = unicode.FoldScript
		}
		if len(pkgMap) != len(m) {
			fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
			return
		}
		for k, v := range m {
			t, ok := pkgMap[k]
			if !ok {
				fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
				continue
			}
			n := 0
			for _, r := range t.R16 {
Russ Cox's avatar
Russ Cox committed
1265
				for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
Russ Cox's avatar
Russ Cox committed
1266 1267 1268 1269 1270 1271 1272
					if !v[c] {
						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
					}
					n++
				}
			}
			for _, r := range t.R32 {
Russ Cox's avatar
Russ Cox committed
1273
				for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
Russ Cox's avatar
Russ Cox committed
1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288
					if !v[c] {
						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
					}
					n++
				}
			}
			if n != len(v) {
				fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
			}
		}
		return
	}

	fmt.Print(comment[name])
	fmt.Printf("var %s = map[string]*RangeTable{\n", name)
Russ Cox's avatar
Russ Cox committed
1289
	for _, name := range allCatFold(m) {
Russ Cox's avatar
Russ Cox committed
1290 1291 1292
		fmt.Printf("\t%q: fold%s,\n", name, name)
	}
	fmt.Printf("}\n\n")
Russ Cox's avatar
Russ Cox committed
1293 1294
	for _, name := range allCatFold(m) {
		class := m[name]
Russ Cox's avatar
Russ Cox committed
1295 1296
		dumpRange(
			fmt.Sprintf("var fold%s = &RangeTable{\n", name),
Russ Cox's avatar
Russ Cox committed
1297
			func(code rune) bool { return class[code] })
Russ Cox's avatar
Russ Cox committed
1298 1299 1300 1301 1302 1303
	}
}

// Running totals for the generated tables.
var (
	range16Count  int // Number of entries in the 16-bit range tables.
	range32Count  int // Number of entries in the 32-bit range tables.
	foldPairCount int // Number of fold pairs in the exception tables.
)
Rob Pike's avatar
Rob Pike committed
1304 1305

func printSizes() {
1306 1307 1308
	if *test {
		return
	}
Rob Pike's avatar
Rob Pike committed
1309 1310 1311 1312 1313
	fmt.Println()
	fmt.Printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
	range16Bytes := range16Count * 3 * 2
	range32Bytes := range32Count * 3 * 4
	fmt.Printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
Russ Cox's avatar
Russ Cox committed
1314 1315
	fmt.Println()
	fmt.Printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
Rob Pike's avatar
Rob Pike committed
1316
}