Commit 4e2b7f8f authored by Rob Pike's avatar Rob Pike

Unicode: provide an ability to supplement the case-mapping tables

in character and string case mapping routines.

Add a custom mapper for Turkish and Azeri.

A more general solution for deriving the case information from Unicode's
SpecialCasing.txt will require more work.

Fixes #703.

R=rsc, rsc1
CC=golang-dev, mdakin
https://golang.org/cl/824043
parent c2f3737c
...@@ -291,6 +291,24 @@ func ToLower(s string) string { return Map(unicode.ToLower, s) } ...@@ -291,6 +291,24 @@ func ToLower(s string) string { return Map(unicode.ToLower, s) }
// ToTitle returns a copy of the string s with all Unicode letters mapped to their title case. // ToTitle returns a copy of the string s with all Unicode letters mapped to their title case.
func ToTitle(s string) string { return Map(unicode.ToTitle, s) } func ToTitle(s string) string { return Map(unicode.ToTitle, s) }
// ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their
// upper case, giving priority to the special casing rules.
func ToUpperSpecial(_case unicode.SpecialCase, s string) string {
return Map(func(r int) int { return _case.ToUpper(r) }, s)
}
// ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their
// lower case, giving priority to the special casing rules.
func ToLowerSpecial(_case unicode.SpecialCase, s string) string {
return Map(func(r int) int { return _case.ToLower(r) }, s)
}
// ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their
// title case, giving priority to the special casing rules.
func ToTitleSpecial(_case unicode.SpecialCase, s string) string {
return Map(func(r int) int { return _case.ToTitle(r) }, s)
}
// Trim returns a slice of the string s, with all leading and trailing white space // Trim returns a slice of the string s, with all leading and trailing white space
// removed, as defined by Unicode. // removed, as defined by Unicode.
func TrimSpace(s string) string { func TrimSpace(s string) string {
......
...@@ -341,6 +341,28 @@ func TestToUpper(t *testing.T) { runStringTests(t, ToUpper, "ToUpper", upperTest ...@@ -341,6 +341,28 @@ func TestToUpper(t *testing.T) { runStringTests(t, ToUpper, "ToUpper", upperTest
func TestToLower(t *testing.T) { runStringTests(t, ToLower, "ToLower", lowerTests) } func TestToLower(t *testing.T) { runStringTests(t, ToLower, "ToLower", lowerTests) }
func TestSpecialCase(t *testing.T) {
lower := "abcçdefgğhıijklmnoöprsştuüvyz"
upper := "ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ"
u := ToUpperSpecial(unicode.TurkishCase, upper)
if u != upper {
t.Errorf("Upper(upper) is %s not %s", u, upper)
}
u = ToUpperSpecial(unicode.TurkishCase, lower)
if u != upper {
t.Errorf("Upper(lower) is %s not %s", u, upper)
}
l := ToLowerSpecial(unicode.TurkishCase, lower)
if l != lower {
t.Errorf("Lower(lower) is %s not %s", l, lower)
}
l = ToLowerSpecial(unicode.TurkishCase, upper)
if l != lower {
t.Errorf("Lower(upper) is %s not %s", l, lower)
}
}
func TestTrimSpace(t *testing.T) { runStringTests(t, TrimSpace, "TrimSpace", trimSpaceTests) } func TestTrimSpace(t *testing.T) { runStringTests(t, TrimSpace, "TrimSpace", trimSpaceTests) }
func equal(m string, s1, s2 string, t *testing.T) bool { func equal(m string, s1, s2 string, t *testing.T) bool {
......
...@@ -6,6 +6,7 @@ include ../../Make.$(GOARCH) ...@@ -6,6 +6,7 @@ include ../../Make.$(GOARCH)
TARG=unicode TARG=unicode
GOFILES=\ GOFILES=\
casetables.go\
digit.go\ digit.go\
letter.go\ letter.go\
tables.go\ tables.go\
......
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// TODO: This file contains the special casing rules for Turkish and Azeri only.
// It should encompass all the languages with special casing rules
// and be generated automatically, but that requires some API
// development first.
package unicode
var TurkishCase = _TurkishCase
var _TurkishCase = SpecialCase{
CaseRange{0x0049, 0x0049, d{0, 0x131 - 0x49, 0}},
CaseRange{0x0069, 0x0069, d{0x130 - 0x69, 0, 0x130 - 0x69}},
CaseRange{0x0130, 0x0130, d{0, 0x69 - 0x130, 0}},
CaseRange{0x0131, 0x0131, d{0x49 - 0x131, 0, 0x49 - 0x131}},
}
var AzeriCase = _TurkishCase
...@@ -19,7 +19,8 @@ type Range struct { ...@@ -19,7 +19,8 @@ type Range struct {
Stride int Stride int
} }
// The representation of a range of Unicode code points for case conversion. // CaseRange represents a range of Unicode code points for simple (one
// code point to one code point) case conversion.
// The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas
// are the number to add to the code point to reach the code point for a // are the number to add to the code point to reach the code point for a
// different case for that character. They may be negative. If zero, it // different case for that character. They may be negative. If zero, it
...@@ -34,6 +35,13 @@ type CaseRange struct { ...@@ -34,6 +35,13 @@ type CaseRange struct {
Delta d Delta d
} }
// SpecialCase represents language-specific case mappings such as Turkish.
// Methods of SpecialCase customize (by overriding) the standard mappings.
type SpecialCase []CaseRange
//BUG(r): Provide a mechanism for full case folding (those that involve
// multiple runes in the input or output).
// Indices into the Delta arrays inside CaseRanges for case mapping. // Indices into the Delta arrays inside CaseRanges for case mapping.
const ( const (
UpperCase = iota UpperCase = iota
...@@ -130,17 +138,17 @@ func IsSpace(rune int) bool { ...@@ -130,17 +138,17 @@ func IsSpace(rune int) bool {
return Is(White_Space, rune) return Is(White_Space, rune)
} }
// To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase. // to maps the rune using the specified case mapping.
func To(_case int, rune int) int { func to(_case int, rune int, caseRange []CaseRange) int {
if _case < 0 || MaxCase <= _case { if _case < 0 || MaxCase <= _case {
return ReplacementChar // as reasonable an error as any return ReplacementChar // as reasonable an error as any
} }
// binary search over ranges // binary search over ranges
lo := 0 lo := 0
hi := len(CaseRanges) hi := len(caseRange)
for lo < hi { for lo < hi {
m := lo + (hi-lo)/2 m := lo + (hi-lo)/2
r := CaseRanges[m] r := caseRange[m]
if r.Lo <= rune && rune <= r.Hi { if r.Lo <= rune && rune <= r.Hi {
delta := int(r.Delta[_case]) delta := int(r.Delta[_case])
if delta > MaxRune { if delta > MaxRune {
...@@ -167,6 +175,11 @@ func To(_case int, rune int) int { ...@@ -167,6 +175,11 @@ func To(_case int, rune int) int {
return rune return rune
} }
// To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
func To(_case int, rune int) int {
return to(_case, rune, CaseRanges)
}
// ToUpper maps the rune to upper case. // ToUpper maps the rune to upper case.
func ToUpper(rune int) int { func ToUpper(rune int) int {
if rune < 0x80 { // quick ASCII check if rune < 0x80 { // quick ASCII check
...@@ -199,3 +212,30 @@ func ToTitle(rune int) int { ...@@ -199,3 +212,30 @@ func ToTitle(rune int) int {
} }
return To(TitleCase, rune) return To(TitleCase, rune)
} }
// ToUpper maps the rune to upper case giving priority to the special mapping.
func (special SpecialCase) ToUpper(rune int) int {
r := to(UpperCase, rune, []CaseRange(special))
if r == rune {
r = ToUpper(rune)
}
return r
}
// ToTitlemaps the rune to upper case giving priority to the special mapping.
func (special SpecialCase) ToTitle(rune int) int {
r := to(TitleCase, rune, []CaseRange(special))
if r == rune {
r = ToTitle(rune)
}
return r
}
// ToLower maps the rune to upper case giving priority to the special mapping.
func (special SpecialCase) ToLower(rune int) int {
r := to(LowerCase, rune, []CaseRange(special))
if r == rune {
r = ToLower(rune)
}
return r
}
...@@ -349,3 +349,29 @@ func TestLetterOptimizations(t *testing.T) { ...@@ -349,3 +349,29 @@ func TestLetterOptimizations(t *testing.T) {
} }
} }
} }
func TestTurkishCase(t *testing.T) {
lower := []int("abcçdefgğhıijklmnoöprsştuüvyz")
upper := []int("ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ")
for i, l := range lower {
u := upper[i]
if TurkishCase.ToLower(l) != l {
t.Errorf("lower(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToLower(l), l)
}
if TurkishCase.ToUpper(u) != u {
t.Errorf("upper(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToUpper(u), u)
}
if TurkishCase.ToUpper(l) != u {
t.Errorf("upper(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToUpper(l), u)
}
if TurkishCase.ToLower(u) != l {
t.Errorf("lower(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToLower(l), l)
}
if TurkishCase.ToTitle(u) != u {
t.Errorf("title(U+%04X) is U+%04X not U+%04X", u, TurkishCase.ToTitle(u), u)
}
if TurkishCase.ToTitle(l) != u {
t.Errorf("title(U+%04X) is U+%04X not U+%04X", l, TurkishCase.ToTitle(l), u)
}
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment