Commit 8ba20dbd authored by Marcel van Lohuizen

exp/norm: a few minor changes in preparation for a table format change:

 - Unified boundary conditions for NFC and NFD and removed some indirections.
   This enforces boundaries at the character level, which is typically what
   the user expects. (NFD allows a boundary between 'a' and '`', for example,
   which may give unexpected results for collation.)  The current implementation
   is already stricter than the standard, so nothing much changes.  This change
   just formalizes it.
 - Moved methods of qcflags to runeInfo.
 - Swapped YesC and YesMaybe bits in qcFlags. This is to aid future changes.
 - runeInfo return values use named fields in preparation for struct change.
 - Replaced some left-over uint32s with rune.

R=r
CC=golang-dev
https://golang.org/cl/5607050
parent d673c95d
...@@ -98,10 +98,10 @@ func (rb *reorderBuffer) insertOrdered(info runeInfo) bool { ...@@ -98,10 +98,10 @@ func (rb *reorderBuffer) insertOrdered(info runeInfo) bool {
func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool { func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
if info.size == 3 { if info.size == 3 {
if rune := src.hangul(i); rune != 0 { if rune := src.hangul(i); rune != 0 {
return rb.decomposeHangul(uint32(rune)) return rb.decomposeHangul(rune)
} }
} }
if info.flags.hasDecomposition() { if info.hasDecomposition() {
dcomp := rb.f.decompose(src, i) dcomp := rb.f.decompose(src, i)
rb.tmpBytes = inputBytes(dcomp) rb.tmpBytes = inputBytes(dcomp)
for i := 0; i < len(dcomp); { for i := 0; i < len(dcomp); {
...@@ -126,26 +126,26 @@ func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool { ...@@ -126,26 +126,26 @@ func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool {
} }
// appendRune inserts a rune at the end of the buffer. It is used for Hangul. // appendRune inserts a rune at the end of the buffer. It is used for Hangul.
func (rb *reorderBuffer) appendRune(r uint32) { func (rb *reorderBuffer) appendRune(r rune) {
bn := rb.nbyte bn := rb.nbyte
sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
rb.nbyte += utf8.UTFMax rb.nbyte += utf8.UTFMax
rb.rune[rb.nrune] = runeInfo{bn, uint8(sz), 0, 0} rb.rune[rb.nrune] = runeInfo{pos: bn, size: uint8(sz)}
rb.nrune++ rb.nrune++
} }
// assignRune sets a rune at position pos. It is used for Hangul and recomposition. // assignRune sets a rune at position pos. It is used for Hangul and recomposition.
func (rb *reorderBuffer) assignRune(pos int, r uint32) { func (rb *reorderBuffer) assignRune(pos int, r rune) {
bn := rb.rune[pos].pos bn := rb.rune[pos].pos
sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
rb.rune[pos] = runeInfo{bn, uint8(sz), 0, 0} rb.rune[pos] = runeInfo{pos: bn, size: uint8(sz)}
} }
// runeAt returns the rune at position n. It is used for Hangul and recomposition. // runeAt returns the rune at position n. It is used for Hangul and recomposition.
func (rb *reorderBuffer) runeAt(n int) uint32 { func (rb *reorderBuffer) runeAt(n int) rune {
inf := rb.rune[n] inf := rb.rune[n]
r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size]) r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
return uint32(r) return r
} }
// bytesAt returns the UTF-8 encoding of the rune at position n. // bytesAt returns the UTF-8 encoding of the rune at position n.
...@@ -237,7 +237,7 @@ func isHangulWithoutJamoT(b []byte) bool { ...@@ -237,7 +237,7 @@ func isHangulWithoutJamoT(b []byte) bool {
// decomposeHangul algorithmically decomposes a Hangul rune into // decomposeHangul algorithmically decomposes a Hangul rune into
// its Jamo components. // its Jamo components.
// See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul. // See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
func (rb *reorderBuffer) decomposeHangul(r uint32) bool { func (rb *reorderBuffer) decomposeHangul(r rune) bool {
b := rb.rune[:] b := rb.rune[:]
n := rb.nrune n := rb.nrune
if n+3 > len(b) { if n+3 > len(b) {
...@@ -319,7 +319,7 @@ func (rb *reorderBuffer) compose() { ...@@ -319,7 +319,7 @@ func (rb *reorderBuffer) compose() {
// get the info for the combined character. This is more // get the info for the combined character. This is more
// expensive than using the filter. Using combinesBackward() // expensive than using the filter. Using combinesBackward()
// is safe. // is safe.
if ii.flags.combinesBackward() { if ii.combinesBackward() {
cccB := b[k-1].ccc cccB := b[k-1].ccc
cccC := ii.ccc cccC := ii.ccc
blocked := false // b[i] blocked by starter or greater or equal CCC? blocked := false // b[i] blocked by starter or greater or equal CCC?
......
...@@ -14,7 +14,6 @@ type runeInfo struct { ...@@ -14,7 +14,6 @@ type runeInfo struct {
} }
// functions dispatchable per form // functions dispatchable per form
type boundaryFunc func(f *formInfo, info runeInfo) bool
type lookupFunc func(b input, i int) runeInfo type lookupFunc func(b input, i int) runeInfo
type decompFunc func(b input, i int) []byte type decompFunc func(b input, i int) []byte
...@@ -24,10 +23,8 @@ type formInfo struct { ...@@ -24,10 +23,8 @@ type formInfo struct {
composing, compatibility bool // form type composing, compatibility bool // form type
decompose decompFunc decompose decompFunc
info lookupFunc info lookupFunc
boundaryBefore boundaryFunc
boundaryAfter boundaryFunc
} }
var formTable []*formInfo var formTable []*formInfo
...@@ -49,27 +46,17 @@ func init() { ...@@ -49,27 +46,17 @@ func init() {
} }
if Form(i) == NFC || Form(i) == NFKC { if Form(i) == NFC || Form(i) == NFKC {
f.composing = true f.composing = true
f.boundaryBefore = compBoundaryBefore
f.boundaryAfter = compBoundaryAfter
} else {
f.boundaryBefore = decompBoundary
f.boundaryAfter = decompBoundary
} }
} }
} }
func decompBoundary(f *formInfo, info runeInfo) bool { // We do not distinguish between boundaries for NFC, NFD, etc. to avoid
if info.ccc == 0 && info.flags.isYesD() { // Implies isHangul(b) == true // unexpected behavior for the user. For example, in NFD, there is a boundary
return true // after 'a'. However, a might combine with modifiers, so from the application's
} // perspective it is not a good boundary. We will therefore always use the
// We assume that the CCC of the first character in a decomposition // boundaries for the combining variants.
// is always non-zero if different from info.ccc and that we can return func (i runeInfo) boundaryBefore() bool {
// false at this point. This is verified by maketables. if i.ccc == 0 && !i.combinesBackward() {
return false
}
func compBoundaryBefore(f *formInfo, info runeInfo) bool {
if info.ccc == 0 && !info.flags.combinesBackward() {
return true return true
} }
// We assume that the CCC of the first character in a decomposition // We assume that the CCC of the first character in a decomposition
...@@ -78,15 +65,13 @@ func compBoundaryBefore(f *formInfo, info runeInfo) bool { ...@@ -78,15 +65,13 @@ func compBoundaryBefore(f *formInfo, info runeInfo) bool {
return false return false
} }
func compBoundaryAfter(f *formInfo, info runeInfo) bool { func (i runeInfo) boundaryAfter() bool {
// This misses values where the last char in a decomposition is a return i.isInert()
// boundary such as Hangul with JamoT.
return info.isInert()
} }
// We pack quick check data in 4 bits: // We pack quick check data in 4 bits:
// 0: NFD_QC Yes (0) or No (1). No also means there is a decomposition. // 0: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
// 1..2: NFC_QC Yes(00), No (01), or Maybe (11) // 1..2: NFC_QC Yes(00), No (10), or Maybe (11)
// 3: Combines forward (0 == false, 1 == true) // 3: Combines forward (0 == false, 1 == true)
// //
// When all 4 bits are zero, the character is inert, meaning it is never // When all 4 bits are zero, the character is inert, meaning it is never
...@@ -95,15 +80,12 @@ func compBoundaryAfter(f *formInfo, info runeInfo) bool { ...@@ -95,15 +80,12 @@ func compBoundaryAfter(f *formInfo, info runeInfo) bool {
// We pack the bits for both NFC/D and NFKC/D in one byte. // We pack the bits for both NFC/D and NFKC/D in one byte.
type qcInfo uint8 type qcInfo uint8
func (i qcInfo) isYesC() bool { return i&0x2 == 0 } func (i runeInfo) isYesC() bool { return i.flags&0x4 == 0 }
func (i qcInfo) isNoC() bool { return i&0x6 == 0x2 } func (i runeInfo) isYesD() bool { return i.flags&0x1 == 0 }
func (i qcInfo) isMaybe() bool { return i&0x4 != 0 }
func (i qcInfo) isYesD() bool { return i&0x1 == 0 }
func (i qcInfo) isNoD() bool { return i&0x1 != 0 }
func (i qcInfo) combinesForward() bool { return i&0x8 != 0 } func (i runeInfo) combinesForward() bool { return i.flags&0x8 != 0 }
func (i qcInfo) combinesBackward() bool { return i&0x4 != 0 } // == isMaybe func (i runeInfo) combinesBackward() bool { return i.flags&0x2 != 0 } // == isMaybe
func (i qcInfo) hasDecomposition() bool { return i&0x1 != 0 } // == isNoD func (i runeInfo) hasDecomposition() bool { return i.flags&0x1 != 0 } // == isNoD
func (r runeInfo) isInert() bool { func (r runeInfo) isInert() bool {
return r.flags&0xf == 0 && r.ccc == 0 return r.flags&0xf == 0 && r.ccc == 0
...@@ -137,7 +119,7 @@ func decomposeNFKC(s input, i int) []byte { ...@@ -137,7 +119,7 @@ func decomposeNFKC(s input, i int) []byte {
// Note that the recomposition map for NFC and NFKC are identical. // Note that the recomposition map for NFC and NFKC are identical.
// combine returns the combined rune or 0 if it doesn't exist. // combine returns the combined rune or 0 if it doesn't exist.
func combine(a, b uint32) uint32 { func combine(a, b rune) rune {
key := uint32(uint16(a))<<16 + uint32(uint16(b)) key := uint32(uint16(a))<<16 + uint32(uint16(b))
return recompMap[key] return recompMap[key]
} }
...@@ -148,10 +130,10 @@ func combine(a, b uint32) uint32 { ...@@ -148,10 +130,10 @@ func combine(a, b uint32) uint32 {
// 12..15 qcInfo for NFKC/NFKD // 12..15 qcInfo for NFKC/NFKD
func lookupInfoNFC(b input, i int) runeInfo { func lookupInfoNFC(b input, i int) runeInfo {
v, sz := b.charinfo(i) v, sz := b.charinfo(i)
return runeInfo{0, uint8(sz), uint8(v), qcInfo(v >> 8)} return runeInfo{size: uint8(sz), ccc: uint8(v), flags: qcInfo(v >> 8)}
} }
func lookupInfoNFKC(b input, i int) runeInfo { func lookupInfoNFKC(b input, i int) runeInfo {
v, sz := b.charinfo(i) v, sz := b.charinfo(i)
return runeInfo{0, uint8(sz), uint8(v), qcInfo(v >> 12)} return runeInfo{size: uint8(sz), ccc: uint8(v), flags: qcInfo(v >> 12)}
} }
...@@ -14,7 +14,7 @@ type input interface { ...@@ -14,7 +14,7 @@ type input interface {
charinfo(p int) (uint16, int) charinfo(p int) (uint16, int)
decomposeNFC(p int) uint16 decomposeNFC(p int) uint16
decomposeNFKC(p int) uint16 decomposeNFKC(p int) uint16
hangul(p int) uint32 hangul(p int) rune
} }
type inputString string type inputString string
...@@ -54,12 +54,12 @@ func (s inputString) decomposeNFKC(p int) uint16 { ...@@ -54,12 +54,12 @@ func (s inputString) decomposeNFKC(p int) uint16 {
return nfkcDecompTrie.lookupStringUnsafe(string(s[p:])) return nfkcDecompTrie.lookupStringUnsafe(string(s[p:]))
} }
func (s inputString) hangul(p int) uint32 { func (s inputString) hangul(p int) rune {
if !isHangulString(string(s[p:])) { if !isHangulString(string(s[p:])) {
return 0 return 0
} }
rune, _ := utf8.DecodeRuneInString(string(s[p:])) rune, _ := utf8.DecodeRuneInString(string(s[p:]))
return uint32(rune) return rune
} }
type inputBytes []byte type inputBytes []byte
...@@ -96,10 +96,10 @@ func (s inputBytes) decomposeNFKC(p int) uint16 { ...@@ -96,10 +96,10 @@ func (s inputBytes) decomposeNFKC(p int) uint16 {
return nfkcDecompTrie.lookupUnsafe(s[p:]) return nfkcDecompTrie.lookupUnsafe(s[p:])
} }
func (s inputBytes) hangul(p int) uint32 { func (s inputBytes) hangul(p int) rune {
if !isHangul(s[p:]) { if !isHangul(s[p:]) {
return 0 return 0
} }
rune, _ := utf8.DecodeRune(s[p:]) rune, _ := utf8.DecodeRune(s[p:])
return uint32(rune) return rune
} }
...@@ -562,7 +562,7 @@ func makeEntry(f *FormInfo) uint16 { ...@@ -562,7 +562,7 @@ func makeEntry(f *FormInfo) uint16 {
switch f.quickCheck[MComposed] { switch f.quickCheck[MComposed] {
case QCYes: case QCYes:
case QCNo: case QCNo:
e |= 0x2 e |= 0x4
case QCMaybe: case QCMaybe:
e |= 0x6 e |= 0x6
default: default:
...@@ -718,7 +718,7 @@ func makeTables() { ...@@ -718,7 +718,7 @@ func makeTables() {
sz := nrentries * 8 sz := nrentries * 8
size += sz size += sz
fmt.Printf("// recompMap: %d bytes (entries only)\n", sz) fmt.Printf("// recompMap: %d bytes (entries only)\n", sz)
fmt.Println("var recompMap = map[uint32]uint32{") fmt.Println("var recompMap = map[uint32]rune{")
for i, c := range chars { for i, c := range chars {
f := c.forms[FCanonical] f := c.forms[FCanonical]
d := f.decomp d := f.decomp
......
...@@ -188,11 +188,11 @@ func doAppend(rb *reorderBuffer, out []byte, p int) []byte { ...@@ -188,11 +188,11 @@ func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
var info runeInfo var info runeInfo
if p < n { if p < n {
info = fd.info(src, p) info = fd.info(src, p)
if p == 0 && !fd.boundaryBefore(fd, info) { if p == 0 && !info.boundaryBefore() {
out = decomposeToLastBoundary(rb, out) out = decomposeToLastBoundary(rb, out)
} }
} }
if info.size == 0 || fd.boundaryBefore(fd, info) { if info.size == 0 || info.boundaryBefore() {
if fd.composing { if fd.composing {
rb.compose() rb.compose()
} }
...@@ -257,11 +257,11 @@ func quickSpan(rb *reorderBuffer, i int) int { ...@@ -257,11 +257,11 @@ func quickSpan(rb *reorderBuffer, i int) int {
} }
cc := info.ccc cc := info.ccc
if rb.f.composing { if rb.f.composing {
if !info.flags.isYesC() { if !info.isYesC() {
break break
} }
} else { } else {
if !info.flags.isYesD() { if !info.isYesD() {
break break
} }
} }
...@@ -316,13 +316,13 @@ func firstBoundary(rb *reorderBuffer) int { ...@@ -316,13 +316,13 @@ func firstBoundary(rb *reorderBuffer) int {
} }
fd := &rb.f fd := &rb.f
info := fd.info(src, i) info := fd.info(src, i)
for n := 0; info.size != 0 && !fd.boundaryBefore(fd, info); { for n := 0; info.size != 0 && !info.boundaryBefore(); {
i += int(info.size) i += int(info.size)
if n++; n >= maxCombiningChars { if n++; n >= maxCombiningChars {
return i return i
} }
if i >= nsrc { if i >= nsrc {
if !fd.boundaryAfter(fd, info) { if !info.boundaryAfter() {
return -1 return -1
} }
return nsrc return nsrc
...@@ -368,11 +368,11 @@ func lastBoundary(fd *formInfo, b []byte) int { ...@@ -368,11 +368,11 @@ func lastBoundary(fd *formInfo, b []byte) int {
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8 if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
return i return i
} }
if fd.boundaryAfter(fd, info) { if info.boundaryAfter() {
return i return i
} }
i = p i = p
for n := 0; i >= 0 && !fd.boundaryBefore(fd, info); { for n := 0; i >= 0 && !info.boundaryBefore(); {
info, p = lastRuneStart(fd, b[:i]) info, p = lastRuneStart(fd, b[:i])
if n++; n >= maxCombiningChars { if n++; n >= maxCombiningChars {
return len(b) return len(b)
...@@ -404,7 +404,7 @@ func decomposeSegment(rb *reorderBuffer, sp int) int { ...@@ -404,7 +404,7 @@ func decomposeSegment(rb *reorderBuffer, sp int) int {
break break
} }
info = rb.f.info(rb.src, sp) info = rb.f.info(rb.src, sp)
bound := rb.f.boundaryBefore(&rb.f, info) bound := info.boundaryBefore()
if bound || info.size == 0 { if bound || info.size == 0 {
break break
} }
...@@ -419,7 +419,7 @@ func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) { ...@@ -419,7 +419,7 @@ func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- { for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
} }
if p < 0 { if p < 0 {
return runeInfo{0, 0, 0, 0}, -1 return runeInfo{}, -1
} }
return fd.info(inputBytes(buf), p), p return fd.info(inputBytes(buf), p), p
} }
...@@ -433,7 +433,7 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte { ...@@ -433,7 +433,7 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
// illegal trailing continuation bytes // illegal trailing continuation bytes
return buf return buf
} }
if rb.f.boundaryAfter(fd, info) { if info.boundaryAfter() {
return buf return buf
} }
var add [maxBackRunes]runeInfo // stores runeInfo in reverse order var add [maxBackRunes]runeInfo // stores runeInfo in reverse order
...@@ -441,13 +441,13 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte { ...@@ -441,13 +441,13 @@ func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
padd := 1 padd := 1
n := 1 n := 1
p := len(buf) - int(info.size) p := len(buf) - int(info.size)
for ; p >= 0 && !rb.f.boundaryBefore(fd, info); p -= int(info.size) { for ; p >= 0 && !info.boundaryBefore(); p -= int(info.size) {
info, i = lastRuneStart(fd, buf[:p]) info, i = lastRuneStart(fd, buf[:p])
if int(info.size) != p-i { if int(info.size) != p-i {
break break
} }
// Check that decomposition doesn't result in overflow. // Check that decomposition doesn't result in overflow.
if info.flags.hasDecomposition() { if info.hasDecomposition() {
dcomp := rb.f.decompose(inputBytes(buf), p-int(info.size)) dcomp := rb.f.decompose(inputBytes(buf), p-int(info.size))
for i := 0; i < len(dcomp); { for i := 0; i < len(dcomp); {
inf := rb.f.info(inputBytes(dcomp), i) inf := rb.f.info(inputBytes(dcomp), i)
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment