forminfo.go 5.61 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

// This file contains Form-specific logic and wrappers for data in tables.go.

// runeInfo bundles the per-rune properties used by this package: its
// position in the reorder buffer, its encoded size, its canonical combining
// class and its quick check flags.
//
// The lookupInfo* functions in this file build runeInfo values with
// positional (unkeyed) composite literals, so the order of the fields below
// must not change.
type runeInfo struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // canonical combining class
	flags qcInfo // quick check flags
}

// Function types that are dispatched per normalization form. Each formInfo
// stores one value of every kind.
type (
	// boundaryFunc reports whether the rune described by info is a
	// boundary for the form f.
	boundaryFunc func(f *formInfo, info runeInfo) bool

	// lookupFunc returns the runeInfo for the rune at the start of b.
	lookupFunc func(b []byte) runeInfo

	// lookupFuncString is the string counterpart of lookupFunc.
	lookupFuncString func(s string) runeInfo

	// decompFunc returns the decomposition bytes for the rune at the
	// start of b.
	decompFunc func(b []byte) []byte

	// decompFuncString is the string counterpart of decompFunc.
	decompFuncString func(s string) []byte
)

// formInfo holds Form-specific functions and tables.
//
// One formInfo per form is created by init and stored in formTable.
type formInfo struct {
	form Form // the normalization form this entry describes

	// composing is set for NFC and NFKC; compatibility is set for NFKC
	// and NFKD (see init).
	composing, compatibility bool // form type

	decompose       decompFunc       // decomposition lookup for []byte input
	decomposeString decompFuncString // decomposition lookup for string input
	info            lookupFunc       // runeInfo lookup for []byte input
	infoString      lookupFuncString // runeInfo lookup for string input
	boundaryBefore  boundaryFunc     // boundary test applied before a rune
	boundaryAfter   boundaryFunc     // boundary test applied after a rune
}

// formTable holds one formInfo per normalization form, indexed by the
// numeric value of the Form. It is filled in by init.
var formTable []*formInfo

// init populates formTable with the function set that matches each form:
// compatibility forms (NFKC, NFKD) use the NFKC decomposition and lookup
// functions, all others use the NFC ones; composing forms (NFC, NFKC) use
// the composition boundary functions, all others use decompBoundary.
func init() {
	formTable = make([]*formInfo, 4)

	for idx := range formTable {
		form := Form(idx)
		isCompat := form == NFKD || form == NFKC
		isComposing := form == NFC || form == NFKC

		fi := &formInfo{
			form:          form,
			composing:     isComposing,
			compatibility: isCompat,
		}

		// Start from the canonical function set and switch to the
		// compatibility one where required.
		fi.decompose, fi.decomposeString = decomposeNFC, decomposeStringNFC
		fi.info, fi.infoString = lookupInfoNFC, lookupInfoStringNFC
		if isCompat {
			fi.decompose, fi.decomposeString = decomposeNFKC, decomposeStringNFKC
			fi.info, fi.infoString = lookupInfoNFKC, lookupInfoStringNFKC
		}

		// Decomposing forms use the same test on both sides of a rune.
		fi.boundaryBefore, fi.boundaryAfter = decompBoundary, decompBoundary
		if isComposing {
			fi.boundaryBefore, fi.boundaryAfter = compBoundaryBefore, compBoundaryAfter
		}

		formTable[idx] = fi
	}
}

// decompBoundary is the boundary test used, both before and after a rune,
// by the non-composing forms. It reports true only when the rune has a
// canonical combining class of zero and its quick check flags say NFD_QC
// Yes. The f argument is not consulted; it is present so the function
// satisfies boundaryFunc.
func decompBoundary(f *formInfo, info runeInfo) bool {
	// Original note on the positive case: implies isHangul(b) == true.
	//
	// For every other rune we assume that the CCC of the first character
	// in a decomposition is always non-zero if different from info.ccc,
	// so answering false is safe. This is verified by maketables.
	return info.ccc == 0 && info.flags.isYesD()
}

func compBoundaryBefore(f *formInfo, info runeInfo) bool {
80
	if info.ccc == 0 && !info.flags.combinesBackward() {
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
		return true
	}
	// We assume that the CCC of the first character in a decomposition
	// is always non-zero if different from info.ccc and that we can return
	// false at this point. This is verified by maketables.
	return false
}

// compBoundaryAfter is the boundaryAfter test used by the composing forms.
// It reports whether the rune's quick check flags mark it as inert. The f
// argument is not consulted; it is present so the function satisfies
// boundaryFunc.
func compBoundaryAfter(f *formInfo, info runeInfo) bool {
	// Known limitation: this misses values where the last char in a
	// decomposition is a boundary, such as Hangul with JamoT.
	flags := info.flags
	return flags.isInert()
}

// qcInfo holds packed quick check data. Each form family uses 4 bits:
//   0:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//   1..2: NFC_QC Yes(00), No (01), or Maybe (11)
//   3:    Combines forward  (0 == false, 1 == true)
//
// When all 4 bits are zero, the character is inert, meaning it is never
// influenced by normalization.
//
// The bits for both NFC/D and NFKC/D are packed in one byte; the methods
// below only inspect the low 4 bits.
type qcInfo uint8

// isYesC reports NFC_QC Yes: bit 1 is clear.
func (q qcInfo) isYesC() bool {
	return q&0x2 == 0
}

// isNoC reports NFC_QC No: bit 1 is set and bit 2 is clear.
func (q qcInfo) isNoC() bool {
	return q&0x2 != 0 && q&0x4 == 0
}

// isMaybe reports NFC_QC Maybe: bit 2 is set.
func (q qcInfo) isMaybe() bool {
	return q&0x4 != 0
}

// isYesD reports NFD_QC Yes: bit 0 is clear.
func (q qcInfo) isYesD() bool {
	return !q.isNoD()
}

// isNoD reports NFD_QC No: bit 0 is set.
func (q qcInfo) isNoD() bool {
	return q&0x1 != 0
}

// isInert reports whether all four quick check bits are zero.
func (q qcInfo) isInert() bool {
	return q&0xf == 0
}

// combinesForward reports whether bit 3 is set.
func (q qcInfo) combinesForward() bool {
	return q&0x8 != 0
}

// combinesBackward is equivalent to isMaybe.
func (q qcInfo) combinesBackward() bool {
	return q.isMaybe()
}

// hasDecomposition is equivalent to isNoD.
func (q qcInfo) hasDecomposition() bool {
	return q.isNoD()
}

// Wrappers for tables.go

// The 16-bit value of the decomposition tries is an index into a byte
// array of UTF-8 decomposition sequences. The first byte is the number
// of bytes in the decomposition (excluding this length byte). The actual
// sequence starts at the offset+1.

// decomposeNFC returns the decomposition sequence that nfcDecompTrie maps
// the start of b to. The result is a slice of decomps, not a copy.
func decomposeNFC(b []byte) []byte {
	idx := nfcDecompTrie.lookupUnsafe(b)
	length := uint16(decomps[idx]) // length byte precedes the sequence
	start := idx + 1
	return decomps[start : start+length]
}

// decomposeNFKC returns the decomposition sequence that nfkcDecompTrie maps
// the start of b to. The trie value indexes decomps: the byte at that index
// is the sequence length and the sequence itself follows it. The result is
// a slice of decomps, not a copy.
func decomposeNFKC(b []byte) []byte {
	idx := nfkcDecompTrie.lookupUnsafe(b)
	length := uint16(decomps[idx]) // length byte precedes the sequence
	start := idx + 1
	return decomps[start : start+length]
}

// decomposeStringNFC is the string-input counterpart of decomposeNFC: it
// returns the decomposition sequence that nfcDecompTrie maps the start of s
// to. The trie value indexes decomps: the byte at that index is the
// sequence length and the sequence itself follows it. The result is a slice
// of decomps, not a copy.
func decomposeStringNFC(s string) []byte {
	idx := nfcDecompTrie.lookupStringUnsafe(s)
	length := uint16(decomps[idx]) // length byte precedes the sequence
	start := idx + 1
	return decomps[start : start+length]
}

// decomposeStringNFKC is the string-input counterpart of decomposeNFKC: it
// returns the decomposition sequence that nfkcDecompTrie maps the start of
// s to. The trie value indexes decomps: the byte at that index is the
// sequence length and the sequence itself follows it. The result is a slice
// of decomps, not a copy.
func decomposeStringNFKC(s string) []byte {
	idx := nfkcDecompTrie.lookupStringUnsafe(s)
	length := uint16(decomps[idx]) // length byte precedes the sequence
	start := idx + 1
	return decomps[start : start+length]
}

// Recomposition
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
// This clips off the bits of three entries, but we know this will not
// result in a collision. In the unlikely event that changes to
// UnicodeData.txt introduce collisions, the compiler will catch it.
// Note that the recomposition map for NFC and NFKC are identical.

// combine returns the combined rune or 0 if it doesn't exist.
//
// The lookup key packs the low 16 bits of a into the upper half and the low
// 16 bits of b into the lower half of a 32-bit value; any higher bits of
// either argument are discarded.
func combine(a, b uint32) uint32 {
	hi := (a & 0xffff) << 16
	lo := b & 0xffff
	return recompMap[hi|lo]
}

// The 16-bit character info has the following bit layout:
//    0..7   CCC value.
//    8..11  qcInfo for NFC/NFD
//   12..15  qcInfo for NFKC/NFKD

// lookupInfoNFC looks up the rune at the start of b in charInfoTrie and
// returns its size, CCC and quick check flags. The flags are taken from bit
// 8 upward, so the NFC/NFD bits land in the low 4 bits of flags and the
// NFKC/NFKD bits in the high 4. pos is left at zero.
func lookupInfoNFC(b []byte) runeInfo {
	value, size := charInfoTrie.lookup(b)
	return runeInfo{
		pos:   0,
		size:  uint8(size),
		ccc:   uint8(value),
		flags: qcInfo(value >> 8),
	}
}

// lookupInfoStringNFC is the string-input counterpart of lookupInfoNFC. It
// looks up the rune at the start of s in charInfoTrie and returns its size,
// its CCC (the low 8 bits of the trie value) and its quick check flags (the
// trie value from bit 8 upward). pos is left at zero.
func lookupInfoStringNFC(s string) runeInfo {
	value, size := charInfoTrie.lookupString(s)
	return runeInfo{
		pos:   0,
		size:  uint8(size),
		ccc:   uint8(value),
		flags: qcInfo(value >> 8),
	}
}

// lookupInfoNFKC looks up the rune at the start of b in charInfoTrie and
// returns its size, its CCC (the low 8 bits of the trie value) and its
// NFKC/NFKD quick check flags (bits 12..15 of the trie value). pos is left
// at zero.
func lookupInfoNFKC(b []byte) runeInfo {
	value, size := charInfoTrie.lookup(b)
	return runeInfo{
		pos:   0,
		size:  uint8(size),
		ccc:   uint8(value),
		flags: qcInfo(value >> 12),
	}
}

// lookupInfoStringNFKC is the string-input counterpart of lookupInfoNFKC.
// It looks up the rune at the start of s in charInfoTrie and returns its
// size, its CCC (the low 8 bits of the trie value) and its NFKC/NFKD quick
// check flags (bits 12..15 of the trie value). pos is left at zero.
func lookupInfoStringNFKC(s string) runeInfo {
	value, size := charInfoTrie.lookupString(s)
	return runeInfo{
		pos:   0,
		size:  uint8(size),
		ccc:   uint8(value),
		flags: qcInfo(value >> 12),
	}
}