Commit ef48dfa3 authored by Marcel van Lohuizen

exp/locale/collate: added indices to builder for reusing blocks between locales.

Refactored build + buildTrie into build + buildOrdering.
Note that since the tailoring code is not checked in yet, all tailorings are identical
to root.  The table therefore should not and does not grow at this point.

R=r
CC=golang-dev
https://golang.org/cl/6500087
parent 21d94a22
...@@ -42,6 +42,11 @@ type Builder struct { ...@@ -42,6 +42,11 @@ type Builder struct {
minNonVar int // lowest primary recorded for a variable minNonVar int // lowest primary recorded for a variable
varTop int // highest primary recorded for a non-variable varTop int // highest primary recorded for a non-variable
// indexes used for reusing expansions and contractions
expIndex map[string]int // positions of expansions keyed by their string representation
ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes
ctElem map[string]int // contraction elements keyed by their string representation
} }
// A Tailoring builds a collation table based on another collation table. // A Tailoring builds a collation table based on another collation table.
...@@ -51,16 +56,20 @@ type Builder struct { ...@@ -51,16 +56,20 @@ type Builder struct {
// of languages (See http://www.unicode.org/Public/cldr/2.0.1/core.zip.) // of languages (See http://www.unicode.org/Public/cldr/2.0.1/core.zip.)
type Tailoring struct { type Tailoring struct {
id string id string
builder *Builder
index *ordering
// TODO: implement. // TODO: implement.
} }
// NewBuilder returns a new Builder. // NewBuilder returns a new Builder.
func NewBuilder() *Builder { func NewBuilder() *Builder {
b := &Builder{ return &Builder{
index: newTrieBuilder(), index: newTrieBuilder(),
root: makeRootOrdering(), root: makeRootOrdering(),
expIndex: make(map[string]int),
ctHandle: make(map[string]ctHandle),
ctElem: make(map[string]int),
} }
return b
} }
// Tailoring returns a Tailoring for the given locale. One should // Tailoring returns a Tailoring for the given locale. One should
...@@ -68,6 +77,8 @@ func NewBuilder() *Builder { ...@@ -68,6 +77,8 @@ func NewBuilder() *Builder {
func (b *Builder) Tailoring(locale string) *Tailoring { func (b *Builder) Tailoring(locale string) *Tailoring {
t := &Tailoring{ t := &Tailoring{
id: locale, id: locale,
builder: b,
index: b.root.clone(),
} }
b.locale = append(b.locale, t) b.locale = append(b.locale, t)
return t return t
...@@ -194,24 +205,45 @@ func (b *Builder) error(e error) { ...@@ -194,24 +205,45 @@ func (b *Builder) error(e error) {
} }
} }
// buildOrdering prepares o for table generation and registers its trie.
// It sorts and simplifies the ordering, processes its expansions and
// contractions, and then inserts the encoded collation element of every
// non-skipped entry into a fresh trie keyed by the entry's first rune.
// The handle returned by b.index.addTrie is stored in o.handle.
// Encoding errors are reported through b.error; the entry is still inserted.
func (b *Builder) buildOrdering(o *ordering) {
	o.sort()
	simplify(o)
	// Both processing steps below require simplify to have run first.
	b.processExpansions(o)
	b.processContractions(o)

	root := newNode()
	for ent := o.front(); ent != nil; ent, _ = ent.nextIndexed() {
		if ent.skip() {
			continue
		}
		elem, err := ent.encode()
		b.error(err)
		root.insert(ent.runes[0], elem)
	}
	o.handle = b.index.addTrie(root)
}
func (b *Builder) build() (*table, error) { func (b *Builder) build() (*table, error) {
if !b.built { if b.built {
return b.t, b.err
}
b.built = true b.built = true
b.t = &table{ b.t = &table{
maxContractLen: utf8.UTFMax, maxContractLen: utf8.UTFMax,
variableTop: uint32(b.varTop), variableTop: uint32(b.varTop),
} }
b.root.sort() b.buildOrdering(&b.root)
b.simplify() b.t.root = b.root.handle
b.processExpansions() // requires simplify for _, t := range b.locale {
b.processContractions() // requires simplify b.buildOrdering(t.index)
b.buildTrie() // requires process*
}
if b.err != nil { if b.err != nil {
return nil, b.err break
} }
return b.t, nil }
i, err := b.index.generate()
b.t.index = *i
b.error(err)
return b.t, b.err
} }
// Build builds the root Collator. // Build builds the root Collator.
...@@ -263,12 +295,10 @@ func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool { ...@@ -263,12 +295,10 @@ func reproducibleFromNFKD(e *entry, exp, nfkd [][]int) bool {
return true return true
} }
func (b *Builder) simplify() { func simplify(o *ordering) {
// Runes that are a starter of a contraction should not be removed. // Runes that are a starter of a contraction should not be removed.
// (To date, there is only Kannada character 0CCA.) // (To date, there is only Kannada character 0CCA.)
keep := make(map[rune]bool) keep := make(map[rune]bool)
o := b.root
for e := o.front(); e != nil; e, _ = e.nextIndexed() { for e := o.front(); e != nil; e, _ = e.nextIndexed() {
if len(e.runes) > 1 { if len(e.runes) > 1 {
keep[e.runes[0]] = true keep[e.runes[0]] = true
...@@ -320,27 +350,24 @@ func (b *Builder) appendExpansion(e *entry) int { ...@@ -320,27 +350,24 @@ func (b *Builder) appendExpansion(e *entry) int {
// processExpansions extracts data necessary to generate // processExpansions extracts data necessary to generate
// the extraction tables. // the extraction tables.
func (b *Builder) processExpansions() { func (b *Builder) processExpansions(o *ordering) {
eidx := make(map[string]int)
o := b.root
for e := o.front(); e != nil; e, _ = e.nextIndexed() { for e := o.front(); e != nil; e, _ = e.nextIndexed() {
if !e.expansion() { if !e.expansion() {
continue continue
} }
key := fmt.Sprintf("%v", e.elems) key := fmt.Sprintf("%v", e.elems)
i, ok := eidx[key] i, ok := b.expIndex[key]
if !ok { if !ok {
i = b.appendExpansion(e) i = b.appendExpansion(e)
eidx[key] = i b.expIndex[key] = i
} }
e.expansionIndex = i e.expansionIndex = i
} }
} }
func (b *Builder) processContractions() { func (b *Builder) processContractions(o *ordering) {
// Collate contractions per starter rune. // Collate contractions per starter rune.
starters := []rune{} starters := []rune{}
o := b.root
cm := make(map[rune][]*entry) cm := make(map[rune][]*entry)
for e := o.front(); e != nil; e, _ = e.nextIndexed() { for e := o.front(); e != nil; e, _ = e.nextIndexed() {
if e.contraction() { if e.contraction() {
...@@ -365,7 +392,6 @@ func (b *Builder) processContractions() { ...@@ -365,7 +392,6 @@ func (b *Builder) processContractions() {
} }
// Build the tries for the contractions. // Build the tries for the contractions.
t := b.t t := b.t
handlemap := make(map[string]ctHandle)
for _, r := range starters { for _, r := range starters {
l := cm[r] l := cm[r]
// Compute suffix strings. There are 31 different contraction suffix // Compute suffix strings. There are 31 different contraction suffix
...@@ -387,14 +413,14 @@ func (b *Builder) processContractions() { ...@@ -387,14 +413,14 @@ func (b *Builder) processContractions() {
// Unique the suffix set. // Unique the suffix set.
sort.Strings(sufx) sort.Strings(sufx)
key := strings.Join(sufx, "\n") key := strings.Join(sufx, "\n")
handle, ok := handlemap[key] handle, ok := b.ctHandle[key]
if !ok { if !ok {
var err error var err error
handle, err = t.contractTries.appendTrie(sufx) handle, err = t.contractTries.appendTrie(sufx)
if err != nil { if err != nil {
b.error(err) b.error(err)
} }
handlemap[key] = handle b.ctHandle[key] = handle
} }
// Bucket sort entries in index order. // Bucket sort entries in index order.
es := make([]*entry, len(l)) es := make([]*entry, len(l))
...@@ -412,30 +438,22 @@ func (b *Builder) processContractions() { ...@@ -412,30 +438,22 @@ func (b *Builder) processContractions() {
} }
es[o] = e es[o] = e
} }
// Store info in entry for starter rune. // Create collation elements for contractions.
es[0].contractionIndex = len(t.contractElem) elems := []uint32{}
es[0].contractionHandle = handle
// Add collation elements for contractions.
for _, e := range es { for _, e := range es {
ce, err := e.encodeBase() ce, err := e.encodeBase()
b.error(err) b.error(err)
t.contractElem = append(t.contractElem, ce) elems = append(elems, ce)
} }
key = fmt.Sprintf("%v", elems)
i, ok := b.ctElem[key]
if !ok {
i = len(t.contractElem)
b.ctElem[key] = i
t.contractElem = append(t.contractElem, elems...)
} }
} // Store info in entry for starter rune.
es[0].contractionIndex = i
func (b *Builder) buildTrie() { es[0].contractionHandle = handle
t := newNode()
o := b.root
for e := o.front(); e != nil; e, _ = e.nextIndexed() {
if !e.skip() {
ce, err := e.encode()
b.error(err)
t.insert(e.runes[0], ce)
}
} }
b.t.root = b.index.addTrie(t)
i, err := b.index.generate()
b.t.index = *i
b.error(err)
} }
...@@ -143,8 +143,8 @@ var simplifyMarked = strArray{"\u01C5"} ...@@ -143,8 +143,8 @@ var simplifyMarked = strArray{"\u01C5"}
func TestSimplify(t *testing.T) { func TestSimplify(t *testing.T) {
b := newBuilder(t, simplifyTest) b := newBuilder(t, simplifyTest)
o := b.root o := &b.root
b.simplify() simplify(o)
for i, tt := range simplifyTest { for i, tt := range simplifyTest {
if simplifyRemoved.contains(tt.str) { if simplifyRemoved.contains(tt.str) {
...@@ -186,7 +186,7 @@ func TestExpand(t *testing.T) { ...@@ -186,7 +186,7 @@ func TestExpand(t *testing.T) {
) )
b := newBuilder(t, expandTest) b := newBuilder(t, expandTest)
o := &b.root o := &b.root
b.processExpansions() b.processExpansions(o)
e := o.front() e := o.front()
for _, tt := range expandTest { for _, tt := range expandTest {
...@@ -234,7 +234,7 @@ func TestContract(t *testing.T) { ...@@ -234,7 +234,7 @@ func TestContract(t *testing.T) {
) )
b := newBuilder(t, contractTest) b := newBuilder(t, contractTest)
o := &b.root o := &b.root
b.processContractions() b.processContractions(o)
indexMap := make(map[int]bool) indexMap := make(map[int]bool)
handleMap := make(map[rune]*entry) handleMap := make(map[rune]*entry)
......
...@@ -180,6 +180,7 @@ func (s sortedEntries) Less(i, j int) bool { ...@@ -180,6 +180,7 @@ func (s sortedEntries) Less(i, j int) bool {
type ordering struct { type ordering struct {
entryMap map[string]*entry entryMap map[string]*entry
ordered []*entry ordered []*entry
handle *trieHandle
} }
// insert inserts e into both entryMap and ordered. // insert inserts e into both entryMap and ordered.
...@@ -264,6 +265,7 @@ func (o *ordering) clone() *ordering { ...@@ -264,6 +265,7 @@ func (o *ordering) clone() *ordering {
str: e.str, str: e.str,
decompose: e.decompose, decompose: e.decompose,
exclude: e.exclude, exclude: e.exclude,
logical: e.logical,
} }
oo.insert(ne) oo.insert(ne)
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment