mirror of git://gcc.gnu.org/git/gcc.git
587 lines
15 KiB
Go
587 lines
15 KiB
Go
// Copyright 2012 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package collate contains types for comparing and sorting Unicode strings
|
|
// according to a given collation order. Package locale provides a high-level
|
|
// interface to collation. Users should typically use that package instead.
|
|
package collate
|
|
|
|
import (
|
|
"bytes"
|
|
"exp/norm"
|
|
)
|
|
|
|
// AlternateHandling identifies the various ways in which variables are handled.
|
|
// A rune with a primary weight lower than the variable top is considered a
|
|
// variable.
|
|
// See http://www.unicode.org/reports/tr10/#Variable_Weighting for details.
|
|
type AlternateHandling int
|
|
|
|
const (
|
|
// AltNonIgnorable turns off special handling of variables.
|
|
AltNonIgnorable AlternateHandling = iota
|
|
|
|
// AltBlanked sets variables and all subsequent primary ignorables to be
|
|
// ignorable at all levels. This is identical to removing all variables
|
|
// and subsequent primary ignorables from the input.
|
|
AltBlanked
|
|
|
|
// AltShifted sets variables to be ignorable for levels one through three and
|
|
// adds a fourth level based on the values of the ignored levels.
|
|
AltShifted
|
|
|
|
// AltShiftTrimmed is a slight variant of AltShifted that is used to
|
|
// emulate POSIX.
|
|
AltShiftTrimmed
|
|
)
|
|
|
|
// Collator provides functionality for comparing strings for a given
|
|
// collation order.
|
|
type Collator struct {
|
|
// TODO: hide most of these options. Low-level options are set through the locale
|
|
// identifier (as defined by LDML) while high-level options are set through SetOptions.
|
|
// Using high-level options allows us to be more flexible (such as not ignoring
|
|
// Thai vowels for IgnoreDiacriticals) and more user-friendly (such as allowing
|
|
// diacritical marks to be ignored but not case without having to fiddle with levels).
|
|
|
|
// Strength sets the maximum level to use in comparison.
|
|
Strength Level
|
|
|
|
// Alternate specifies an alternative handling of variables.
|
|
Alternate AlternateHandling
|
|
|
|
// Backwards specifies the order of sorting at the secondary level.
|
|
// This option exists predominantly to support reverse sorting of accents in French.
|
|
Backwards bool
|
|
|
|
// TODO: implement:
|
|
// With HiraganaQuaternary enabled, Hiragana codepoints will get lower values
|
|
// than all the other non-variable code points. Strength must be greater or
|
|
// equal to Quaternary for this to take effect.
|
|
HiraganaQuaternary bool
|
|
|
|
// If CaseLevel is true, a level consisting only of case characteristics will
|
|
// be inserted in front of the tertiary level. To ignore accents but take
|
|
// cases into account, set Strength to Primary and CaseLevel to true.
|
|
CaseLevel bool
|
|
|
|
// If Numeric is true, any sequence of decimal digits (category is Nd) is sorted
|
|
// at a primary level with its numeric value. For example, "A-21" < "A-123".
|
|
Numeric bool
|
|
|
|
// The largest primary value that is considered to be variable.
|
|
variableTop uint32
|
|
|
|
f norm.Form
|
|
|
|
t Weigher
|
|
|
|
sorter sorter
|
|
|
|
_iter [2]iter
|
|
}
|
|
|
|
// An Option is used to change the behavior of Collator. They override the
|
|
// settings passed through the locale identifier.
|
|
type Option int
|
|
|
|
const (
|
|
Numeric Option = 1 << iota // Sort numbers numerically ("2" < "12").
|
|
IgnoreCase // Case-insensitive search.
|
|
IgnoreDiacritics // Ignore diacritical marks. ("o" == "ö").
|
|
IgnoreWidth // Ignore full versus normal width.
|
|
UpperFirst // Sort upper case before lower case.
|
|
LowerFirst // Sort lower case before upper case.
|
|
Force // Force ordering if strings are equivalent but not equal.
|
|
|
|
Loose = IgnoreDiacritics | IgnoreWidth | IgnoreCase
|
|
)
|
|
|
|
// SetOptions accepts a Options or-ed together. All previous calls to SetOptions are ignored.
|
|
func (c *Collator) SetOptions(o Option) {
|
|
// TODO: implement
|
|
}
|
|
|
|
func (c *Collator) iter(i int) *iter {
|
|
// TODO: evaluate performance for making the second iterator optional.
|
|
return &c._iter[i]
|
|
}
|
|
|
|
// Locales returns the list of locales for which collating differs from its parent locale.
|
|
// The returned value should not be modified.
|
|
func Locales() []string {
|
|
return availableLocales
|
|
}
|
|
|
|
// New returns a new Collator initialized for the given locale.
|
|
func New(loc string) *Collator {
|
|
// TODO: handle locale selection according to spec.
|
|
var t tableIndex
|
|
if loc != "" {
|
|
if idx, ok := locales[loc]; ok {
|
|
t = idx
|
|
} else {
|
|
t = locales["root"]
|
|
}
|
|
}
|
|
return NewFromTable(Init(t))
|
|
}
|
|
|
|
func NewFromTable(t Weigher) *Collator {
|
|
c := &Collator{
|
|
Strength: Tertiary,
|
|
f: norm.NFD,
|
|
t: t,
|
|
}
|
|
c._iter[0].init(c)
|
|
c._iter[1].init(c)
|
|
return c
|
|
}
|
|
|
|
// Buffer holds keys generated by Key and KeyString.
|
|
type Buffer struct {
|
|
buf [4096]byte
|
|
key []byte
|
|
}
|
|
|
|
func (b *Buffer) init() {
|
|
if b.key == nil {
|
|
b.key = b.buf[:0]
|
|
}
|
|
}
|
|
|
|
// Reset clears the buffer from previous results generated by Key and KeyString.
|
|
func (b *Buffer) Reset() {
|
|
b.key = b.key[:0]
|
|
}
|
|
|
|
// Compare returns an integer comparing the two byte slices.
|
|
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
|
|
func (c *Collator) Compare(a, b []byte) int {
|
|
// TODO: skip identical prefixes once we have a fast way to detect if a rune is
|
|
// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
|
|
c.iter(0).setInput(a)
|
|
c.iter(1).setInput(b)
|
|
if res := c.compare(); res != 0 {
|
|
return res
|
|
}
|
|
if Identity == c.Strength {
|
|
return bytes.Compare(a, b)
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// CompareString returns an integer comparing the two strings.
|
|
// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
|
|
func (c *Collator) CompareString(a, b string) int {
|
|
// TODO: skip identical prefixes once we have a fast way to detect if a rune is
|
|
// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
|
|
c.iter(0).setInputString(a)
|
|
c.iter(1).setInputString(b)
|
|
if res := c.compare(); res != 0 {
|
|
return res
|
|
}
|
|
if Identity == c.Strength {
|
|
if a < b {
|
|
return -1
|
|
} else if a > b {
|
|
return 1
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func compareLevel(f func(i *iter) int, a, b *iter) int {
|
|
a.pce = 0
|
|
b.pce = 0
|
|
for {
|
|
va := f(a)
|
|
vb := f(b)
|
|
if va != vb {
|
|
if va < vb {
|
|
return -1
|
|
}
|
|
return 1
|
|
} else if va == 0 {
|
|
break
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (c *Collator) compare() int {
|
|
ia, ib := c.iter(0), c.iter(1)
|
|
// Process primary level
|
|
if c.Alternate != AltShifted {
|
|
// TODO: implement script reordering
|
|
// TODO: special hiragana handling
|
|
if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
|
|
return res
|
|
}
|
|
} else {
|
|
// TODO: handle shifted
|
|
}
|
|
if Secondary <= c.Strength {
|
|
f := (*iter).nextSecondary
|
|
if c.Backwards {
|
|
f = (*iter).prevSecondary
|
|
}
|
|
if res := compareLevel(f, ia, ib); res != 0 {
|
|
return res
|
|
}
|
|
}
|
|
// TODO: special case handling (Danish?)
|
|
if Tertiary <= c.Strength || c.CaseLevel {
|
|
if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
|
|
return res
|
|
}
|
|
// TODO: Not needed for the default value of AltNonIgnorable?
|
|
if Quaternary <= c.Strength {
|
|
if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
|
|
return res
|
|
}
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// Key returns the collation key for str.
|
|
// Passing the buffer buf may avoid memory allocations.
|
|
// The returned slice will point to an allocation in Buffer and will remain
|
|
// valid until the next call to buf.Reset().
|
|
func (c *Collator) Key(buf *Buffer, str []byte) []byte {
|
|
// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
|
|
buf.init()
|
|
return c.key(buf, c.getColElems(str))
|
|
}
|
|
|
|
// KeyFromString returns the collation key for str.
|
|
// Passing the buffer buf may avoid memory allocations.
|
|
// The returned slice will point to an allocation in Buffer and will retain
|
|
// valid until the next call to buf.ResetKeys().
|
|
func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
|
|
// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
|
|
buf.init()
|
|
return c.key(buf, c.getColElemsString(str))
|
|
}
|
|
|
|
func (c *Collator) key(buf *Buffer, w []Elem) []byte {
|
|
processWeights(c.Alternate, c.variableTop, w)
|
|
kn := len(buf.key)
|
|
c.keyFromElems(buf, w)
|
|
return buf.key[kn:]
|
|
}
|
|
|
|
func (c *Collator) getColElems(str []byte) []Elem {
|
|
i := c.iter(0)
|
|
i.setInput(str)
|
|
for i.next() {
|
|
}
|
|
return i.ce
|
|
}
|
|
|
|
func (c *Collator) getColElemsString(str string) []Elem {
|
|
i := c.iter(0)
|
|
i.setInputString(str)
|
|
for i.next() {
|
|
}
|
|
return i.ce
|
|
}
|
|
|
|
type iter struct {
|
|
bytes []byte
|
|
str string
|
|
|
|
wa [512]Elem
|
|
ce []Elem
|
|
pce int
|
|
nce int // nce <= len(nce)
|
|
|
|
prevCCC uint8
|
|
pStarter int
|
|
|
|
t Weigher
|
|
}
|
|
|
|
func (i *iter) init(c *Collator) {
|
|
i.t = c.t
|
|
i.ce = i.wa[:0]
|
|
}
|
|
|
|
func (i *iter) reset() {
|
|
i.ce = i.ce[:0]
|
|
i.nce = 0
|
|
i.prevCCC = 0
|
|
i.pStarter = 0
|
|
}
|
|
|
|
func (i *iter) setInput(s []byte) *iter {
|
|
i.bytes = s
|
|
i.str = ""
|
|
i.reset()
|
|
return i
|
|
}
|
|
|
|
func (i *iter) setInputString(s string) *iter {
|
|
i.str = s
|
|
i.bytes = nil
|
|
i.reset()
|
|
return i
|
|
}
|
|
|
|
func (i *iter) done() bool {
|
|
return len(i.str) == 0 && len(i.bytes) == 0
|
|
}
|
|
|
|
func (i *iter) tail(n int) {
|
|
if i.bytes == nil {
|
|
i.str = i.str[n:]
|
|
} else {
|
|
i.bytes = i.bytes[n:]
|
|
}
|
|
}
|
|
|
|
func (i *iter) appendNext() int {
|
|
var sz int
|
|
if i.bytes == nil {
|
|
i.ce, sz = i.t.AppendNextString(i.ce, i.str)
|
|
} else {
|
|
i.ce, sz = i.t.AppendNext(i.ce, i.bytes)
|
|
}
|
|
return sz
|
|
}
|
|
|
|
// next appends Elems to the internal array until it adds an element with CCC=0.
|
|
// In the majority of cases, a Elem with a primary value > 0 will have
|
|
// a CCC of 0. The CCC values of colation elements are also used to detect if the
|
|
// input string was not normalized and to adjust the result accordingly.
|
|
func (i *iter) next() bool {
|
|
for !i.done() {
|
|
p0 := len(i.ce)
|
|
sz := i.appendNext()
|
|
i.tail(sz)
|
|
last := len(i.ce) - 1
|
|
if ccc := i.ce[last].CCC(); ccc == 0 {
|
|
i.nce = len(i.ce)
|
|
i.pStarter = last
|
|
i.prevCCC = 0
|
|
return true
|
|
} else if p0 < last && i.ce[p0].CCC() == 0 {
|
|
// set i.nce to only cover part of i.ce for which ccc == 0 and
|
|
// use rest the next call to next.
|
|
for p0++; p0 < last && i.ce[p0].CCC() == 0; p0++ {
|
|
}
|
|
i.nce = p0
|
|
i.pStarter = p0 - 1
|
|
i.prevCCC = ccc
|
|
return true
|
|
} else if ccc < i.prevCCC {
|
|
i.doNorm(p0, ccc) // should be rare for most common cases
|
|
} else {
|
|
i.prevCCC = ccc
|
|
}
|
|
}
|
|
if len(i.ce) != i.nce {
|
|
i.nce = len(i.ce)
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// nextPlain is the same as next, but does not "normalize" the collation
|
|
// elements.
|
|
// TODO: remove this function. Using this instead of next does not seem
|
|
// to improve performance in any significant way. We retain this until
|
|
// later for evaluation purposes.
|
|
func (i *iter) nextPlain() bool {
|
|
if i.done() {
|
|
return false
|
|
}
|
|
sz := i.appendNext()
|
|
i.tail(sz)
|
|
i.nce = len(i.ce)
|
|
return true
|
|
}
|
|
|
|
const maxCombiningCharacters = 30
|
|
|
|
// doNorm reorders the collation elements in i.ce.
|
|
// It assumes that blocks of collation elements added with appendNext
|
|
// either start and end with the same CCC or start with CCC == 0.
|
|
// This allows for a single insertion point for the entire block.
|
|
// The correctness of this assumption is verified in builder.go.
|
|
func (i *iter) doNorm(p int, ccc uint8) {
|
|
if p-i.pStarter > maxCombiningCharacters {
|
|
i.prevCCC = i.ce[len(i.ce)-1].CCC()
|
|
i.pStarter = len(i.ce) - 1
|
|
return
|
|
}
|
|
n := len(i.ce)
|
|
k := p
|
|
for p--; p > i.pStarter && ccc < i.ce[p-1].CCC(); p-- {
|
|
}
|
|
i.ce = append(i.ce, i.ce[p:k]...)
|
|
copy(i.ce[p:], i.ce[k:])
|
|
i.ce = i.ce[:n]
|
|
}
|
|
|
|
func (i *iter) nextPrimary() int {
|
|
for {
|
|
for ; i.pce < i.nce; i.pce++ {
|
|
if v := i.ce[i.pce].Primary(); v != 0 {
|
|
i.pce++
|
|
return v
|
|
}
|
|
}
|
|
if !i.next() {
|
|
return 0
|
|
}
|
|
}
|
|
panic("should not reach here")
|
|
}
|
|
|
|
func (i *iter) nextSecondary() int {
|
|
for ; i.pce < len(i.ce); i.pce++ {
|
|
if v := i.ce[i.pce].Secondary(); v != 0 {
|
|
i.pce++
|
|
return v
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (i *iter) prevSecondary() int {
|
|
for ; i.pce < len(i.ce); i.pce++ {
|
|
if v := i.ce[len(i.ce)-i.pce-1].Secondary(); v != 0 {
|
|
i.pce++
|
|
return v
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (i *iter) nextTertiary() int {
|
|
for ; i.pce < len(i.ce); i.pce++ {
|
|
if v := i.ce[i.pce].Tertiary(); v != 0 {
|
|
i.pce++
|
|
return int(v)
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (i *iter) nextQuaternary() int {
|
|
for ; i.pce < len(i.ce); i.pce++ {
|
|
if v := i.ce[i.pce].Quaternary(); v != 0 {
|
|
i.pce++
|
|
return v
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func appendPrimary(key []byte, p int) []byte {
|
|
// Convert to variable length encoding; supports up to 23 bits.
|
|
if p <= 0x7FFF {
|
|
key = append(key, uint8(p>>8), uint8(p))
|
|
} else {
|
|
key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p))
|
|
}
|
|
return key
|
|
}
|
|
|
|
// keyFromElems converts the weights ws to a compact sequence of bytes.
|
|
// The result will be appended to the byte buffer in buf.
|
|
func (c *Collator) keyFromElems(buf *Buffer, ws []Elem) {
|
|
for _, v := range ws {
|
|
if w := v.Primary(); w > 0 {
|
|
buf.key = appendPrimary(buf.key, w)
|
|
}
|
|
}
|
|
if Secondary <= c.Strength {
|
|
buf.key = append(buf.key, 0, 0)
|
|
// TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
|
|
if !c.Backwards {
|
|
for _, v := range ws {
|
|
if w := v.Secondary(); w > 0 {
|
|
buf.key = append(buf.key, uint8(w>>8), uint8(w))
|
|
}
|
|
}
|
|
} else {
|
|
for i := len(ws) - 1; i >= 0; i-- {
|
|
if w := ws[i].Secondary(); w > 0 {
|
|
buf.key = append(buf.key, uint8(w>>8), uint8(w))
|
|
}
|
|
}
|
|
}
|
|
} else if c.CaseLevel {
|
|
buf.key = append(buf.key, 0, 0)
|
|
}
|
|
if Tertiary <= c.Strength || c.CaseLevel {
|
|
buf.key = append(buf.key, 0, 0)
|
|
for _, v := range ws {
|
|
if w := v.Tertiary(); w > 0 {
|
|
buf.key = append(buf.key, uint8(w))
|
|
}
|
|
}
|
|
// Derive the quaternary weights from the options and other levels.
|
|
// Note that we represent MaxQuaternary as 0xFF. The first byte of the
|
|
// representation of a primary weight is always smaller than 0xFF,
|
|
// so using this single byte value will compare correctly.
|
|
if Quaternary <= c.Strength && c.Alternate >= AltShifted {
|
|
if c.Alternate == AltShiftTrimmed {
|
|
lastNonFFFF := len(buf.key)
|
|
buf.key = append(buf.key, 0)
|
|
for _, v := range ws {
|
|
if w := v.Quaternary(); w == MaxQuaternary {
|
|
buf.key = append(buf.key, 0xFF)
|
|
} else if w > 0 {
|
|
buf.key = appendPrimary(buf.key, w)
|
|
lastNonFFFF = len(buf.key)
|
|
}
|
|
}
|
|
buf.key = buf.key[:lastNonFFFF]
|
|
} else {
|
|
buf.key = append(buf.key, 0)
|
|
for _, v := range ws {
|
|
if w := v.Quaternary(); w == MaxQuaternary {
|
|
buf.key = append(buf.key, 0xFF)
|
|
} else if w > 0 {
|
|
buf.key = appendPrimary(buf.key, w)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func processWeights(vw AlternateHandling, top uint32, wa []Elem) {
|
|
ignore := false
|
|
vtop := int(top)
|
|
switch vw {
|
|
case AltShifted, AltShiftTrimmed:
|
|
for i := range wa {
|
|
if p := wa[i].Primary(); p <= vtop && p != 0 {
|
|
wa[i] = MakeQuaternary(p)
|
|
ignore = true
|
|
} else if p == 0 {
|
|
if ignore {
|
|
wa[i] = ceIgnore
|
|
}
|
|
} else {
|
|
ignore = false
|
|
}
|
|
}
|
|
case AltBlanked:
|
|
for i := range wa {
|
|
if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) {
|
|
wa[i] = ceIgnore
|
|
ignore = true
|
|
} else {
|
|
ignore = false
|
|
}
|
|
}
|
|
}
|
|
}
|