runes.go 5.53 KB
Newer Older
Jeromy's avatar
Jeromy committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package runes provide transforms for UTF-8 encoded text.
package runes

import (
	"unicode"
	"unicode/utf8"

	"gx/QmVgdgtv5rUcWWcHvHN1vdGkXi8UNztJ7JuT72YUJgG2ic/go-text/transform"
)

// A Set is a collection of runes.
type Set interface {
	// Contains returns true if r is contained in the set.
	Contains(r rune) bool
}

type setFunc func(rune) bool

func (s setFunc) Contains(r rune) bool {
	return s(r)
}

// Note: using funcs here instead of wrapping types result in cleaner
// documentation and a smaller API.

// In creates a Set with a Contains method that returns true for all runes in
// the given RangeTable.
func In(rt *unicode.RangeTable) Set {
	return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
}

// In creates a Set with a Contains method that returns true for all runes not
// in the given RangeTable.
func NotIn(rt *unicode.RangeTable) Set {
	return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
}

// Predicate creates a Set with a Contains method that returns f(r).
func Predicate(f func(rune) bool) Set {
	return setFunc(f)
}

// Transformer implements the transform.Transformer interface.
type Transformer struct {
	transform.Transformer
}

// Bytes returns a new byte slice with the result of converting b using t.  It
// calls Reset on t. It returns nil if any error was found. This can only happen
// if an error-producing Transformer is passed to If.
func (t Transformer) Bytes(b []byte) []byte {
	b, _, err := transform.Bytes(t, b)
	if err != nil {
		return nil
	}
	return b
}

// String returns a string with the result of converting s using t. It calls
// Reset on t. It returns the empty string if any error was found. This can only
// happen if an error-producing Transformer is passed to If.
func (t Transformer) String(s string) string {
	s, _, err := transform.String(t, s)
	if err != nil {
		return ""
	}
	return s
}

// TODO:
// - Copy: copying strings and bytes in whole-rune units.
// - Validation (maybe)
// - Well-formed-ness (maybe)

const runeErrorString = string(utf8.RuneError)

// Remove returns a Transformer that removes runes r for which s.Contains(r).
// Illegal input bytes are replaced by RuneError before being passed to f.
func Remove(s Set) Transformer {
	if f, ok := s.(setFunc); ok {
		// This little trick cuts the running time of BenchmarkRemove for sets
		// created by Predicate roughly in half.
		// TODO: special-case RangeTables as well.
		return Transformer{remove(f)}
	}
	return Transformer{remove(s.Contains)}
}

// TODO: remove transform.RemoveFunc.

type remove func(r rune) bool

func (remove) Reset() {}

// Transform implements transform.Transformer.
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for r, size := rune(0), 0; nSrc < len(src); {
		if r = rune(src[nSrc]); r < utf8.RuneSelf {
			size = 1
		} else {
			r, size = utf8.DecodeRune(src[nSrc:])

			if size == 1 {
				// Invalid rune.
				if !atEOF && !utf8.FullRune(src[nSrc:]) {
					err = transform.ErrShortSrc
					break
				}
				// We replace illegal bytes with RuneError. Not doing so might
				// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
				// The resulting byte sequence may subsequently contain runes
				// for which t(r) is true that were passed unnoticed.
				if !t(utf8.RuneError) {
					if nDst+3 > len(dst) {
						err = transform.ErrShortDst
						break
					}
					dst[nDst+0] = runeErrorString[0]
					dst[nDst+1] = runeErrorString[1]
					dst[nDst+2] = runeErrorString[2]
					nDst += 3
				}
				nSrc++
				continue
			}
		}

		if t(r) {
			nSrc += size
			continue
		}
		if nDst+size > len(dst) {
			err = transform.ErrShortDst
			break
		}
		for i := 0; i < size; i++ {
			dst[nDst] = src[nSrc]
			nDst++
			nSrc++
		}
	}
	return
}

// Map returns a Transformer that maps the runes in the input using the given
// mapping. Illegal bytes in the input are converted to utf8.RuneError before
// being passed to the mapping func.
func Map(mapping func(rune) rune) Transformer {
	return Transformer{mapper(mapping)}
}

type mapper func(rune) rune

func (mapper) Reset() {}

// Transform implements transform.Transformer.
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	var replacement rune
	var b [utf8.UTFMax]byte

	for r, size := rune(0), 0; nSrc < len(src); {
		if r = rune(src[nSrc]); r < utf8.RuneSelf {
			if replacement = t(r); replacement < utf8.RuneSelf {
				if nDst == len(dst) {
					err = transform.ErrShortDst
					break
				}
				dst[nDst] = byte(replacement)
				nDst++
				nSrc++
				continue
			}
			size = 1
		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
			// Invalid rune.
			if !atEOF && !utf8.FullRune(src[nSrc:]) {
				err = transform.ErrShortSrc
				break
			}

			if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
				if nDst+3 > len(dst) {
					err = transform.ErrShortDst
					break
				}
				dst[nDst+0] = runeErrorString[0]
				dst[nDst+1] = runeErrorString[1]
				dst[nDst+2] = runeErrorString[2]
				nDst += 3
				nSrc++
				continue
			}
		} else if replacement = t(r); replacement == r {
			if nDst+size > len(dst) {
				err = transform.ErrShortDst
				break
			}
			for i := 0; i < size; i++ {
				dst[nDst] = src[nSrc]
				nDst++
				nSrc++
			}
			continue
		}

		n := utf8.EncodeRune(b[:], replacement)

		if nDst+n > len(dst) {
			err = transform.ErrShortDst
			break
		}
		for i := 0; i < n; i++ {
			dst[nDst] = b[i]
			nDst++
		}
		nSrc += size
	}
	return
}