gen.go 3.6 KB
Newer Older
Jeromy's avatar
Jeromy committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

// This program generates the trie for width operations. The generated table
// includes width category information as well as the normalization mappings.
package main

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"math"
	"unicode/utf8"

	"golang.org/x/text/internal/gen"
	"golang.org/x/text/internal/triegen"
)

// See gen_common.go for flags.

func main() {
	gen.Init()
	genTables()
	genTests()
	repackage("gen_trieval.go", "trieval.go")
	repackage("gen_common.go", "common_test.go")
}

func genTables() {
	t := triegen.NewTrie("width")
	// fold and inverse mappings. See mapComment for a description of the format
	// of each entry. Add dummy value to make an index of 0 mean no mapping.
	inverse := [][4]byte{{}}
	mapping := map[[4]byte]int{[4]byte{}: 0}

	getWidthData(func(r rune, tag elem, alt rune) {
		idx := 0
		if alt != 0 {
			var buf [4]byte
			buf[0] = byte(utf8.EncodeRune(buf[1:], alt))
			s := string(r)
			buf[buf[0]] ^= s[len(s)-1]
			var ok bool
			if idx, ok = mapping[buf]; !ok {
				idx = len(mapping)
				if idx > math.MaxUint8 {
					log.Fatalf("Index %d does not fit in a byte.", idx)
				}
				mapping[buf] = idx
				inverse = append(inverse, buf)
			}
		}
		t.Insert(r, uint64(tag|elem(idx)))
	})

	w := &bytes.Buffer{}
	gen.WriteUnicodeVersion(w)

	sz, err := t.Gen(w)
	if err != nil {
		log.Fatal(err)
	}

	sz += writeMappings(w, inverse)

	fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024)

	gen.WriteGoFile(*outputFile, "width", w.Bytes())
}

const inverseDataComment = `
// inverseData contains 4-byte entries of the following format:
//   <length> <modified UTF-8-encoded rune> <0 padding>
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
//   A -> A  (U+FF21 -> U+0041)
//   B -> B  (U+FF22 -> U+0042)
//   ...
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
//   { 0x01, 0xE0, 0x00, 0x00 }
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
//   E0 ^ A1 = 41.
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
//   E0 ^ A2 = 42.
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.`

func writeMappings(w io.Writer, data [][4]byte) int {
	fmt.Fprintln(w, inverseDataComment)
	fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data))
	for _, x := range data {
		fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n", x[0], x[1], x[2], x[3])
	}
	fmt.Fprintln(w, "}")
	return len(data) * 4
}

func genTests() {
	w := &bytes.Buffer{}
	fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n")
	getWidthData(func(r rune, tag elem, alt rune) {
		if alt != 0 {
			fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag)
		}
	})
	fmt.Fprintln(w, "}")
	gen.WriteGoFile("runes_test.go", "width", w.Bytes())
}

// repackage rewrites a file from belonging to package main to belonging to
// package width.
func repackage(inFile, outFile string) {
	src, err := ioutil.ReadFile(inFile)
	if err != nil {
		log.Fatalf("reading %s: %v", inFile, err)
	}
	const toDelete = "package main\n\n"
	i := bytes.Index(src, []byte(toDelete))
	if i < 0 {
		log.Fatalf("Could not find %q in gen_trieval.go", toDelete)
	}
	w := &bytes.Buffer{}
	w.Write(src[i+len(toDelete):])
	gen.WriteGoFile(outFile, "width", w.Bytes())
}