charmap.go (5648B)
// Copyright 2024 Garrett D'Amore
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use file except in compliance with the License.
// You may obtain a copy of the license at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package encoding

import (
	"sync"
	"unicode/utf8"

	"golang.org/x/text/encoding"
	"golang.org/x/text/transform"
)

const (
	// RuneError is an alias for the UTF-8 replacement rune, '\uFFFD'.
	RuneError = '\uFFFD'

	// RuneSelf is the rune below which UTF-8 and the Unicode values are
	// identical. It is also the limit for ASCII.
	RuneSelf = 0x80

	// ASCIISub is the ASCII substitution character.
	ASCIISub = '\x1a'
)

// Charmap is a structure for setting up encodings for 8-bit character sets,
// for transforming between UTF-8 and that other character set. It has some
// ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
// different implementation. This implementation uses maps, and supports
// user-defined maps.
//
// We do assume that a character map has a reasonable substitution character,
// and that valid encodings are stable (exactly a 1:1 map) and stateless
// (that is, there is no shift character or anything like that). Hence this
// approach will not work for many East Asian character sets.
//
// Measurement shows little or no measurable difference in the performance of
// the two approaches. The difference was down to a couple of nsec/op, and
// no consistent pattern as to which ran faster. With the conversion to
// UTF-8 the code takes about 25 nsec/op. The conversion in the reverse
// direction takes about 100 nsec/op. (The larger cost for conversion
// from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
// to a rune before conversion.)
type Charmap struct {
	transform.NopResetter
	// Internal lookup tables, filled in by initialize (run once via Init):
	// bytes maps a rune to its byte value, runes holds the UTF-8 encoding
	// of the rune for each byte value, and once guards the initialization.
	bytes map[rune]byte
	runes [256][]byte
	once  sync.Once

	// The map between bytes and runes. To indicate that a specific
	// byte value is invalid for a character set, use the rune
	// utf8.RuneError. Values that are absent from this map will
	// be assumed to have the identity mapping -- that is the default
	// is to assume ISO8859-1, where all 8-bit characters have the same
	// numeric value as their Unicode runes. (Not to be confused with
	// the UTF-8 values, which *will* be different for non-ASCII runes.)
	//
	// If no values less than RuneSelf are changed (or have non-identity
	// mappings), then the character set is assumed to be an ASCII
	// superset, and certain assumptions and optimizations become
	// available for ASCII bytes.
	Map map[byte]rune

	// The ReplacementChar is the byte value to use for substitution.
	// It should normally be ASCIISub for ASCII encodings. This may be
	// unset (left to zero) for mappings that are strictly ASCII supersets.
	// In that case ASCIISub will be assumed instead.
	ReplacementChar byte
}

// cmapDecoder is the transformer created by Charmap.NewDecoder. Its runes
// table is indexed by source byte and yields the UTF-8 bytes to emit.
type cmapDecoder struct {
	transform.NopResetter
	runes [256][]byte
}

// cmapEncoder is the transformer created by Charmap.NewEncoder. The bytes
// map gives the output byte for each known rune; replace is the byte
// written for runes that have no entry in the map.
type cmapEncoder struct {
	transform.NopResetter
	bytes   map[rune]byte
	replace byte
}

// Init initializes internal values of a character map. This should
// be done early, to minimize the cost of allocation of transforms
// later. It is not strictly necessary however, as the allocation
// functions will arrange to call it if it has not already been done.
97 func (c *Charmap) Init() { 98 c.once.Do(c.initialize) 99 } 100 101 func (c *Charmap) initialize() { 102 c.bytes = make(map[rune]byte) 103 ascii := true 104 105 for i := 0; i < 256; i++ { 106 r, ok := c.Map[byte(i)] 107 if !ok { 108 r = rune(i) 109 } 110 if r < 128 && r != rune(i) { 111 ascii = false 112 } 113 if r != RuneError { 114 c.bytes[r] = byte(i) 115 } 116 utf := make([]byte, utf8.RuneLen(r)) 117 utf8.EncodeRune(utf, r) 118 c.runes[i] = utf 119 } 120 if ascii && c.ReplacementChar == '\x00' { 121 c.ReplacementChar = ASCIISub 122 } 123 } 124 125 // NewDecoder returns a Decoder the converts from the 8-bit 126 // character set to UTF-8. Unknown mappings, if any, are mapped 127 // to '\uFFFD'. 128 func (c *Charmap) NewDecoder() *encoding.Decoder { 129 c.Init() 130 return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}} 131 } 132 133 // NewEncoder returns a Transformer that converts from UTF8 to the 134 // 8-bit character set. Unknown mappings are mapped to 0x1A. 135 func (c *Charmap) NewEncoder() *encoding.Encoder { 136 c.Init() 137 return &encoding.Encoder{ 138 Transformer: &cmapEncoder{ 139 bytes: c.bytes, 140 replace: c.ReplacementChar, 141 }, 142 } 143 } 144 145 func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) { 146 var e error 147 var ndst, nsrc int 148 149 for _, c := range src { 150 b := d.runes[c] 151 l := len(b) 152 153 if ndst+l > len(dst) { 154 e = transform.ErrShortDst 155 break 156 } 157 for i := 0; i < l; i++ { 158 dst[ndst] = b[i] 159 ndst++ 160 } 161 nsrc++ 162 } 163 return ndst, nsrc, e 164 } 165 166 func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) { 167 var e error 168 var ndst, nsrc int 169 for nsrc < len(src) { 170 if ndst >= len(dst) { 171 e = transform.ErrShortDst 172 break 173 } 174 175 r, sz := utf8.DecodeRune(src[nsrc:]) 176 if r == utf8.RuneError && sz == 1 { 177 // If its inconclusive due to insufficient data in 178 // in the source, report it 179 if atEOF && 
!utf8.FullRune(src[nsrc:]) { 180 e = transform.ErrShortSrc 181 break 182 } 183 } 184 185 if c, ok := d.bytes[r]; ok { 186 dst[ndst] = c 187 } else { 188 dst[ndst] = d.replace 189 } 190 nsrc += sz 191 ndst++ 192 } 193 194 return ndst, nsrc, e 195 }