gemini-browser

A text-based gemini browser
git clone git://git.laack.co/gemini-browser.git
Log | Files | Refs | README

encoding.go (9695B)


      1 // Copyright 2013 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package encoding defines an interface for character encodings, such as Shift
      6 // JIS and Windows 1252, that can convert to and from UTF-8.
      7 //
      8 // Encoding implementations are provided in other packages, such as
      9 // golang.org/x/text/encoding/charmap and
     10 // golang.org/x/text/encoding/japanese.
     11 package encoding // import "golang.org/x/text/encoding"
     12 
     13 import (
     14 	"errors"
     15 	"io"
     16 	"strconv"
     17 	"unicode/utf8"
     18 
     19 	"golang.org/x/text/encoding/internal/identifier"
     20 	"golang.org/x/text/transform"
     21 )
     22 
     23 // TODO:
     24 // - There seems to be some inconsistency in when decoders return errors
     25 //   and when not. Also documentation seems to suggest they shouldn't return
     26 //   errors at all (except for UTF-16).
     27 // - Encoders seem to rely on or at least benefit from the input being in NFC
     28 //   normal form. Perhaps add an example how users could prepare their output.
     29 
     30 // Encoding is a character set encoding that can be transformed to and from
     31 // UTF-8. Each direction is handled by its own transformer type: a Decoder or an Encoder.
     32 type Encoding interface {
     33 	// NewDecoder returns a Decoder, which converts bytes to UTF-8.
     34 	NewDecoder() *Decoder
     35 
     36 	// NewEncoder returns an Encoder, which converts bytes from UTF-8.
     37 	NewEncoder() *Encoder
     38 }
     39 
     40 // A Decoder converts bytes to UTF-8. It implements transform.Transformer.
     41 //
     42 // Transforming source bytes that are not of that encoding will not result in an
     43 // error per se. Each byte that cannot be transcoded will be represented in the
     44 // output by the UTF-8 encoding of '\uFFFD', the replacement rune.
     45 type Decoder struct {
     46 	transform.Transformer
     47 
     48 	// This forces external creators of Decoders to use names in struct
     49 	// initializers, allowing for future extendibility without having to break
     50 	// code.
     51 	_ struct{}
     52 }
     53 
     54 // Bytes converts the given encoded bytes to UTF-8. It returns the converted
     55 // bytes or nil, err if any error occurred.
     56 func (d *Decoder) Bytes(b []byte) ([]byte, error) {
     57 	b, _, err := transform.Bytes(d, b)
     58 	if err != nil {
     59 		return nil, err
     60 	}
     61 	return b, nil
     62 }
     63 
     64 // String converts the given encoded string to UTF-8. It returns the converted
     65 // string or "", err if any error occurred.
     66 func (d *Decoder) String(s string) (string, error) {
     67 	s, _, err := transform.String(d, s)
     68 	if err != nil {
     69 		return "", err
     70 	}
     71 	return s, nil
     72 }
     73 
     74 // Reader wraps another Reader to decode its bytes.
     75 //
     76 // The Decoder may not be used for any other operation as long as the returned
     77 // Reader is in use.
     78 func (d *Decoder) Reader(r io.Reader) io.Reader {
     79 	return transform.NewReader(r, d)
     80 }
     81 
     82 // An Encoder converts bytes from UTF-8. It implements transform.Transformer.
     83 //
     84 // Each rune that cannot be transcoded will result in an error. In this case,
     85 // the transform will consume all source byte up to, not including the offending
     86 // rune. Transforming source bytes that are not valid UTF-8 will be replaced by
     87 // `\uFFFD`. To return early with an error instead, use transform.Chain to
     88 // preprocess the data with a UTF8Validator.
     89 type Encoder struct {
     90 	transform.Transformer
     91 
     92 	// This forces external creators of Encoders to use names in struct
     93 	// initializers, allowing for future extendibility without having to break
     94 	// code.
     95 	_ struct{}
     96 }
     97 
     98 // Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
     99 // any error occurred.
    100 func (e *Encoder) Bytes(b []byte) ([]byte, error) {
    101 	b, _, err := transform.Bytes(e, b)
    102 	if err != nil {
    103 		return nil, err
    104 	}
    105 	return b, nil
    106 }
    107 
    108 // String converts a string from UTF-8. It returns the converted string or
    109 // "", err if any error occurred.
    110 func (e *Encoder) String(s string) (string, error) {
    111 	s, _, err := transform.String(e, s)
    112 	if err != nil {
    113 		return "", err
    114 	}
    115 	return s, nil
    116 }
    117 
    118 // Writer wraps another Writer to encode its UTF-8 output.
    119 //
    120 // The Encoder may not be used for any other operation as long as the returned
    121 // Writer is in use.
    122 func (e *Encoder) Writer(w io.Writer) io.Writer {
    123 	return transform.NewWriter(w, e)
    124 }
    125 
    126 // ASCIISub is the ASCII substitute character, as recommended by
    127 // https://unicode.org/reports/tr36/#Text_Comparison
    128 const ASCIISub = '\x1a'
    129 
    130 // Nop is the nop encoding. Its transformed bytes are the same as the source
    131 // bytes; it does not replace invalid UTF-8 sequences.
    132 var Nop Encoding = nop{}
    133 
    134 type nop struct{}
    135 
    136 func (nop) NewDecoder() *Decoder {
    137 	return &Decoder{Transformer: transform.Nop}
    138 }
    139 func (nop) NewEncoder() *Encoder {
    140 	return &Encoder{Transformer: transform.Nop}
    141 }
    142 
    143 // Replacement is the replacement encoding. Decoding from the replacement
    144 // encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
    145 // the replacement encoding yields the same as the source bytes except that
    146 // invalid UTF-8 is converted to '\uFFFD'.
    147 //
    148 // It is defined at http://encoding.spec.whatwg.org/#replacement
    149 var Replacement Encoding = replacement{}
    150 
    151 type replacement struct{}
    152 
    153 func (replacement) NewDecoder() *Decoder {
    154 	return &Decoder{Transformer: replacementDecoder{}}
    155 }
    156 
    157 func (replacement) NewEncoder() *Encoder {
    158 	return &Encoder{Transformer: replacementEncoder{}}
    159 }
    160 
    161 func (replacement) ID() (mib identifier.MIB, other string) {
    162 	return identifier.Replacement, ""
    163 }
    164 
    165 type replacementDecoder struct{ transform.NopResetter }
    166 
    167 func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    168 	if len(dst) < 3 {
    169 		return 0, 0, transform.ErrShortDst
    170 	}
    171 	if atEOF {
    172 		const fffd = "\ufffd"
    173 		dst[0] = fffd[0]
    174 		dst[1] = fffd[1]
    175 		dst[2] = fffd[2]
    176 		nDst = 3
    177 	}
    178 	return nDst, len(src), nil
    179 }
    180 
    181 type replacementEncoder struct{ transform.NopResetter }
    182 
    183 func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    184 	r, size := rune(0), 0
    185 
    186 	for ; nSrc < len(src); nSrc += size {
    187 		r = rune(src[nSrc])
    188 
    189 		// Decode a 1-byte rune.
    190 		if r < utf8.RuneSelf {
    191 			size = 1
    192 
    193 		} else {
    194 			// Decode a multi-byte rune.
    195 			r, size = utf8.DecodeRune(src[nSrc:])
    196 			if size == 1 {
    197 				// All valid runes of size 1 (those below utf8.RuneSelf) were
    198 				// handled above. We have invalid UTF-8 or we haven't seen the
    199 				// full character yet.
    200 				if !atEOF && !utf8.FullRune(src[nSrc:]) {
    201 					err = transform.ErrShortSrc
    202 					break
    203 				}
    204 				r = '\ufffd'
    205 			}
    206 		}
    207 
    208 		if nDst+utf8.RuneLen(r) > len(dst) {
    209 			err = transform.ErrShortDst
    210 			break
    211 		}
    212 		nDst += utf8.EncodeRune(dst[nDst:], r)
    213 	}
    214 	return nDst, nSrc, err
    215 }
    216 
    217 // HTMLEscapeUnsupported wraps encoders to replace source runes outside the
    218 // repertoire of the destination encoding with HTML escape sequences.
    219 //
    220 // This wrapper exists to comply to URL and HTML forms requiring a
    221 // non-terminating legacy encoder. The produced sequences may lead to data
    222 // loss as they are indistinguishable from legitimate input. To avoid this
    223 // issue, use UTF-8 encodings whenever possible.
    224 func HTMLEscapeUnsupported(e *Encoder) *Encoder {
    225 	return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
    226 }
    227 
    228 // ReplaceUnsupported wraps encoders to replace source runes outside the
    229 // repertoire of the destination encoding with an encoding-specific
    230 // replacement.
    231 //
    232 // This wrapper is only provided for backwards compatibility and legacy
    233 // handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
    234 func ReplaceUnsupported(e *Encoder) *Encoder {
    235 	return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
    236 }
    237 
    238 type errorHandler struct {
    239 	*Encoder
    240 	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
    241 }
    242 
    243 // TODO: consider making this error public in some form.
    244 type repertoireError interface {
    245 	Replacement() byte
    246 }
    247 
    248 func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    249 	nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
    250 	for err != nil {
    251 		rerr, ok := err.(repertoireError)
    252 		if !ok {
    253 			return nDst, nSrc, err
    254 		}
    255 		r, sz := utf8.DecodeRune(src[nSrc:])
    256 		n, ok := h.handler(dst[nDst:], r, rerr)
    257 		if !ok {
    258 			return nDst, nSrc, transform.ErrShortDst
    259 		}
    260 		err = nil
    261 		nDst += n
    262 		if nSrc += sz; nSrc < len(src) {
    263 			var dn, sn int
    264 			dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
    265 			nDst += dn
    266 			nSrc += sn
    267 		}
    268 	}
    269 	return nDst, nSrc, err
    270 }
    271 
    272 func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
    273 	buf := [8]byte{}
    274 	b := strconv.AppendUint(buf[:0], uint64(r), 10)
    275 	if n = len(b) + len("&#;"); n >= len(dst) {
    276 		return 0, false
    277 	}
    278 	dst[0] = '&'
    279 	dst[1] = '#'
    280 	dst[copy(dst[2:], b)+2] = ';'
    281 	return n, true
    282 }
    283 
    284 func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
    285 	if len(dst) == 0 {
    286 		return 0, false
    287 	}
    288 	dst[0] = err.Replacement()
    289 	return 1, true
    290 }
    291 
    292 // ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
    293 var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
    294 
    295 // UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
    296 // input byte that is not valid UTF-8.
    297 var UTF8Validator transform.Transformer = utf8Validator{}
    298 
    299 type utf8Validator struct{ transform.NopResetter }
    300 
    301 func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    302 	n := len(src)
    303 	if n > len(dst) {
    304 		n = len(dst)
    305 	}
    306 	for i := 0; i < n; {
    307 		if c := src[i]; c < utf8.RuneSelf {
    308 			dst[i] = c
    309 			i++
    310 			continue
    311 		}
    312 		_, size := utf8.DecodeRune(src[i:])
    313 		if size == 1 {
    314 			// All valid runes of size 1 (those below utf8.RuneSelf) were
    315 			// handled above. We have invalid UTF-8 or we haven't seen the
    316 			// full character yet.
    317 			err = ErrInvalidUTF8
    318 			if !atEOF && !utf8.FullRune(src[i:]) {
    319 				err = transform.ErrShortSrc
    320 			}
    321 			return i, i, err
    322 		}
    323 		if i+size > len(dst) {
    324 			return i, i, transform.ErrShortDst
    325 		}
    326 		for ; size > 0; size-- {
    327 			dst[i] = src[i]
    328 			i++
    329 		}
    330 	}
    331 	if len(src) > len(dst) {
    332 		err = transform.ErrShortDst
    333 	}
    334 	return n, n, err
    335 }