gemini-browser

A text-based gemini browser
git clone git://git.laack.co/gemini-browser.git
Log | Files | Refs | README

charmap.go (5648B)


      1 // Copyright 2024 Garrett D'Amore
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //    http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 package encoding
     16 
     17 import (
     18 	"sync"
     19 	"unicode/utf8"
     20 
     21 	"golang.org/x/text/encoding"
     22 	"golang.org/x/text/transform"
     23 )
     24 
     25 const (
     26 	// RuneError is an alias for the UTF-8 replacement rune, '\uFFFD'.
     27 	RuneError = '\uFFFD'
     28 
     29 	// RuneSelf is the rune below which UTF-8 and the Unicode values are
     30 	// identical.  Its also the limit for ASCII.
     31 	RuneSelf = 0x80
     32 
     33 	// ASCIISub is the ASCII substitution character.
     34 	ASCIISub = '\x1a'
     35 )
     36 
// Charmap is a structure for setting up encodings for 8-bit character sets,
// for transforming between UTF-8 and that other character set.  It has some
// ideas borrowed from golang.org/x/text/encoding/charmap, but it uses a
// different implementation.  This implementation uses maps, and supports
// user-defined maps.
//
// We do assume that a character map has a reasonable substitution character,
// and that valid encodings are stable (exactly a 1:1 map) and stateless
// (that is, there is no shift character or anything like that).  Hence this
// approach will not work for many East Asian character sets.
//
// Measurement shows little or no measurable difference in the performance of
// the two approaches.  The difference was down to a couple of nsec/op, with
// no consistent pattern as to which ran faster.  With the conversion to
// UTF-8 the code takes about 25 nsec/op.  The conversion in the reverse
// direction takes about 100 nsec/op.  (The larger cost for conversion
// from UTF-8 is most likely due to the need to convert the UTF-8 byte stream
// to a rune before conversion.)
type Charmap struct {
	transform.NopResetter
	// Internal tables, filled in by initialize (which Init runs through
	// once): bytes maps a rune to its byte in the character set, and
	// runes holds the UTF-8 encoding for each of the 256 byte values.
	bytes map[rune]byte
	runes [256][]byte
	once  sync.Once

	// The map between bytes and runes.  To indicate that a specific
	// byte value is invalid for a character set, use the rune
	// utf8.RuneError.  Values that are absent from this map will
	// be assumed to have the identity mapping -- that is the default
	// is to assume ISO8859-1, where all 8-bit characters have the same
	// numeric value as their Unicode runes.  (Not to be confused with
	// the UTF-8 values, which *will* be different for non-ASCII runes.)
	//
	// If no values less than RuneSelf are changed (or have non-identity
	// mappings), then the character set is assumed to be an ASCII
	// superset, and certain assumptions and optimizations become
	// available for ASCII bytes.
	Map map[byte]rune

	// The ReplacementChar is the byte value to use for substitution.
	// It should normally be ASCIISub for ASCII encodings.  This may be
	// unset (left to zero) for mappings that are strictly ASCII supersets.
	// In that case ASCIISub will be assumed instead.
	ReplacementChar byte
}
     81 
// cmapDecoder is the transformer handed out by Charmap.NewDecoder.  It
// converts bytes of the 8-bit character set to UTF-8.
type cmapDecoder struct {
	transform.NopResetter
	// runes holds, for each byte value, the UTF-8 sequence written for it.
	runes [256][]byte
}
     86 
// cmapEncoder is the transformer handed out by Charmap.NewEncoder.  It
// converts UTF-8 text to bytes of the 8-bit character set.
type cmapEncoder struct {
	transform.NopResetter
	// bytes maps each encodable rune to its byte value; replace is the
	// byte written for any rune that has no entry in bytes.
	bytes   map[rune]byte
	replace byte
}
     92 
     93 // Init initializes internal values of a character map.  This should
     94 // be done early, to minimize the cost of allocation of transforms
     95 // later.  It is not strictly necessary however, as the allocation
     96 // functions will arrange to call it if it has not already been done.
     97 func (c *Charmap) Init() {
     98 	c.once.Do(c.initialize)
     99 }
    100 
    101 func (c *Charmap) initialize() {
    102 	c.bytes = make(map[rune]byte)
    103 	ascii := true
    104 
    105 	for i := 0; i < 256; i++ {
    106 		r, ok := c.Map[byte(i)]
    107 		if !ok {
    108 			r = rune(i)
    109 		}
    110 		if r < 128 && r != rune(i) {
    111 			ascii = false
    112 		}
    113 		if r != RuneError {
    114 			c.bytes[r] = byte(i)
    115 		}
    116 		utf := make([]byte, utf8.RuneLen(r))
    117 		utf8.EncodeRune(utf, r)
    118 		c.runes[i] = utf
    119 	}
    120 	if ascii && c.ReplacementChar == '\x00' {
    121 		c.ReplacementChar = ASCIISub
    122 	}
    123 }
    124 
    125 // NewDecoder returns a Decoder the converts from the 8-bit
    126 // character set to UTF-8.  Unknown mappings, if any, are mapped
    127 // to '\uFFFD'.
    128 func (c *Charmap) NewDecoder() *encoding.Decoder {
    129 	c.Init()
    130 	return &encoding.Decoder{Transformer: &cmapDecoder{runes: c.runes}}
    131 }
    132 
    133 // NewEncoder returns a Transformer that converts from UTF8 to the
    134 // 8-bit character set.  Unknown mappings are mapped to 0x1A.
    135 func (c *Charmap) NewEncoder() *encoding.Encoder {
    136 	c.Init()
    137 	return &encoding.Encoder{
    138 		Transformer: &cmapEncoder{
    139 			bytes:   c.bytes,
    140 			replace: c.ReplacementChar,
    141 		},
    142 	}
    143 }
    144 
    145 func (d *cmapDecoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
    146 	var e error
    147 	var ndst, nsrc int
    148 
    149 	for _, c := range src {
    150 		b := d.runes[c]
    151 		l := len(b)
    152 
    153 		if ndst+l > len(dst) {
    154 			e = transform.ErrShortDst
    155 			break
    156 		}
    157 		for i := 0; i < l; i++ {
    158 			dst[ndst] = b[i]
    159 			ndst++
    160 		}
    161 		nsrc++
    162 	}
    163 	return ndst, nsrc, e
    164 }
    165 
    166 func (d *cmapEncoder) Transform(dst, src []byte, atEOF bool) (int, int, error) {
    167 	var e error
    168 	var ndst, nsrc int
    169 	for nsrc < len(src) {
    170 		if ndst >= len(dst) {
    171 			e = transform.ErrShortDst
    172 			break
    173 		}
    174 
    175 		r, sz := utf8.DecodeRune(src[nsrc:])
    176 		if r == utf8.RuneError && sz == 1 {
    177 			// If its inconclusive due to insufficient data in
    178 			// in the source, report it
    179 			if atEOF && !utf8.FullRune(src[nsrc:]) {
    180 				e = transform.ErrShortSrc
    181 				break
    182 			}
    183 		}
    184 
    185 		if c, ok := d.bytes[r]; ok {
    186 			dst[ndst] = c
    187 		} else {
    188 			dst[ndst] = d.replace
    189 		}
    190 		nsrc += sz
    191 		ndst++
    192 	}
    193 
    194 	return ndst, nsrc, e
    195 }