gemini-browser

A text-based gemini browser
git clone git://git.laack.co/gemini-browser.git
Log | Files | Refs | README

forminfo.go (8913B)


      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package norm
      6 
      7 import "encoding/binary"
      8 
      9 // This file contains Form-specific logic and wrappers for data in tables.go.
     10 
     11 // Rune info is stored in a separate trie per composing form. A composing form
     12 // and its corresponding decomposing form share the same trie.  Each trie maps
     13 // a rune to a uint16. The values take two forms.  For v >= 0x8000:
     14 //   bits
     15 //   15:    1 (inverse of NFD_QC bit of qcInfo)
     16 //   13..8: qcInfo (see below). isYesD is always true (no decomposition).
     17 //    7..0: ccc (compressed CCC value).
     18 // For v < 0x8000, the respective rune has a decomposition and v is an index
     19 // into a byte array of UTF-8 decomposition sequences and additional info and
     20 // has the form:
     21 //    <header> <decomp_byte>* [<tccc> [<lccc>]]
     22 // The header contains the number of bytes in the decomposition (excluding this
     23 // length byte). The two most significant bits of this length byte correspond
     24 // to bit 5 and 4 of qcInfo (see below).  The byte sequence itself starts at v+1.
     25 // The byte sequence is followed by a trailing and leading CCC if the values
     26 // for these are not zero.  The value of v determines which ccc are appended
     27 // to the sequences.  For v < firstCCC, there are none, for v >= firstCCC,
     28 // the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
     29 // there is an additional leading ccc. The value of tccc itself is the
     30 // trailing CCC shifted left 2 bits. The two least-significant bits of tccc
     31 // are the number of trailing non-starters.
     32 
// Masks used when unpacking qcInfo values and the header byte that precedes
// each decomposition sequence in the decomps table.
const (
	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo
	headerLenMask   = 0x3F // extract the length value from the header byte
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
     38 
// Properties provides access to normalization properties of a rune.
//
// The ccc and tccc fields hold compressed values that are used as indexes
// into the ccc table (see CCC, LeadCCC and TrailCCC), not raw combining
// classes.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // offset of the decomposition header in decomps; 0 if none
}
     49 
// lookupFunc is the rune-property lookup that is dispatched per form. It
// returns the Properties for the rune found in b at position i.
// NOTE(review): i is presumably a byte offset into b; confirm against the
// charinfo methods of the input type.
type lookupFunc func(b input, i int) Properties
     52 
// formInfo holds Form-specific functions and tables.
//
// form identifies the normalization form, composing and compatibility
// describe its type, info is the property lookup used for the form, and
// nextMain is the form's iteration function (iterFunc is declared elsewhere
// in the package).
type formInfo struct {
	form                     Form
	composing, compatibility bool // form type
	info                     lookupFunc
	nextMain                 iterFunc
}
     60 
// formTable lists the configuration of each normalization form, in the order
// NFC, NFD, NFKC, NFKD. A composing form and its decomposing counterpart
// share the same lookup function (lookupInfoNFC for NFC/NFD, lookupInfoNFKC
// for NFKC/NFKD) and differ only in the composing flag and in nextMain.
// NOTE(review): presumably indexed by Form value; confirm that this order
// matches the Form constants.
var formTable = []*formInfo{{
	form:          NFC,
	composing:     true,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextComposed,
}, {
	form:          NFD,
	composing:     false,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextDecomposed,
}, {
	form:          NFKC,
	composing:     true,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextComposed,
}, {
	form:          NFKD,
	composing:     false,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextDecomposed,
}}
     86 
     87 // We do not distinguish between boundaries for NFC, NFD, etc. to avoid
     88 // unexpected behavior for the user.  For example, in NFD, there is a boundary
     89 // after 'a'.  However, 'a' might combine with modifiers, so from the application's
     90 // perspective it is not a good boundary. We will therefore always use the
     91 // boundaries for the combining variants.
     92 
     93 // BoundaryBefore returns true if this rune starts a new segment and
     94 // cannot combine with any rune on the left.
     95 func (p Properties) BoundaryBefore() bool {
     96 	if p.ccc == 0 && !p.combinesBackward() {
     97 		return true
     98 	}
     99 	// We assume that the CCC of the first character in a decomposition
    100 	// is always non-zero if different from info.ccc and that we can return
    101 	// false at this point. This is verified by maketables.
    102 	return false
    103 }
    104 
    105 // BoundaryAfter returns true if runes cannot combine with or otherwise
    106 // interact with this or previous runes.
    107 func (p Properties) BoundaryAfter() bool {
    108 	// TODO: loosen these conditions.
    109 	return p.isInert()
    110 }
    111 
// qcInfo holds quick check data packed in 6 bits:
//
//	5:    Combines forward  (0 == false, 1 == true)
//	4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//	2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//	1..0: Number of trailing non-starters.
//
// When all 6 bits are zero, the character is inert, meaning it is never
// influenced by normalization.
type qcInfo uint8
    122 
    123 func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
    124 func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
    125 
    126 func (p Properties) combinesForward() bool  { return p.flags&0x20 != 0 }
    127 func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
    128 func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
    129 
    130 func (p Properties) isInert() bool {
    131 	return p.flags&qcInfoMask == 0 && p.ccc == 0
    132 }
    133 
    134 func (p Properties) multiSegment() bool {
    135 	return p.index >= firstMulti && p.index < endMulti
    136 }
    137 
    138 func (p Properties) nLeadingNonStarters() uint8 {
    139 	return p.nLead
    140 }
    141 
    142 func (p Properties) nTrailingNonStarters() uint8 {
    143 	return uint8(p.flags & 0x03)
    144 }
    145 
    146 // Decomposition returns the decomposition for the underlying rune
    147 // or nil if there is none.
    148 func (p Properties) Decomposition() []byte {
    149 	// TODO: create the decomposition for Hangul?
    150 	if p.index == 0 {
    151 		return nil
    152 	}
    153 	i := p.index
    154 	n := decomps[i] & headerLenMask
    155 	i++
    156 	return decomps[i : i+uint16(n)]
    157 }
    158 
    159 // Size returns the length of UTF-8 encoding of the rune.
    160 func (p Properties) Size() int {
    161 	return int(p.size)
    162 }
    163 
    164 // CCC returns the canonical combining class of the underlying rune.
    165 func (p Properties) CCC() uint8 {
    166 	if p.index >= firstCCCZeroExcept {
    167 		return 0
    168 	}
    169 	return ccc[p.ccc]
    170 }
    171 
    172 // LeadCCC returns the CCC of the first rune in the decomposition.
    173 // If there is no decomposition, LeadCCC equals CCC.
    174 func (p Properties) LeadCCC() uint8 {
    175 	return ccc[p.ccc]
    176 }
    177 
    178 // TrailCCC returns the CCC of the last rune in the decomposition.
    179 // If there is no decomposition, TrailCCC equals CCC.
    180 func (p Properties) TrailCCC() uint8 {
    181 	return ccc[p.tccc]
    182 }
    183 
    184 func buildRecompMap() {
    185 	recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
    186 	var buf [8]byte
    187 	for i := 0; i < len(recompMapPacked); i += 8 {
    188 		copy(buf[:], recompMapPacked[i:i+8])
    189 		key := binary.BigEndian.Uint32(buf[:4])
    190 		val := binary.BigEndian.Uint32(buf[4:])
    191 		recompMap[key] = rune(val)
    192 	}
    193 }
    194 
    195 // Recomposition
    196 // We use 32-bit keys instead of 64-bit for the two codepoint keys.
    197 // This clips off the bits of three entries, but we know this will not
    198 // result in a collision. In the unlikely event that changes to
    199 // UnicodeData.txt introduce collisions, the compiler will catch it.
    200 // Note that the recomposition map for NFC and NFKC are identical.
    201 
    202 // combine returns the combined rune or 0 if it doesn't exist.
    203 //
    204 // The caller is responsible for calling
    205 // recompMapOnce.Do(buildRecompMap) sometime before this is called.
    206 func combine(a, b rune) rune {
    207 	key := uint32(uint16(a))<<16 + uint32(uint16(b))
    208 	if recompMap == nil {
    209 		panic("caller error") // see func comment
    210 	}
    211 	return recompMap[key]
    212 }
    213 
    214 func lookupInfoNFC(b input, i int) Properties {
    215 	v, sz := b.charinfoNFC(i)
    216 	return compInfo(v, sz)
    217 }
    218 
    219 func lookupInfoNFKC(b input, i int) Properties {
    220 	v, sz := b.charinfoNFKC(i)
    221 	return compInfo(v, sz)
    222 }
    223 
    224 // Properties returns properties for the first rune in s.
    225 func (f Form) Properties(s []byte) Properties {
    226 	if f == NFC || f == NFD {
    227 		return compInfo(nfcData.lookup(s))
    228 	}
    229 	return compInfo(nfkcData.lookup(s))
    230 }
    231 
    232 // PropertiesString returns properties for the first rune in s.
    233 func (f Form) PropertiesString(s string) Properties {
    234 	if f == NFC || f == NFD {
    235 		return compInfo(nfcData.lookupString(s))
    236 	}
    237 	return compInfo(nfkcData.lookupString(s))
    238 }
    239 
    240 // compInfo converts the information contained in v and sz
    241 // to a Properties.  See the comment at the top of the file
    242 // for more information on the format.
    243 func compInfo(v uint16, sz int) Properties {
    244 	if v == 0 {
    245 		return Properties{size: uint8(sz)}
    246 	} else if v >= 0x8000 {
    247 		p := Properties{
    248 			size:  uint8(sz),
    249 			ccc:   uint8(v),
    250 			tccc:  uint8(v),
    251 			flags: qcInfo(v >> 8),
    252 		}
    253 		if p.ccc > 0 || p.combinesBackward() {
    254 			p.nLead = uint8(p.flags & 0x3)
    255 		}
    256 		return p
    257 	}
    258 	// has decomposition
    259 	h := decomps[v]
    260 	f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
    261 	p := Properties{size: uint8(sz), flags: f, index: v}
    262 	if v >= firstCCC {
    263 		v += uint16(h&headerLenMask) + 1
    264 		c := decomps[v]
    265 		p.tccc = c >> 2
    266 		p.flags |= qcInfo(c & 0x3)
    267 		if v >= firstLeadingCCC {
    268 			p.nLead = c & 0x3
    269 			if v >= firstStarterWithNLead {
    270 				// We were tricked. Remove the decomposition.
    271 				p.flags &= 0x03
    272 				p.index = 0
    273 				return p
    274 			}
    275 			p.ccc = decomps[v+1]
    276 		}
    277 	}
    278 	return p
    279 }