gemini-browser

A text-based gemini browser
git clone git://git.laack.co/gemini-browser.git
Log | Files | Refs | README

composition.go (14452B)


      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package norm
      6 
      7 import "unicode/utf8"
      8 
      9 const (
     10 	maxNonStarters = 30
     11 	// The maximum number of characters needed for a buffer is
     12 	// maxNonStarters + 1 for the starter + 1 for the GCJ
     13 	maxBufferSize    = maxNonStarters + 2
     14 	maxNFCExpansion  = 3  // NFC(0x1D160)
     15 	maxNFKCExpansion = 18 // NFKC(0xFDFA)
     16 
     17 	maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
     18 )
     19 
     20 // ssState is used for reporting the segment state after inserting a rune.
     21 // It is returned by streamSafe.next.
     22 type ssState int
     23 
     24 const (
     25 	// Indicates a rune was successfully added to the segment.
     26 	ssSuccess ssState = iota
     27 	// Indicates a rune starts a new segment and should not be added.
     28 	ssStarter
     29 	// Indicates a rune caused a segment overflow and a CGJ should be inserted.
     30 	ssOverflow
     31 )
     32 
     33 // streamSafe implements the policy of when a CGJ should be inserted.
     34 type streamSafe uint8
     35 
     36 // first inserts the first rune of a segment. It is a faster version of next if
     37 // it is known p represents the first rune in a segment.
     38 func (ss *streamSafe) first(p Properties) {
     39 	*ss = streamSafe(p.nTrailingNonStarters())
     40 }
     41 
     42 // insert returns a ssState value to indicate whether a rune represented by p
     43 // can be inserted.
     44 func (ss *streamSafe) next(p Properties) ssState {
     45 	if *ss > maxNonStarters {
     46 		panic("streamSafe was not reset")
     47 	}
     48 	n := p.nLeadingNonStarters()
     49 	if *ss += streamSafe(n); *ss > maxNonStarters {
     50 		*ss = 0
     51 		return ssOverflow
     52 	}
     53 	// The Stream-Safe Text Processing prescribes that the counting can stop
     54 	// as soon as a starter is encountered. However, there are some starters,
     55 	// like Jamo V and T, that can combine with other runes, leaving their
     56 	// successive non-starters appended to the previous, possibly causing an
     57 	// overflow. We will therefore consider any rune with a non-zero nLead to
     58 	// be a non-starter. Note that it always hold that if nLead > 0 then
     59 	// nLead == nTrail.
     60 	if n == 0 {
     61 		*ss = streamSafe(p.nTrailingNonStarters())
     62 		return ssStarter
     63 	}
     64 	return ssSuccess
     65 }
     66 
     67 // backwards is used for checking for overflow and segment starts
     68 // when traversing a string backwards. Users do not need to call first
     69 // for the first rune. The state of the streamSafe retains the count of
     70 // the non-starters loaded.
     71 func (ss *streamSafe) backwards(p Properties) ssState {
     72 	if *ss > maxNonStarters {
     73 		panic("streamSafe was not reset")
     74 	}
     75 	c := *ss + streamSafe(p.nTrailingNonStarters())
     76 	if c > maxNonStarters {
     77 		return ssOverflow
     78 	}
     79 	*ss = c
     80 	if p.nLeadingNonStarters() == 0 {
     81 		return ssStarter
     82 	}
     83 	return ssSuccess
     84 }
     85 
     86 func (ss streamSafe) isMax() bool {
     87 	return ss == maxNonStarters
     88 }
     89 
     90 // GraphemeJoiner is inserted after maxNonStarters non-starter runes.
     91 const GraphemeJoiner = "\u034F"
     92 
     93 // reorderBuffer is used to normalize a single segment.  Characters inserted with
     94 // insert are decomposed and reordered based on CCC. The compose method can
     95 // be used to recombine characters.  Note that the byte buffer does not hold
     96 // the UTF-8 characters in order.  Only the rune array is maintained in sorted
     97 // order. flush writes the resulting segment to a byte array.
     98 type reorderBuffer struct {
     99 	rune  [maxBufferSize]Properties // Per character info.
    100 	byte  [maxByteBufferSize]byte   // UTF-8 buffer. Referenced by runeInfo.pos.
    101 	nbyte uint8                     // Number or bytes.
    102 	ss    streamSafe                // For limiting length of non-starter sequence.
    103 	nrune int                       // Number of runeInfos.
    104 	f     formInfo
    105 
    106 	src      input
    107 	nsrc     int
    108 	tmpBytes input
    109 
    110 	out    []byte
    111 	flushF func(*reorderBuffer) bool
    112 }
    113 
    114 func (rb *reorderBuffer) init(f Form, src []byte) {
    115 	rb.f = *formTable[f]
    116 	rb.src.setBytes(src)
    117 	rb.nsrc = len(src)
    118 	rb.ss = 0
    119 }
    120 
    121 func (rb *reorderBuffer) initString(f Form, src string) {
    122 	rb.f = *formTable[f]
    123 	rb.src.setString(src)
    124 	rb.nsrc = len(src)
    125 	rb.ss = 0
    126 }
    127 
    128 func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) {
    129 	rb.out = out
    130 	rb.flushF = f
    131 }
    132 
    133 // reset discards all characters from the buffer.
    134 func (rb *reorderBuffer) reset() {
    135 	rb.nrune = 0
    136 	rb.nbyte = 0
    137 }
    138 
    139 func (rb *reorderBuffer) doFlush() bool {
    140 	if rb.f.composing {
    141 		rb.compose()
    142 	}
    143 	res := rb.flushF(rb)
    144 	rb.reset()
    145 	return res
    146 }
    147 
    148 // appendFlush appends the normalized segment to rb.out.
    149 func appendFlush(rb *reorderBuffer) bool {
    150 	for i := 0; i < rb.nrune; i++ {
    151 		start := rb.rune[i].pos
    152 		end := start + rb.rune[i].size
    153 		rb.out = append(rb.out, rb.byte[start:end]...)
    154 	}
    155 	return true
    156 }
    157 
    158 // flush appends the normalized segment to out and resets rb.
    159 func (rb *reorderBuffer) flush(out []byte) []byte {
    160 	for i := 0; i < rb.nrune; i++ {
    161 		start := rb.rune[i].pos
    162 		end := start + rb.rune[i].size
    163 		out = append(out, rb.byte[start:end]...)
    164 	}
    165 	rb.reset()
    166 	return out
    167 }
    168 
    169 // flushCopy copies the normalized segment to buf and resets rb.
    170 // It returns the number of bytes written to buf.
    171 func (rb *reorderBuffer) flushCopy(buf []byte) int {
    172 	p := 0
    173 	for i := 0; i < rb.nrune; i++ {
    174 		runep := rb.rune[i]
    175 		p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
    176 	}
    177 	rb.reset()
    178 	return p
    179 }
    180 
    181 // insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
    182 // It returns false if the buffer is not large enough to hold the rune.
    183 // It is used internally by insert and insertString only.
    184 func (rb *reorderBuffer) insertOrdered(info Properties) {
    185 	n := rb.nrune
    186 	b := rb.rune[:]
    187 	cc := info.ccc
    188 	if cc > 0 {
    189 		// Find insertion position + move elements to make room.
    190 		for ; n > 0; n-- {
    191 			if b[n-1].ccc <= cc {
    192 				break
    193 			}
    194 			b[n] = b[n-1]
    195 		}
    196 	}
    197 	rb.nrune += 1
    198 	pos := uint8(rb.nbyte)
    199 	rb.nbyte += utf8.UTFMax
    200 	info.pos = pos
    201 	b[n] = info
    202 }
    203 
    204 // insertErr is an error code returned by insert. Using this type instead
    205 // of error improves performance up to 20% for many of the benchmarks.
    206 type insertErr int
    207 
    208 const (
    209 	iSuccess insertErr = -iota
    210 	iShortDst
    211 	iShortSrc
    212 )
    213 
    214 // insertFlush inserts the given rune in the buffer ordered by CCC.
    215 // If a decomposition with multiple segments are encountered, they leading
    216 // ones are flushed.
    217 // It returns a non-zero error code if the rune was not inserted.
    218 func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr {
    219 	if rune := src.hangul(i); rune != 0 {
    220 		rb.decomposeHangul(rune)
    221 		return iSuccess
    222 	}
    223 	if info.hasDecomposition() {
    224 		return rb.insertDecomposed(info.Decomposition())
    225 	}
    226 	rb.insertSingle(src, i, info)
    227 	return iSuccess
    228 }
    229 
    230 // insertUnsafe inserts the given rune in the buffer ordered by CCC.
    231 // It is assumed there is sufficient space to hold the runes. It is the
    232 // responsibility of the caller to ensure this. This can be done by checking
    233 // the state returned by the streamSafe type.
    234 func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) {
    235 	if rune := src.hangul(i); rune != 0 {
    236 		rb.decomposeHangul(rune)
    237 	}
    238 	if info.hasDecomposition() {
    239 		// TODO: inline.
    240 		rb.insertDecomposed(info.Decomposition())
    241 	} else {
    242 		rb.insertSingle(src, i, info)
    243 	}
    244 }
    245 
    246 // insertDecomposed inserts an entry in to the reorderBuffer for each rune
    247 // in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
    248 // It flushes the buffer on each new segment start.
    249 func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
    250 	rb.tmpBytes.setBytes(dcomp)
    251 	// As the streamSafe accounting already handles the counting for modifiers,
    252 	// we don't have to call next. However, we do need to keep the accounting
    253 	// intact when flushing the buffer.
    254 	for i := 0; i < len(dcomp); {
    255 		info := rb.f.info(rb.tmpBytes, i)
    256 		if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {
    257 			return iShortDst
    258 		}
    259 		i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)])
    260 		rb.insertOrdered(info)
    261 	}
    262 	return iSuccess
    263 }
    264 
    265 // insertSingle inserts an entry in the reorderBuffer for the rune at
    266 // position i. info is the runeInfo for the rune at position i.
    267 func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) {
    268 	src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size))
    269 	rb.insertOrdered(info)
    270 }
    271 
    272 // insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.
    273 func (rb *reorderBuffer) insertCGJ() {
    274 	rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))})
    275 }
    276 
    277 // appendRune inserts a rune at the end of the buffer. It is used for Hangul.
    278 func (rb *reorderBuffer) appendRune(r rune) {
    279 	bn := rb.nbyte
    280 	sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
    281 	rb.nbyte += utf8.UTFMax
    282 	rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
    283 	rb.nrune++
    284 }
    285 
    286 // assignRune sets a rune at position pos. It is used for Hangul and recomposition.
    287 func (rb *reorderBuffer) assignRune(pos int, r rune) {
    288 	bn := rb.rune[pos].pos
    289 	sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
    290 	rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
    291 }
    292 
    293 // runeAt returns the rune at position n. It is used for Hangul and recomposition.
    294 func (rb *reorderBuffer) runeAt(n int) rune {
    295 	inf := rb.rune[n]
    296 	r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
    297 	return r
    298 }
    299 
    300 // bytesAt returns the UTF-8 encoding of the rune at position n.
    301 // It is used for Hangul and recomposition.
    302 func (rb *reorderBuffer) bytesAt(n int) []byte {
    303 	inf := rb.rune[n]
    304 	return rb.byte[inf.pos : int(inf.pos)+int(inf.size)]
    305 }
    306 
    307 // For Hangul we combine algorithmically, instead of using tables.
    308 const (
    309 	hangulBase  = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
    310 	hangulBase0 = 0xEA
    311 	hangulBase1 = 0xB0
    312 	hangulBase2 = 0x80
    313 
    314 	hangulEnd  = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
    315 	hangulEnd0 = 0xED
    316 	hangulEnd1 = 0x9E
    317 	hangulEnd2 = 0xA4
    318 
    319 	jamoLBase  = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
    320 	jamoLBase0 = 0xE1
    321 	jamoLBase1 = 0x84
    322 	jamoLEnd   = 0x1113
    323 	jamoVBase  = 0x1161
    324 	jamoVEnd   = 0x1176
    325 	jamoTBase  = 0x11A7
    326 	jamoTEnd   = 0x11C3
    327 
    328 	jamoTCount   = 28
    329 	jamoVCount   = 21
    330 	jamoVTCount  = 21 * 28
    331 	jamoLVTCount = 19 * 21 * 28
    332 )
    333 
    334 const hangulUTF8Size = 3
    335 
    336 func isHangul(b []byte) bool {
    337 	if len(b) < hangulUTF8Size {
    338 		return false
    339 	}
    340 	b0 := b[0]
    341 	if b0 < hangulBase0 {
    342 		return false
    343 	}
    344 	b1 := b[1]
    345 	switch {
    346 	case b0 == hangulBase0:
    347 		return b1 >= hangulBase1
    348 	case b0 < hangulEnd0:
    349 		return true
    350 	case b0 > hangulEnd0:
    351 		return false
    352 	case b1 < hangulEnd1:
    353 		return true
    354 	}
    355 	return b1 == hangulEnd1 && b[2] < hangulEnd2
    356 }
    357 
    358 func isHangulString(b string) bool {
    359 	if len(b) < hangulUTF8Size {
    360 		return false
    361 	}
    362 	b0 := b[0]
    363 	if b0 < hangulBase0 {
    364 		return false
    365 	}
    366 	b1 := b[1]
    367 	switch {
    368 	case b0 == hangulBase0:
    369 		return b1 >= hangulBase1
    370 	case b0 < hangulEnd0:
    371 		return true
    372 	case b0 > hangulEnd0:
    373 		return false
    374 	case b1 < hangulEnd1:
    375 		return true
    376 	}
    377 	return b1 == hangulEnd1 && b[2] < hangulEnd2
    378 }
    379 
    380 // Caller must ensure len(b) >= 2.
    381 func isJamoVT(b []byte) bool {
    382 	// True if (rune & 0xff00) == jamoLBase
    383 	return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
    384 }
    385 
    386 func isHangulWithoutJamoT(b []byte) bool {
    387 	c, _ := utf8.DecodeRune(b)
    388 	c -= hangulBase
    389 	return c < jamoLVTCount && c%jamoTCount == 0
    390 }
    391 
    392 // decomposeHangul writes the decomposed Hangul to buf and returns the number
    393 // of bytes written.  len(buf) should be at least 9.
    394 func decomposeHangul(buf []byte, r rune) int {
    395 	const JamoUTF8Len = 3
    396 	r -= hangulBase
    397 	x := r % jamoTCount
    398 	r /= jamoTCount
    399 	utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
    400 	utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
    401 	if x != 0 {
    402 		utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
    403 		return 3 * JamoUTF8Len
    404 	}
    405 	return 2 * JamoUTF8Len
    406 }
    407 
    408 // decomposeHangul algorithmically decomposes a Hangul rune into
    409 // its Jamo components.
    410 // See https://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
    411 func (rb *reorderBuffer) decomposeHangul(r rune) {
    412 	r -= hangulBase
    413 	x := r % jamoTCount
    414 	r /= jamoTCount
    415 	rb.appendRune(jamoLBase + r/jamoVCount)
    416 	rb.appendRune(jamoVBase + r%jamoVCount)
    417 	if x != 0 {
    418 		rb.appendRune(jamoTBase + x)
    419 	}
    420 }
    421 
    422 // combineHangul algorithmically combines Jamo character components into Hangul.
    423 // See https://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
    424 func (rb *reorderBuffer) combineHangul(s, i, k int) {
    425 	b := rb.rune[:]
    426 	bn := rb.nrune
    427 	for ; i < bn; i++ {
    428 		cccB := b[k-1].ccc
    429 		cccC := b[i].ccc
    430 		if cccB == 0 {
    431 			s = k - 1
    432 		}
    433 		if s != k-1 && cccB >= cccC {
    434 			// b[i] is blocked by greater-equal cccX below it
    435 			b[k] = b[i]
    436 			k++
    437 		} else {
    438 			l := rb.runeAt(s) // also used to compare to hangulBase
    439 			v := rb.runeAt(i) // also used to compare to jamoT
    440 			switch {
    441 			case jamoLBase <= l && l < jamoLEnd &&
    442 				jamoVBase <= v && v < jamoVEnd:
    443 				// 11xx plus 116x to LV
    444 				rb.assignRune(s, hangulBase+
    445 					(l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount)
    446 			case hangulBase <= l && l < hangulEnd &&
    447 				jamoTBase < v && v < jamoTEnd &&
    448 				((l-hangulBase)%jamoTCount) == 0:
    449 				// ACxx plus 11Ax to LVT
    450 				rb.assignRune(s, l+v-jamoTBase)
    451 			default:
    452 				b[k] = b[i]
    453 				k++
    454 			}
    455 		}
    456 	}
    457 	rb.nrune = k
    458 }
    459 
    460 // compose recombines the runes in the buffer.
    461 // It should only be used to recompose a single segment, as it will not
    462 // handle alternations between Hangul and non-Hangul characters correctly.
    463 func (rb *reorderBuffer) compose() {
    464 	// Lazily load the map used by the combine func below, but do
    465 	// it outside of the loop.
    466 	recompMapOnce.Do(buildRecompMap)
    467 
    468 	// UAX #15, section X5 , including Corrigendum #5
    469 	// "In any character sequence beginning with starter S, a character C is
    470 	//  blocked from S if and only if there is some character B between S
    471 	//  and C, and either B is a starter or it has the same or higher
    472 	//  combining class as C."
    473 	bn := rb.nrune
    474 	if bn == 0 {
    475 		return
    476 	}
    477 	k := 1
    478 	b := rb.rune[:]
    479 	for s, i := 0, 1; i < bn; i++ {
    480 		if isJamoVT(rb.bytesAt(i)) {
    481 			// Redo from start in Hangul mode. Necessary to support
    482 			// U+320E..U+321E in NFKC mode.
    483 			rb.combineHangul(s, i, k)
    484 			return
    485 		}
    486 		ii := b[i]
    487 		// We can only use combineForward as a filter if we later
    488 		// get the info for the combined character. This is more
    489 		// expensive than using the filter. Using combinesBackward()
    490 		// is safe.
    491 		if ii.combinesBackward() {
    492 			cccB := b[k-1].ccc
    493 			cccC := ii.ccc
    494 			blocked := false // b[i] blocked by starter or greater or equal CCC?
    495 			if cccB == 0 {
    496 				s = k - 1
    497 			} else {
    498 				blocked = s != k-1 && cccB >= cccC
    499 			}
    500 			if !blocked {
    501 				combined := combine(rb.runeAt(s), rb.runeAt(i))
    502 				if combined != 0 {
    503 					rb.assignRune(s, combined)
    504 					continue
    505 				}
    506 			}
    507 		}
    508 		b[k] = b[i]
    509 		k++
    510 	}
    511 	rb.nrune = k
    512 }