gemini-browser

A text-based gemini browser
git clone git://git.laack.co/gemini-browser.git
Log | Files | Refs | README

normalize.go (15238B)


      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Note: the file data_test.go that is generated should not be checked in.
      6 //go:generate go run maketables.go triegen.go
      7 //go:generate go test -tags test
      8 
      9 // Package norm contains types and functions for normalizing Unicode strings.
     10 package norm // import "golang.org/x/text/unicode/norm"
     11 
     12 import (
     13 	"unicode/utf8"
     14 
     15 	"golang.org/x/text/transform"
     16 )
     17 
     18 // A Form denotes a canonical representation of Unicode code points.
     19 // The Unicode-defined normalization and equivalence forms are:
     20 //
     21 //	NFC   Unicode Normalization Form C
     22 //	NFD   Unicode Normalization Form D
     23 //	NFKC  Unicode Normalization Form KC
     24 //	NFKD  Unicode Normalization Form KD
     25 //
     26 // For a Form f, this documentation uses the notation f(x) to mean
     27 // the bytes or string x converted to the given form.
     28 // A position n in x is called a boundary if conversion to the form can
     29 // proceed independently on both sides:
     30 //
     31 //	f(x) == append(f(x[0:n]), f(x[n:])...)
     32 //
     33 // References: https://unicode.org/reports/tr15/ and
     34 // https://unicode.org/notes/tn5/.
     35 type Form int
     36 
     37 const (
     38 	NFC Form = iota
     39 	NFD
     40 	NFKC
     41 	NFKD
     42 )
     43 
     44 // Bytes returns f(b). May return b if f(b) = b.
     45 func (f Form) Bytes(b []byte) []byte {
     46 	src := inputBytes(b)
     47 	ft := formTable[f]
     48 	n, ok := ft.quickSpan(src, 0, len(b), true)
     49 	if ok {
     50 		return b
     51 	}
     52 	out := make([]byte, n, len(b))
     53 	copy(out, b[0:n])
     54 	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
     55 	return doAppendInner(&rb, n)
     56 }
     57 
     58 // String returns f(s).
     59 func (f Form) String(s string) string {
     60 	src := inputString(s)
     61 	ft := formTable[f]
     62 	n, ok := ft.quickSpan(src, 0, len(s), true)
     63 	if ok {
     64 		return s
     65 	}
     66 	out := make([]byte, n, len(s))
     67 	copy(out, s[0:n])
     68 	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
     69 	return string(doAppendInner(&rb, n))
     70 }
     71 
     72 // IsNormal returns true if b == f(b).
     73 func (f Form) IsNormal(b []byte) bool {
     74 	src := inputBytes(b)
     75 	ft := formTable[f]
     76 	bp, ok := ft.quickSpan(src, 0, len(b), true)
     77 	if ok {
     78 		return true
     79 	}
     80 	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
     81 	rb.setFlusher(nil, cmpNormalBytes)
     82 	for bp < len(b) {
     83 		rb.out = b[bp:]
     84 		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
     85 			return false
     86 		}
     87 		bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
     88 	}
     89 	return true
     90 }
     91 
     92 func cmpNormalBytes(rb *reorderBuffer) bool {
     93 	b := rb.out
     94 	for i := 0; i < rb.nrune; i++ {
     95 		info := rb.rune[i]
     96 		if int(info.size) > len(b) {
     97 			return false
     98 		}
     99 		p := info.pos
    100 		pe := p + info.size
    101 		for ; p < pe; p++ {
    102 			if b[0] != rb.byte[p] {
    103 				return false
    104 			}
    105 			b = b[1:]
    106 		}
    107 	}
    108 	return true
    109 }
    110 
    111 // IsNormalString returns true if s == f(s).
    112 func (f Form) IsNormalString(s string) bool {
    113 	src := inputString(s)
    114 	ft := formTable[f]
    115 	bp, ok := ft.quickSpan(src, 0, len(s), true)
    116 	if ok {
    117 		return true
    118 	}
    119 	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
    120 	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
    121 		for i := 0; i < rb.nrune; i++ {
    122 			info := rb.rune[i]
    123 			if bp+int(info.size) > len(s) {
    124 				return false
    125 			}
    126 			p := info.pos
    127 			pe := p + info.size
    128 			for ; p < pe; p++ {
    129 				if s[bp] != rb.byte[p] {
    130 					return false
    131 				}
    132 				bp++
    133 			}
    134 		}
    135 		return true
    136 	})
    137 	for bp < len(s) {
    138 		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
    139 			return false
    140 		}
    141 		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
    142 	}
    143 	return true
    144 }
    145 
    146 // patchTail fixes a case where a rune may be incorrectly normalized
    147 // if it is followed by illegal continuation bytes. It returns the
    148 // patched buffer and whether the decomposition is still in progress.
    149 func patchTail(rb *reorderBuffer) bool {
    150 	info, p := lastRuneStart(&rb.f, rb.out)
    151 	if p == -1 || info.size == 0 {
    152 		return true
    153 	}
    154 	end := p + int(info.size)
    155 	extra := len(rb.out) - end
    156 	if extra > 0 {
    157 		// Potentially allocating memory. However, this only
    158 		// happens with ill-formed UTF-8.
    159 		x := make([]byte, 0)
    160 		x = append(x, rb.out[len(rb.out)-extra:]...)
    161 		rb.out = rb.out[:end]
    162 		decomposeToLastBoundary(rb)
    163 		rb.doFlush()
    164 		rb.out = append(rb.out, x...)
    165 		return false
    166 	}
    167 	buf := rb.out[p:]
    168 	rb.out = rb.out[:p]
    169 	decomposeToLastBoundary(rb)
    170 	if s := rb.ss.next(info); s == ssStarter {
    171 		rb.doFlush()
    172 		rb.ss.first(info)
    173 	} else if s == ssOverflow {
    174 		rb.doFlush()
    175 		rb.insertCGJ()
    176 		rb.ss = 0
    177 	}
    178 	rb.insertUnsafe(inputBytes(buf), 0, info)
    179 	return true
    180 }
    181 
    182 func appendQuick(rb *reorderBuffer, i int) int {
    183 	if rb.nsrc == i {
    184 		return i
    185 	}
    186 	end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
    187 	rb.out = rb.src.appendSlice(rb.out, i, end)
    188 	return end
    189 }
    190 
    191 // Append returns f(append(out, b...)).
    192 // The buffer out must be nil, empty, or equal to f(out).
    193 func (f Form) Append(out []byte, src ...byte) []byte {
    194 	return f.doAppend(out, inputBytes(src), len(src))
    195 }
    196 
    197 func (f Form) doAppend(out []byte, src input, n int) []byte {
    198 	if n == 0 {
    199 		return out
    200 	}
    201 	ft := formTable[f]
    202 	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
    203 	if len(out) == 0 {
    204 		p, _ := ft.quickSpan(src, 0, n, true)
    205 		out = src.appendSlice(out, 0, p)
    206 		if p == n {
    207 			return out
    208 		}
    209 		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
    210 		return doAppendInner(&rb, p)
    211 	}
    212 	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
    213 	return doAppend(&rb, out, 0)
    214 }
    215 
    216 func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
    217 	rb.setFlusher(out, appendFlush)
    218 	src, n := rb.src, rb.nsrc
    219 	doMerge := len(out) > 0
    220 	if q := src.skipContinuationBytes(p); q > p {
    221 		// Move leading non-starters to destination.
    222 		rb.out = src.appendSlice(rb.out, p, q)
    223 		p = q
    224 		doMerge = patchTail(rb)
    225 	}
    226 	fd := &rb.f
    227 	if doMerge {
    228 		var info Properties
    229 		if p < n {
    230 			info = fd.info(src, p)
    231 			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
    232 				if p == 0 {
    233 					decomposeToLastBoundary(rb)
    234 				}
    235 				p = decomposeSegment(rb, p, true)
    236 			}
    237 		}
    238 		if info.size == 0 {
    239 			rb.doFlush()
    240 			// Append incomplete UTF-8 encoding.
    241 			return src.appendSlice(rb.out, p, n)
    242 		}
    243 		if rb.nrune > 0 {
    244 			return doAppendInner(rb, p)
    245 		}
    246 	}
    247 	p = appendQuick(rb, p)
    248 	return doAppendInner(rb, p)
    249 }
    250 
    251 func doAppendInner(rb *reorderBuffer, p int) []byte {
    252 	for n := rb.nsrc; p < n; {
    253 		p = decomposeSegment(rb, p, true)
    254 		p = appendQuick(rb, p)
    255 	}
    256 	return rb.out
    257 }
    258 
    259 // AppendString returns f(append(out, []byte(s))).
    260 // The buffer out must be nil, empty, or equal to f(out).
    261 func (f Form) AppendString(out []byte, src string) []byte {
    262 	return f.doAppend(out, inputString(src), len(src))
    263 }
    264 
    265 // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
    266 // It is not guaranteed to return the largest such n.
    267 func (f Form) QuickSpan(b []byte) int {
    268 	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
    269 	return n
    270 }
    271 
    272 // Span implements transform.SpanningTransformer. It returns a boundary n such
    273 // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
    274 func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
    275 	n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
    276 	if n < len(b) {
    277 		if !ok {
    278 			err = transform.ErrEndOfSpan
    279 		} else {
    280 			err = transform.ErrShortSrc
    281 		}
    282 	}
    283 	return n, err
    284 }
    285 
    286 // SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
    287 // It is not guaranteed to return the largest such n.
    288 func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
    289 	n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
    290 	if n < len(s) {
    291 		if !ok {
    292 			err = transform.ErrEndOfSpan
    293 		} else {
    294 			err = transform.ErrShortSrc
    295 		}
    296 	}
    297 	return n, err
    298 }
    299 
    300 // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
    301 // whether any non-normalized parts were found. If atEOF is false, n will
    302 // not point past the last segment if this segment might be become
    303 // non-normalized by appending other runes.
    304 func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
    305 	var lastCC uint8
    306 	ss := streamSafe(0)
    307 	lastSegStart := i
    308 	for n = end; i < n; {
    309 		if j := src.skipASCII(i, n); i != j {
    310 			i = j
    311 			lastSegStart = i - 1
    312 			lastCC = 0
    313 			ss = 0
    314 			continue
    315 		}
    316 		info := f.info(src, i)
    317 		if info.size == 0 {
    318 			if atEOF {
    319 				// include incomplete runes
    320 				return n, true
    321 			}
    322 			return lastSegStart, true
    323 		}
    324 		// This block needs to be before the next, because it is possible to
    325 		// have an overflow for runes that are starters (e.g. with U+FF9E).
    326 		switch ss.next(info) {
    327 		case ssStarter:
    328 			lastSegStart = i
    329 		case ssOverflow:
    330 			return lastSegStart, false
    331 		case ssSuccess:
    332 			if lastCC > info.ccc {
    333 				return lastSegStart, false
    334 			}
    335 		}
    336 		if f.composing {
    337 			if !info.isYesC() {
    338 				break
    339 			}
    340 		} else {
    341 			if !info.isYesD() {
    342 				break
    343 			}
    344 		}
    345 		lastCC = info.ccc
    346 		i += int(info.size)
    347 	}
    348 	if i == n {
    349 		if !atEOF {
    350 			n = lastSegStart
    351 		}
    352 		return n, true
    353 	}
    354 	return lastSegStart, false
    355 }
    356 
    357 // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
    358 // It is not guaranteed to return the largest such n.
    359 func (f Form) QuickSpanString(s string) int {
    360 	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
    361 	return n
    362 }
    363 
    364 // FirstBoundary returns the position i of the first boundary in b
    365 // or -1 if b contains no boundary.
    366 func (f Form) FirstBoundary(b []byte) int {
    367 	return f.firstBoundary(inputBytes(b), len(b))
    368 }
    369 
    370 func (f Form) firstBoundary(src input, nsrc int) int {
    371 	i := src.skipContinuationBytes(0)
    372 	if i >= nsrc {
    373 		return -1
    374 	}
    375 	fd := formTable[f]
    376 	ss := streamSafe(0)
    377 	// We should call ss.first here, but we can't as the first rune is
    378 	// skipped already. This means FirstBoundary can't really determine
    379 	// CGJ insertion points correctly. Luckily it doesn't have to.
    380 	for {
    381 		info := fd.info(src, i)
    382 		if info.size == 0 {
    383 			return -1
    384 		}
    385 		if s := ss.next(info); s != ssSuccess {
    386 			return i
    387 		}
    388 		i += int(info.size)
    389 		if i >= nsrc {
    390 			if !info.BoundaryAfter() && !ss.isMax() {
    391 				return -1
    392 			}
    393 			return nsrc
    394 		}
    395 	}
    396 }
    397 
    398 // FirstBoundaryInString returns the position i of the first boundary in s
    399 // or -1 if s contains no boundary.
    400 func (f Form) FirstBoundaryInString(s string) int {
    401 	return f.firstBoundary(inputString(s), len(s))
    402 }
    403 
    404 // NextBoundary reports the index of the boundary between the first and next
    405 // segment in b or -1 if atEOF is false and there are not enough bytes to
    406 // determine this boundary.
    407 func (f Form) NextBoundary(b []byte, atEOF bool) int {
    408 	return f.nextBoundary(inputBytes(b), len(b), atEOF)
    409 }
    410 
    411 // NextBoundaryInString reports the index of the boundary between the first and
    412 // next segment in b or -1 if atEOF is false and there are not enough bytes to
    413 // determine this boundary.
    414 func (f Form) NextBoundaryInString(s string, atEOF bool) int {
    415 	return f.nextBoundary(inputString(s), len(s), atEOF)
    416 }
    417 
    418 func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
    419 	if nsrc == 0 {
    420 		if atEOF {
    421 			return 0
    422 		}
    423 		return -1
    424 	}
    425 	fd := formTable[f]
    426 	info := fd.info(src, 0)
    427 	if info.size == 0 {
    428 		if atEOF {
    429 			return 1
    430 		}
    431 		return -1
    432 	}
    433 	ss := streamSafe(0)
    434 	ss.first(info)
    435 
    436 	for i := int(info.size); i < nsrc; i += int(info.size) {
    437 		info = fd.info(src, i)
    438 		if info.size == 0 {
    439 			if atEOF {
    440 				return i
    441 			}
    442 			return -1
    443 		}
    444 		// TODO: Using streamSafe to determine the boundary isn't the same as
    445 		// using BoundaryBefore. Determine which should be used.
    446 		if s := ss.next(info); s != ssSuccess {
    447 			return i
    448 		}
    449 	}
    450 	if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
    451 		return -1
    452 	}
    453 	return nsrc
    454 }
    455 
    456 // LastBoundary returns the position i of the last boundary in b
    457 // or -1 if b contains no boundary.
    458 func (f Form) LastBoundary(b []byte) int {
    459 	return lastBoundary(formTable[f], b)
    460 }
    461 
    462 func lastBoundary(fd *formInfo, b []byte) int {
    463 	i := len(b)
    464 	info, p := lastRuneStart(fd, b)
    465 	if p == -1 {
    466 		return -1
    467 	}
    468 	if info.size == 0 { // ends with incomplete rune
    469 		if p == 0 { // starts with incomplete rune
    470 			return -1
    471 		}
    472 		i = p
    473 		info, p = lastRuneStart(fd, b[:i])
    474 		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
    475 			return i
    476 		}
    477 	}
    478 	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
    479 		return i
    480 	}
    481 	if info.BoundaryAfter() {
    482 		return i
    483 	}
    484 	ss := streamSafe(0)
    485 	v := ss.backwards(info)
    486 	for i = p; i >= 0 && v != ssStarter; i = p {
    487 		info, p = lastRuneStart(fd, b[:i])
    488 		if v = ss.backwards(info); v == ssOverflow {
    489 			break
    490 		}
    491 		if p+int(info.size) != i {
    492 			if p == -1 { // no boundary found
    493 				return -1
    494 			}
    495 			return i // boundary after an illegal UTF-8 encoding
    496 		}
    497 	}
    498 	return i
    499 }
    500 
    501 // decomposeSegment scans the first segment in src into rb. It inserts 0x034f
    502 // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
    503 // and returns the number of bytes consumed from src or iShortDst or iShortSrc.
    504 func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
    505 	// Force one character to be consumed.
    506 	info := rb.f.info(rb.src, sp)
    507 	if info.size == 0 {
    508 		return 0
    509 	}
    510 	if s := rb.ss.next(info); s == ssStarter {
    511 		// TODO: this could be removed if we don't support merging.
    512 		if rb.nrune > 0 {
    513 			goto end
    514 		}
    515 	} else if s == ssOverflow {
    516 		rb.insertCGJ()
    517 		goto end
    518 	}
    519 	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
    520 		return int(err)
    521 	}
    522 	for {
    523 		sp += int(info.size)
    524 		if sp >= rb.nsrc {
    525 			if !atEOF && !info.BoundaryAfter() {
    526 				return int(iShortSrc)
    527 			}
    528 			break
    529 		}
    530 		info = rb.f.info(rb.src, sp)
    531 		if info.size == 0 {
    532 			if !atEOF {
    533 				return int(iShortSrc)
    534 			}
    535 			break
    536 		}
    537 		if s := rb.ss.next(info); s == ssStarter {
    538 			break
    539 		} else if s == ssOverflow {
    540 			rb.insertCGJ()
    541 			break
    542 		}
    543 		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
    544 			return int(err)
    545 		}
    546 	}
    547 end:
    548 	if !rb.doFlush() {
    549 		return int(iShortDst)
    550 	}
    551 	return sp
    552 }
    553 
    554 // lastRuneStart returns the runeInfo and position of the last
    555 // rune in buf or the zero runeInfo and -1 if no rune was found.
    556 func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
    557 	p := len(buf) - 1
    558 	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
    559 	}
    560 	if p < 0 {
    561 		return Properties{}, -1
    562 	}
    563 	return fd.info(inputBytes(buf), p), p
    564 }
    565 
    566 // decomposeToLastBoundary finds an open segment at the end of the buffer
    567 // and scans it into rb. Returns the buffer minus the last segment.
    568 func decomposeToLastBoundary(rb *reorderBuffer) {
    569 	fd := &rb.f
    570 	info, i := lastRuneStart(fd, rb.out)
    571 	if int(info.size) != len(rb.out)-i {
    572 		// illegal trailing continuation bytes
    573 		return
    574 	}
    575 	if info.BoundaryAfter() {
    576 		return
    577 	}
    578 	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
    579 	padd := 0
    580 	ss := streamSafe(0)
    581 	p := len(rb.out)
    582 	for {
    583 		add[padd] = info
    584 		v := ss.backwards(info)
    585 		if v == ssOverflow {
    586 			// Note that if we have an overflow, it the string we are appending to
    587 			// is not correctly normalized. In this case the behavior is undefined.
    588 			break
    589 		}
    590 		padd++
    591 		p -= int(info.size)
    592 		if v == ssStarter || p < 0 {
    593 			break
    594 		}
    595 		info, i = lastRuneStart(fd, rb.out[:p])
    596 		if int(info.size) != p-i {
    597 			break
    598 		}
    599 	}
    600 	rb.ss = ss
    601 	// Copy bytes for insertion as we may need to overwrite rb.out.
    602 	var buf [maxBufferSize * utf8.UTFMax]byte
    603 	cp := buf[:copy(buf[:], rb.out[p:])]
    604 	rb.out = rb.out[:p]
    605 	for padd--; padd >= 0; padd-- {
    606 		info = add[padd]
    607 		rb.insertUnsafe(inputBytes(cp), 0, info)
    608 		cp = cp[info.size:]
    609 	}
    610 }