gemini-browser

A text-based gemini browser
git clone git://git.laack.co/gemini-browser.git
Log | Files | Refs | README

iter.go (11280B)


      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package norm
      6 
      7 import (
      8 	"fmt"
      9 	"unicode/utf8"
     10 )
     11 
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
// It is the exported name for maxByteBufferSize, which is also the size of the
// scratch buffer embedded in Iter.
const MaxSegmentSize = maxByteBufferSize
     15 
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
//
// Init or InitString must be called before Next: the zero value has a nil
// iteration function. Segments returned by Next may alias either the input
// or the internal scratch buffer, which is overwritten by later calls.
type Iter struct {
	// rb carries the input (rb.src, rb.nsrc), the form data (rb.f), the
	// stream-safe state (rb.ss) and the buffer used when runes have to be
	// reordered or composed.
	rb     reorderBuffer
	// buf is scratch space for segments that cannot be returned as a
	// sub-slice of the input.
	buf    [maxByteBufferSize]byte
	info   Properties // first character saved from previous iteration
	next   iterFunc   // implementation of next depends on form
	// asciiF is the ASCII fast path matching the input type:
	// nextASCIIBytes for Init, nextASCIIString for InitString.
	asciiF iterFunc

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}
     28 
// iterFunc is the signature of the step functions that Next dispatches to.
// A step returns the next segment and may assign a different step to the
// Iter's next field to select the function used by the following call.
type iterFunc func(*Iter) []byte
     30 
     31 // Init initializes i to iterate over src after normalizing it to Form f.
     32 func (i *Iter) Init(f Form, src []byte) {
     33 	i.p = 0
     34 	if len(src) == 0 {
     35 		i.setDone()
     36 		i.rb.nsrc = 0
     37 		return
     38 	}
     39 	i.multiSeg = nil
     40 	i.rb.init(f, src)
     41 	i.next = i.rb.f.nextMain
     42 	i.asciiF = nextASCIIBytes
     43 	i.info = i.rb.f.info(i.rb.src, i.p)
     44 	i.rb.ss.first(i.info)
     45 }
     46 
     47 // InitString initializes i to iterate over src after normalizing it to Form f.
     48 func (i *Iter) InitString(f Form, src string) {
     49 	i.p = 0
     50 	if len(src) == 0 {
     51 		i.setDone()
     52 		i.rb.nsrc = 0
     53 		return
     54 	}
     55 	i.multiSeg = nil
     56 	i.rb.initString(f, src)
     57 	i.next = i.rb.f.nextMain
     58 	i.asciiF = nextASCIIString
     59 	i.info = i.rb.f.info(i.rb.src, i.p)
     60 	i.rb.ss.first(i.info)
     61 }
     62 
     63 // Seek sets the segment to be returned by the next call to Next to start
     64 // at position p.  It is the responsibility of the caller to set p to the
     65 // start of a segment.
     66 func (i *Iter) Seek(offset int64, whence int) (int64, error) {
     67 	var abs int64
     68 	switch whence {
     69 	case 0:
     70 		abs = offset
     71 	case 1:
     72 		abs = int64(i.p) + offset
     73 	case 2:
     74 		abs = int64(i.rb.nsrc) + offset
     75 	default:
     76 		return 0, fmt.Errorf("norm: invalid whence")
     77 	}
     78 	if abs < 0 {
     79 		return 0, fmt.Errorf("norm: negative position")
     80 	}
     81 	if int(abs) >= i.rb.nsrc {
     82 		i.setDone()
     83 		return int64(i.p), nil
     84 	}
     85 	i.p = int(abs)
     86 	i.multiSeg = nil
     87 	i.next = i.rb.f.nextMain
     88 	i.info = i.rb.f.info(i.rb.src, i.p)
     89 	i.rb.ss.first(i.info)
     90 	return abs, nil
     91 }
     92 
     93 // returnSlice returns a slice of the underlying input type as a byte slice.
     94 // If the underlying is of type []byte, it will simply return a slice.
     95 // If the underlying is of type string, it will copy the slice to the buffer
     96 // and return that.
     97 func (i *Iter) returnSlice(a, b int) []byte {
     98 	if i.rb.src.bytes == nil {
     99 		return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
    100 	}
    101 	return i.rb.src.bytes[a:b]
    102 }
    103 
// Pos returns the byte position at which the next call to Next will commence processing.
// The position is an offset into the input given to Init or InitString.
func (i *Iter) Pos() int {
	return i.p
}
    108 
    109 func (i *Iter) setDone() {
    110 	i.next = nextDone
    111 	i.p = i.rb.nsrc
    112 }
    113 
    114 // Done returns true if there is no more input to process.
    115 func (i *Iter) Done() bool {
    116 	return i.p >= i.rb.nsrc
    117 }
    118 
    119 // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
    120 // For any input a and b for which f(a) == f(b), subsequent calls
    121 // to Next will return the same segments.
    122 // Modifying runes are grouped together with the preceding starter, if such a starter exists.
    123 // Although not guaranteed, n will typically be the smallest possible n.
    124 func (i *Iter) Next() []byte {
    125 	return i.next(i)
    126 }
    127 
    128 func nextASCIIBytes(i *Iter) []byte {
    129 	p := i.p + 1
    130 	if p >= i.rb.nsrc {
    131 		p0 := i.p
    132 		i.setDone()
    133 		return i.rb.src.bytes[p0:p]
    134 	}
    135 	if i.rb.src.bytes[p] < utf8.RuneSelf {
    136 		p0 := i.p
    137 		i.p = p
    138 		return i.rb.src.bytes[p0:p]
    139 	}
    140 	i.info = i.rb.f.info(i.rb.src, i.p)
    141 	i.next = i.rb.f.nextMain
    142 	return i.next(i)
    143 }
    144 
    145 func nextASCIIString(i *Iter) []byte {
    146 	p := i.p + 1
    147 	if p >= i.rb.nsrc {
    148 		i.buf[0] = i.rb.src.str[i.p]
    149 		i.setDone()
    150 		return i.buf[:1]
    151 	}
    152 	if i.rb.src.str[p] < utf8.RuneSelf {
    153 		i.buf[0] = i.rb.src.str[i.p]
    154 		i.p = p
    155 		return i.buf[:1]
    156 	}
    157 	i.info = i.rb.f.info(i.rb.src, i.p)
    158 	i.next = i.rb.f.nextMain
    159 	return i.next(i)
    160 }
    161 
    162 func nextHangul(i *Iter) []byte {
    163 	p := i.p
    164 	next := p + hangulUTF8Size
    165 	if next >= i.rb.nsrc {
    166 		i.setDone()
    167 	} else if i.rb.src.hangul(next) == 0 {
    168 		i.rb.ss.next(i.info)
    169 		i.info = i.rb.f.info(i.rb.src, i.p)
    170 		i.next = i.rb.f.nextMain
    171 		return i.next(i)
    172 	}
    173 	i.p = next
    174 	return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
    175 }
    176 
    177 func nextDone(i *Iter) []byte {
    178 	return nil
    179 }
    180 
    181 // nextMulti is used for iterating over multi-segment decompositions
    182 // for decomposing normal forms.
    183 func nextMulti(i *Iter) []byte {
    184 	j := 0
    185 	d := i.multiSeg
    186 	// skip first rune
    187 	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
    188 	}
    189 	for j < len(d) {
    190 		info := i.rb.f.info(input{bytes: d}, j)
    191 		if info.BoundaryBefore() {
    192 			i.multiSeg = d[j:]
    193 			return d[:j]
    194 		}
    195 		j += int(info.size)
    196 	}
    197 	// treat last segment as normal decomposition
    198 	i.next = i.rb.f.nextMain
    199 	return i.next(i)
    200 }
    201 
    202 // nextMultiNorm is used for iterating over multi-segment decompositions
    203 // for composing normal forms.
    204 func nextMultiNorm(i *Iter) []byte {
    205 	j := 0
    206 	d := i.multiSeg
    207 	for j < len(d) {
    208 		info := i.rb.f.info(input{bytes: d}, j)
    209 		if info.BoundaryBefore() {
    210 			i.rb.compose()
    211 			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
    212 			i.rb.insertUnsafe(input{bytes: d}, j, info)
    213 			i.multiSeg = d[j+int(info.size):]
    214 			return seg
    215 		}
    216 		i.rb.insertUnsafe(input{bytes: d}, j, info)
    217 		j += int(info.size)
    218 	}
    219 	i.multiSeg = nil
    220 	i.next = nextComposed
    221 	return doNormComposed(i)
    222 }
    223 
    224 // nextDecomposed is the implementation of Next for forms NFD and NFKD.
    225 func nextDecomposed(i *Iter) (next []byte) {
    226 	outp := 0
    227 	inCopyStart, outCopyStart := i.p, 0
    228 	for {
    229 		if sz := int(i.info.size); sz <= 1 {
    230 			i.rb.ss = 0
    231 			p := i.p
    232 			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
    233 			if i.p >= i.rb.nsrc {
    234 				i.setDone()
    235 				return i.returnSlice(p, i.p)
    236 			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
    237 				i.next = i.asciiF
    238 				return i.returnSlice(p, i.p)
    239 			}
    240 			outp++
    241 		} else if d := i.info.Decomposition(); d != nil {
    242 			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
    243 			// Case 1: there is a leftover to copy.  In this case the decomposition
    244 			// must begin with a modifier and should always be appended.
    245 			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
    246 			p := outp + len(d)
    247 			if outp > 0 {
    248 				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
    249 				// TODO: this condition should not be possible, but we leave it
    250 				// in for defensive purposes.
    251 				if p > len(i.buf) {
    252 					return i.buf[:outp]
    253 				}
    254 			} else if i.info.multiSegment() {
    255 				// outp must be 0 as multi-segment decompositions always
    256 				// start a new segment.
    257 				if i.multiSeg == nil {
    258 					i.multiSeg = d
    259 					i.next = nextMulti
    260 					return nextMulti(i)
    261 				}
    262 				// We are in the last segment.  Treat as normal decomposition.
    263 				d = i.multiSeg
    264 				i.multiSeg = nil
    265 				p = len(d)
    266 			}
    267 			prevCC := i.info.tccc
    268 			if i.p += sz; i.p >= i.rb.nsrc {
    269 				i.setDone()
    270 				i.info = Properties{} // Force BoundaryBefore to succeed.
    271 			} else {
    272 				i.info = i.rb.f.info(i.rb.src, i.p)
    273 			}
    274 			switch i.rb.ss.next(i.info) {
    275 			case ssOverflow:
    276 				i.next = nextCGJDecompose
    277 				fallthrough
    278 			case ssStarter:
    279 				if outp > 0 {
    280 					copy(i.buf[outp:], d)
    281 					return i.buf[:p]
    282 				}
    283 				return d
    284 			}
    285 			copy(i.buf[outp:], d)
    286 			outp = p
    287 			inCopyStart, outCopyStart = i.p, outp
    288 			if i.info.ccc < prevCC {
    289 				goto doNorm
    290 			}
    291 			continue
    292 		} else if r := i.rb.src.hangul(i.p); r != 0 {
    293 			outp = decomposeHangul(i.buf[:], r)
    294 			i.p += hangulUTF8Size
    295 			inCopyStart, outCopyStart = i.p, outp
    296 			if i.p >= i.rb.nsrc {
    297 				i.setDone()
    298 				break
    299 			} else if i.rb.src.hangul(i.p) != 0 {
    300 				i.next = nextHangul
    301 				return i.buf[:outp]
    302 			}
    303 		} else {
    304 			p := outp + sz
    305 			if p > len(i.buf) {
    306 				break
    307 			}
    308 			outp = p
    309 			i.p += sz
    310 		}
    311 		if i.p >= i.rb.nsrc {
    312 			i.setDone()
    313 			break
    314 		}
    315 		prevCC := i.info.tccc
    316 		i.info = i.rb.f.info(i.rb.src, i.p)
    317 		if v := i.rb.ss.next(i.info); v == ssStarter {
    318 			break
    319 		} else if v == ssOverflow {
    320 			i.next = nextCGJDecompose
    321 			break
    322 		}
    323 		if i.info.ccc < prevCC {
    324 			goto doNorm
    325 		}
    326 	}
    327 	if outCopyStart == 0 {
    328 		return i.returnSlice(inCopyStart, i.p)
    329 	} else if inCopyStart < i.p {
    330 		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
    331 	}
    332 	return i.buf[:outp]
    333 doNorm:
    334 	// Insert what we have decomposed so far in the reorderBuffer.
    335 	// As we will only reorder, there will always be enough room.
    336 	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
    337 	i.rb.insertDecomposed(i.buf[0:outp])
    338 	return doNormDecomposed(i)
    339 }
    340 
    341 func doNormDecomposed(i *Iter) []byte {
    342 	for {
    343 		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
    344 		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
    345 			i.setDone()
    346 			break
    347 		}
    348 		i.info = i.rb.f.info(i.rb.src, i.p)
    349 		if i.info.ccc == 0 {
    350 			break
    351 		}
    352 		if s := i.rb.ss.next(i.info); s == ssOverflow {
    353 			i.next = nextCGJDecompose
    354 			break
    355 		}
    356 	}
    357 	// new segment or too many combining characters: exit normalization
    358 	return i.buf[:i.rb.flushCopy(i.buf[:])]
    359 }
    360 
    361 func nextCGJDecompose(i *Iter) []byte {
    362 	i.rb.ss = 0
    363 	i.rb.insertCGJ()
    364 	i.next = nextDecomposed
    365 	i.rb.ss.first(i.info)
    366 	buf := doNormDecomposed(i)
    367 	return buf
    368 }
    369 
    370 // nextComposed is the implementation of Next for forms NFC and NFKC.
    371 func nextComposed(i *Iter) []byte {
    372 	outp, startp := 0, i.p
    373 	var prevCC uint8
    374 	for {
    375 		if !i.info.isYesC() {
    376 			goto doNorm
    377 		}
    378 		prevCC = i.info.tccc
    379 		sz := int(i.info.size)
    380 		if sz == 0 {
    381 			sz = 1 // illegal rune: copy byte-by-byte
    382 		}
    383 		p := outp + sz
    384 		if p > len(i.buf) {
    385 			break
    386 		}
    387 		outp = p
    388 		i.p += sz
    389 		if i.p >= i.rb.nsrc {
    390 			i.setDone()
    391 			break
    392 		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
    393 			i.rb.ss = 0
    394 			i.next = i.asciiF
    395 			break
    396 		}
    397 		i.info = i.rb.f.info(i.rb.src, i.p)
    398 		if v := i.rb.ss.next(i.info); v == ssStarter {
    399 			break
    400 		} else if v == ssOverflow {
    401 			i.next = nextCGJCompose
    402 			break
    403 		}
    404 		if i.info.ccc < prevCC {
    405 			goto doNorm
    406 		}
    407 	}
    408 	return i.returnSlice(startp, i.p)
    409 doNorm:
    410 	// reset to start position
    411 	i.p = startp
    412 	i.info = i.rb.f.info(i.rb.src, i.p)
    413 	i.rb.ss.first(i.info)
    414 	if i.info.multiSegment() {
    415 		d := i.info.Decomposition()
    416 		info := i.rb.f.info(input{bytes: d}, 0)
    417 		i.rb.insertUnsafe(input{bytes: d}, 0, info)
    418 		i.multiSeg = d[int(info.size):]
    419 		i.next = nextMultiNorm
    420 		return nextMultiNorm(i)
    421 	}
    422 	i.rb.ss.first(i.info)
    423 	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
    424 	return doNormComposed(i)
    425 }
    426 
    427 func doNormComposed(i *Iter) []byte {
    428 	// First rune should already be inserted.
    429 	for {
    430 		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
    431 			i.setDone()
    432 			break
    433 		}
    434 		i.info = i.rb.f.info(i.rb.src, i.p)
    435 		if s := i.rb.ss.next(i.info); s == ssStarter {
    436 			break
    437 		} else if s == ssOverflow {
    438 			i.next = nextCGJCompose
    439 			break
    440 		}
    441 		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
    442 	}
    443 	i.rb.compose()
    444 	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
    445 	return seg
    446 }
    447 
    448 func nextCGJCompose(i *Iter) []byte {
    449 	i.rb.ss = 0 // instead of first
    450 	i.rb.insertCGJ()
    451 	i.next = nextComposed
    452 	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
    453 	// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
    454 	// If we ever change that, insert a check here.
    455 	i.rb.ss.first(i.info)
    456 	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
    457 	return doNormComposed(i)
    458 }