stemmer.go (16163B)
1 // SOURCE: https://github.com/reiver/go-porterstemmer/blob/master/porterstemmer.go 2 // ATTRIBUTION: Created by Charles Iliya Krempeaux - reiver on GitHub 3 // porter stemmer 4 5 package gemtextparser 6 7 import ( 8 "unicode" 9 ) 10 11 func isConsonant(s []rune, i int) bool { 12 13 //DEBUG 14 //log.Printf("isConsonant: [%+v]", string(s[i])) 15 16 result := true 17 18 switch s[i] { 19 case 'a', 'e', 'i', 'o', 'u': 20 result = false 21 case 'y': 22 if 0 == i { 23 result = true 24 } else { 25 result = !isConsonant(s, i-1) 26 } 27 default: 28 result = true 29 } 30 31 return result 32 } 33 34 func measure(s []rune) uint { 35 36 // Initialize. 37 lenS := len(s) 38 result := uint(0) 39 i := 0 40 41 // Short Circuit. 42 if 0 == lenS { 43 /////////// RETURN 44 return result 45 } 46 47 // Ignore (potential) consonant sequence at the beginning of word. 48 for isConsonant(s, i) { 49 50 //DEBUG 51 //log.Printf("[measure([%s])] Eat Consonant [%d] -> [%s]", string(s), i, string(s[i])) 52 53 i++ 54 if i >= lenS { 55 /////////////// RETURN 56 return result 57 } 58 } 59 60 // For each pair of a vowel sequence followed by a consonant sequence, increment result. 61 Outer: 62 for i < lenS { 63 64 for !isConsonant(s, i) { 65 66 //DEBUG 67 //log.Printf("[measure([%s])] VOWEL [%d] -> [%s]", string(s), i, string(s[i])) 68 69 i++ 70 if i >= lenS { 71 /////////// BREAK 72 break Outer 73 } 74 } 75 for isConsonant(s, i) { 76 77 //DEBUG 78 //log.Printf("[measure([%s])] CONSONANT [%d] -> [%s]", string(s), i, string(s[i])) 79 80 i++ 81 if i >= lenS { 82 result++ 83 /////////// BREAK 84 break Outer 85 } 86 } 87 result++ 88 } 89 90 // Return 91 return result 92 } 93 94 func hasSuffix(s, suffix []rune) bool { 95 96 lenSMinusOne := len(s) - 1 97 lenSuffixMinusOne := len(suffix) - 1 98 99 if lenSMinusOne <= lenSuffixMinusOne { 100 return false 101 } else if s[lenSMinusOne] != suffix[lenSuffixMinusOne] { // I suspect checking this first should speed this function up in practice. 102 /////// RETURN 103 return false 104 } else { 105 106 for i := 0; i < lenSuffixMinusOne; i++ { 107 108 if suffix[i] != s[lenSMinusOne-lenSuffixMinusOne+i] { 109 /////////////// RETURN 110 return false 111 } 112 113 } 114 115 } 116 117 return true 118 } 119 120 func containsVowel(s []rune) bool { 121 122 lenS := len(s) 123 124 for i := 0; i < lenS; i++ { 125 126 if !isConsonant(s, i) { 127 /////////// RETURN 128 return true 129 } 130 131 } 132 133 return false 134 } 135 136 func hasRepeatDoubleConsonantSuffix(s []rune) bool { 137 138 // Initialize. 139 lenS := len(s) 140 141 result := false 142 143 // Do it! 144 if 2 > lenS { 145 result = false 146 } else if s[lenS-1] == s[lenS-2] && isConsonant(s, lenS-1) { // Will using isConsonant() cause a problem with "YY"? 147 result = true 148 } else { 149 result = false 150 } 151 152 // Return, 153 return result 154 } 155 156 func hasConsonantVowelConsonantSuffix(s []rune) bool { 157 158 // Initialize. 159 lenS := len(s) 160 161 result := false 162 163 // Do it! 164 if 3 > lenS { 165 result = false 166 } else if isConsonant(s, lenS-3) && !isConsonant(s, lenS-2) && isConsonant(s, lenS-1) { 167 result = true 168 } else { 169 result = false 170 } 171 172 // Return 173 return result 174 } 175 176 func step1a(s []rune) []rune { 177 178 // Initialize. 179 var result []rune = s 180 181 lenS := len(s) 182 183 // Do it! 184 if suffix := []rune("sses"); hasSuffix(s, suffix) { 185 186 lenTrim := 2 187 188 subSlice := s[:lenS-lenTrim] 189 190 result = subSlice 191 } else if suffix := []rune("ies"); hasSuffix(s, suffix) { 192 lenTrim := 2 193 194 subSlice := s[:lenS-lenTrim] 195 196 result = subSlice 197 } else if suffix := []rune("ss"); hasSuffix(s, suffix) { 198 199 result = s 200 } else if suffix := []rune("s"); hasSuffix(s, suffix) { 201 202 lenSuffix := 1 203 204 subSlice := s[:lenS-lenSuffix] 205 206 result = subSlice 207 } 208 209 // Return. 210 return result 211 } 212 213 func step1b(s []rune) []rune { 214 215 // Initialize. 216 var result []rune = s 217 218 lenS := len(s) 219 220 // Do it! 221 if suffix := []rune("eed"); hasSuffix(s, suffix) { 222 lenSuffix := len(suffix) 223 224 subSlice := s[:lenS-lenSuffix] 225 226 m := measure(subSlice) 227 228 if 0 < m { 229 lenTrim := 1 230 231 result = s[:lenS-lenTrim] 232 } 233 } else if suffix := []rune("ed"); hasSuffix(s, suffix) { 234 lenSuffix := len(suffix) 235 236 subSlice := s[:lenS-lenSuffix] 237 238 if containsVowel(subSlice) { 239 240 if suffix2 := []rune("at"); hasSuffix(subSlice, suffix2) { 241 lenTrim := -1 242 243 result = s[:lenS-lenSuffix-lenTrim] 244 } else if suffix2 := []rune("bl"); hasSuffix(subSlice, suffix2) { 245 lenTrim := -1 246 247 result = s[:lenS-lenSuffix-lenTrim] 248 } else if suffix2 := []rune("iz"); hasSuffix(subSlice, suffix2) { 249 lenTrim := -1 250 251 result = s[:lenS-lenSuffix-lenTrim] 252 } else if c := subSlice[len(subSlice)-1]; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) { 253 lenTrim := 1 254 255 lenSubSlice := len(subSlice) 256 257 result = subSlice[:lenSubSlice-lenTrim] 258 } else if c := subSlice[len(subSlice)-1]; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c { 259 lenTrim := -1 260 261 result = s[:lenS-lenSuffix-lenTrim] 262 263 result[len(result)-1] = 'e' 264 } else { 265 result = subSlice 266 } 267 268 } 269 } else if suffix := []rune("ing"); hasSuffix(s, suffix) { 270 lenSuffix := len(suffix) 271 272 subSlice := s[:lenS-lenSuffix] 273 274 if containsVowel(subSlice) { 275 276 if suffix2 := []rune("at"); hasSuffix(subSlice, suffix2) { 277 lenTrim := -1 278 279 result = s[:lenS-lenSuffix-lenTrim] 280 281 result[len(result)-1] = 'e' 282 } else if suffix2 := []rune("bl"); hasSuffix(subSlice, suffix2) { 283 lenTrim := -1 284 285 result = s[:lenS-lenSuffix-lenTrim] 286 287 result[len(result)-1] = 'e' 288 } else if suffix2 := []rune("iz"); hasSuffix(subSlice, suffix2) { 289 lenTrim := -1 290 291 result = s[:lenS-lenSuffix-lenTrim] 292 293 result[len(result)-1] = 'e' 294 } else if c := subSlice[len(subSlice)-1]; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) { 295 lenTrim := 1 296 297 lenSubSlice := len(subSlice) 298 299 result = subSlice[:lenSubSlice-lenTrim] 300 } else if c := subSlice[len(subSlice)-1]; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c { 301 lenTrim := -1 302 303 result = s[:lenS-lenSuffix-lenTrim] 304 305 result[len(result)-1] = 'e' 306 } else { 307 result = subSlice 308 } 309 310 } 311 } 312 313 // Return. 314 return result 315 } 316 317 func step1c(s []rune) []rune { 318 319 // Initialize. 320 lenS := len(s) 321 322 result := s 323 324 // Do it! 325 if 2 > lenS { 326 /////////// RETURN 327 return result 328 } 329 330 if 'y' == s[lenS-1] && containsVowel(s[:lenS-1]) { 331 332 result[lenS-1] = 'i' 333 334 } else if 'Y' == s[lenS-1] && containsVowel(s[:lenS-1]) { 335 336 result[lenS-1] = 'I' 337 338 } 339 340 // Return. 341 return result 342 } 343 344 func step2(s []rune) []rune { 345 346 // Initialize. 347 lenS := len(s) 348 349 result := s 350 351 // Do it! 352 if suffix := []rune("ational"); hasSuffix(s, suffix) { 353 if 0 < measure(s[:lenS-len(suffix)]) { 354 result[lenS-5] = 'e' 355 result = result[:lenS-4] 356 } 357 } else if suffix := []rune("tional"); hasSuffix(s, suffix) { 358 if 0 < measure(s[:lenS-len(suffix)]) { 359 result = result[:lenS-2] 360 } 361 } else if suffix := []rune("enci"); hasSuffix(s, suffix) { 362 if 0 < measure(s[:lenS-len(suffix)]) { 363 result[lenS-1] = 'e' 364 } 365 } else if suffix := []rune("anci"); hasSuffix(s, suffix) { 366 if 0 < measure(s[:lenS-len(suffix)]) { 367 result[lenS-1] = 'e' 368 } 369 } else if suffix := []rune("izer"); hasSuffix(s, suffix) { 370 if 0 < measure(s[:lenS-len(suffix)]) { 371 result = s[:lenS-1] 372 } 373 } else if suffix := []rune("bli"); hasSuffix(s, suffix) { // --DEPARTURE-- 374 // } else if suffix := []rune("abli") ; hasSuffix(s, suffix) { 375 if 0 < measure(s[:lenS-len(suffix)]) { 376 result[lenS-1] = 'e' 377 } 378 } else if suffix := []rune("alli"); hasSuffix(s, suffix) { 379 if 0 < measure(s[:lenS-len(suffix)]) { 380 result = s[:lenS-2] 381 } 382 } else if suffix := []rune("entli"); hasSuffix(s, suffix) { 383 if 0 < measure(s[:lenS-len(suffix)]) { 384 result = s[:lenS-2] 385 } 386 } else if suffix := []rune("eli"); hasSuffix(s, suffix) { 387 if 0 < measure(s[:lenS-len(suffix)]) { 388 result = s[:lenS-2] 389 } 390 } else if suffix := []rune("ousli"); hasSuffix(s, suffix) { 391 if 0 < measure(s[:lenS-len(suffix)]) { 392 result = s[:lenS-2] 393 } 394 } else if suffix := []rune("ization"); hasSuffix(s, suffix) { 395 if 0 < measure(s[:lenS-len(suffix)]) { 396 result[lenS-5] = 'e' 397 398 result = s[:lenS-4] 399 } 400 } else if suffix := []rune("ation"); hasSuffix(s, suffix) { 401 if 0 < measure(s[:lenS-len(suffix)]) { 402 result[lenS-3] = 'e' 403 404 result = s[:lenS-2] 405 } 406 } else if suffix := []rune("ator"); hasSuffix(s, suffix) { 407 if 0 < measure(s[:lenS-len(suffix)]) { 408 result[lenS-2] = 'e' 409 410 result = s[:lenS-1] 411 } 412 } else if suffix := []rune("alism"); hasSuffix(s, suffix) { 413 if 0 < measure(s[:lenS-len(suffix)]) { 414 result = s[:lenS-3] 415 } 416 } else if suffix := []rune("iveness"); hasSuffix(s, suffix) { 417 if 0 < measure(s[:lenS-len(suffix)]) { 418 result = s[:lenS-4] 419 } 420 } else if suffix := []rune("fulness"); hasSuffix(s, suffix) { 421 if 0 < measure(s[:lenS-len(suffix)]) { 422 result = s[:lenS-4] 423 } 424 } else if suffix := []rune("ousness"); hasSuffix(s, suffix) { 425 if 0 < measure(s[:lenS-len(suffix)]) { 426 result = s[:lenS-4] 427 } 428 } else if suffix := []rune("aliti"); hasSuffix(s, suffix) { 429 if 0 < measure(s[:lenS-len(suffix)]) { 430 result = s[:lenS-3] 431 } 432 } else if suffix := []rune("iviti"); hasSuffix(s, suffix) { 433 if 0 < measure(s[:lenS-len(suffix)]) { 434 result[lenS-3] = 'e' 435 436 result = result[:lenS-2] 437 } 438 } else if suffix := []rune("biliti"); hasSuffix(s, suffix) { 439 if 0 < measure(s[:lenS-len(suffix)]) { 440 result[lenS-5] = 'l' 441 result[lenS-4] = 'e' 442 443 result = result[:lenS-3] 444 } 445 } else if suffix := []rune("logi"); hasSuffix(s, suffix) { // --DEPARTURE-- 446 if 0 < measure(s[:lenS-len(suffix)]) { 447 lenTrim := 1 448 449 result = s[:lenS-lenTrim] 450 } 451 } 452 453 // Return. 454 return result 455 } 456 457 func step3(s []rune) []rune { 458 459 // Initialize. 460 lenS := len(s) 461 result := s 462 463 // Do it! 464 if suffix := []rune("icate"); hasSuffix(s, suffix) { 465 lenSuffix := len(suffix) 466 467 if 0 < measure(s[:lenS-lenSuffix]) { 468 result = result[:lenS-3] 469 } 470 } else if suffix := []rune("ative"); hasSuffix(s, suffix) { 471 lenSuffix := len(suffix) 472 473 subSlice := s[:lenS-lenSuffix] 474 475 m := measure(subSlice) 476 477 if 0 < m { 478 result = subSlice 479 } 480 } else if suffix := []rune("alize"); hasSuffix(s, suffix) { 481 lenSuffix := len(suffix) 482 483 if 0 < measure(s[:lenS-lenSuffix]) { 484 result = result[:lenS-3] 485 } 486 } else if suffix := []rune("iciti"); hasSuffix(s, suffix) { 487 lenSuffix := len(suffix) 488 489 if 0 < measure(s[:lenS-lenSuffix]) { 490 result = result[:lenS-3] 491 } 492 } else if suffix := []rune("ical"); hasSuffix(s, suffix) { 493 lenSuffix := len(suffix) 494 495 if 0 < measure(s[:lenS-lenSuffix]) { 496 result = result[:lenS-2] 497 } 498 } else if suffix := []rune("ful"); hasSuffix(s, suffix) { 499 lenSuffix := len(suffix) 500 501 subSlice := s[:lenS-lenSuffix] 502 503 m := measure(subSlice) 504 505 if 0 < m { 506 result = subSlice 507 } 508 } else if suffix := []rune("ness"); hasSuffix(s, suffix) { 509 lenSuffix := len(suffix) 510 511 subSlice := s[:lenS-lenSuffix] 512 513 m := measure(subSlice) 514 515 if 0 < m { 516 result = subSlice 517 } 518 } 519 520 // Return. 521 return result 522 } 523 524 func step4(s []rune) []rune { 525 526 // Initialize. 527 lenS := len(s) 528 result := s 529 530 // Do it! 531 if suffix := []rune("al"); hasSuffix(s, suffix) { 532 lenSuffix := len(suffix) 533 534 subSlice := s[:lenS-lenSuffix] 535 536 m := measure(subSlice) 537 538 if 1 < m { 539 result = result[:lenS-lenSuffix] 540 } 541 } else if suffix := []rune("ance"); hasSuffix(s, suffix) { 542 lenSuffix := len(suffix) 543 544 subSlice := s[:lenS-lenSuffix] 545 546 m := measure(subSlice) 547 548 if 1 < m { 549 result = result[:lenS-lenSuffix] 550 } 551 } else if suffix := []rune("ence"); hasSuffix(s, suffix) { 552 lenSuffix := len(suffix) 553 554 subSlice := s[:lenS-lenSuffix] 555 556 m := measure(subSlice) 557 558 if 1 < m { 559 result = result[:lenS-lenSuffix] 560 } 561 } else if suffix := []rune("er"); hasSuffix(s, suffix) { 562 lenSuffix := len(suffix) 563 564 subSlice := s[:lenS-lenSuffix] 565 566 m := measure(subSlice) 567 568 if 1 < m { 569 result = subSlice 570 } 571 } else if suffix := []rune("ic"); hasSuffix(s, suffix) { 572 lenSuffix := len(suffix) 573 574 subSlice := s[:lenS-lenSuffix] 575 576 m := measure(subSlice) 577 578 if 1 < m { 579 result = subSlice 580 } 581 } else if suffix := []rune("able"); hasSuffix(s, suffix) { 582 lenSuffix := len(suffix) 583 584 subSlice := s[:lenS-lenSuffix] 585 586 m := measure(subSlice) 587 588 if 1 < m { 589 result = subSlice 590 } 591 } else if suffix := []rune("ible"); hasSuffix(s, suffix) { 592 lenSuffix := len(suffix) 593 594 subSlice := s[:lenS-lenSuffix] 595 596 m := measure(subSlice) 597 598 if 1 < m { 599 result = subSlice 600 } 601 } else if suffix := []rune("ant"); hasSuffix(s, suffix) { 602 lenSuffix := len(suffix) 603 604 subSlice := s[:lenS-lenSuffix] 605 606 m := measure(subSlice) 607 608 if 1 < m { 609 result = subSlice 610 } 611 } else if suffix := []rune("ement"); hasSuffix(s, suffix) { 612 lenSuffix := len(suffix) 613 614 subSlice := s[:lenS-lenSuffix] 615 616 m := measure(subSlice) 617 618 if 1 < m { 619 result = subSlice 620 } 621 } else if suffix := []rune("ment"); hasSuffix(s, suffix) { 622 lenSuffix := len(suffix) 623 624 subSlice := s[:lenS-lenSuffix] 625 626 m := measure(subSlice) 627 628 if 1 < m { 629 result = subSlice 630 } 631 } else if suffix := []rune("ent"); hasSuffix(s, suffix) { 632 lenSuffix := len(suffix) 633 634 subSlice := s[:lenS-lenSuffix] 635 636 m := measure(subSlice) 637 638 if 1 < m { 639 result = subSlice 640 } 641 } else if suffix := []rune("ion"); hasSuffix(s, suffix) { 642 lenSuffix := len(suffix) 643 644 subSlice := s[:lenS-lenSuffix] 645 646 m := measure(subSlice) 647 648 c := subSlice[len(subSlice)-1] 649 650 if 1 < m && ('s' == c || 't' == c) { 651 result = subSlice 652 } 653 } else if suffix := []rune("ou"); hasSuffix(s, suffix) { 654 lenSuffix := len(suffix) 655 656 subSlice := s[:lenS-lenSuffix] 657 658 m := measure(subSlice) 659 660 if 1 < m { 661 result = subSlice 662 } 663 } else if suffix := []rune("ism"); hasSuffix(s, suffix) { 664 lenSuffix := len(suffix) 665 666 subSlice := s[:lenS-lenSuffix] 667 668 m := measure(subSlice) 669 670 if 1 < m { 671 result = subSlice 672 } 673 } else if suffix := []rune("ate"); hasSuffix(s, suffix) { 674 lenSuffix := len(suffix) 675 676 subSlice := s[:lenS-lenSuffix] 677 678 m := measure(subSlice) 679 680 if 1 < m { 681 result = subSlice 682 } 683 } else if suffix := []rune("iti"); hasSuffix(s, suffix) { 684 lenSuffix := len(suffix) 685 686 subSlice := s[:lenS-lenSuffix] 687 688 m := measure(subSlice) 689 690 if 1 < m { 691 result = subSlice 692 } 693 } else if suffix := []rune("ous"); hasSuffix(s, suffix) { 694 lenSuffix := len(suffix) 695 696 subSlice := s[:lenS-lenSuffix] 697 698 m := measure(subSlice) 699 700 if 1 < m { 701 result = subSlice 702 } 703 } else if suffix := []rune("ive"); hasSuffix(s, suffix) { 704 lenSuffix := len(suffix) 705 706 subSlice := s[:lenS-lenSuffix] 707 708 m := measure(subSlice) 709 710 if 1 < m { 711 result = subSlice 712 } 713 } else if suffix := []rune("ize"); hasSuffix(s, suffix) { 714 lenSuffix := len(suffix) 715 716 subSlice := s[:lenS-lenSuffix] 717 718 m := measure(subSlice) 719 720 if 1 < m { 721 result = subSlice 722 } 723 } 724 725 // Return. 726 return result 727 } 728 729 func step5a(s []rune) []rune { 730 731 // Initialize. 732 lenS := len(s) 733 result := s 734 735 // Do it! 736 if 'e' == s[lenS-1] { 737 lenSuffix := 1 738 739 subSlice := s[:lenS-lenSuffix] 740 if len(subSlice) == 0 { 741 return result 742 } 743 m := measure(subSlice) 744 745 if 1 < m { 746 result = subSlice 747 } else if 1 == m { 748 if c := subSlice[len(subSlice)-1]; !(hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c) { 749 result = subSlice 750 } 751 } 752 } 753 754 // Return. 755 return result 756 } 757 758 func step5b(s []rune) []rune { 759 760 // Initialize. 761 lenS := len(s) 762 result := s 763 764 // Do it! 765 if 2 < lenS && 'l' == s[lenS-2] && 'l' == s[lenS-1] { 766 767 lenSuffix := 1 768 769 subSlice := s[:lenS-lenSuffix] 770 771 m := measure(subSlice) 772 773 if 1 < m { 774 result = subSlice 775 } 776 } 777 778 // Return. 779 return result 780 } 781 782 func StemString(s string) string { 783 784 // Convert string to []rune 785 runeArr := []rune(s) 786 787 // Stem. 788 runeArr = Stem(runeArr) 789 790 // Convert []rune to string 791 str := string(runeArr) 792 793 // Return. 794 return str 795 } 796 797 func Stem(s []rune) []rune { 798 799 // Initialize. 800 lenS := len(s) 801 802 // Short circuit. 803 if 0 == lenS { 804 /////////// RETURN 805 return s 806 } 807 808 // Make all runes lowercase. 809 for i := 0; i < lenS; i++ { 810 s[i] = unicode.ToLower(s[i]) 811 } 812 813 // Stem 814 result := StemWithoutLowerCasing(s) 815 816 // Return. 817 return result 818 } 819 820 func StemWithoutLowerCasing(s []rune) []rune { 821 822 // Initialize. 823 lenS := len(s) 824 825 // Words that are of length 2 or less is already stemmed. 826 // Don't do anything. 827 if 2 >= lenS { 828 /////////// RETURN 829 return s 830 } 831 832 // Stem 833 s = step1a(s) 834 s = step1b(s) 835 s = step1c(s) 836 s = step2(s) 837 s = step3(s) 838 s = step4(s) 839 s = step5a(s) 840 s = step5b(s) 841 842 // Return. 843 return s 844 }