parse.go (3630B)
1 package gemtextparser 2 3 import ( 4 "fmt" 5 "net/url" 6 "strings" 7 ) 8 9 func stripLeadingWhiteSpace(text string) string { 10 11 for len(text) > 0 { 12 if text[0] == ' ' || text[0] == '\t' { 13 if len(text) > 1 { 14 text = text[1:] 15 } else { 16 text = "" 17 return text 18 } 19 } else { 20 return text 21 } 22 } 23 24 return text 25 } 26 27 func strip(s string) string { 28 var result strings.Builder 29 for i := 0; i < len(s); i++ { 30 b := s[i] 31 if ('a' <= b && b <= 'z') || 32 ('A' <= b && b <= 'Z') || 33 ('0' <= b && b <= '9') || 34 b == ' ' { 35 result.WriteByte(b) 36 } 37 } 38 return result.String() 39 } 40 41 func StemmedDocument(body string) []string { 42 words := ParseWords(body) 43 results := []string{} 44 for _, word := range words { 45 rl := []rune(word) 46 word := string(Stem(rl)) 47 results = append(results, word) 48 } 49 50 return results 51 } 52 53 // This is only gemtext because it doesn't parse "words" that exist in pre-formatted sections 54 // because that's frequently ascii art and such. 55 56 func ParseWords(body string) []string { 57 58 lines := strings.Split(body, "\n") 59 60 escaped := false 61 escape := "```" 62 results := []string{} 63 64 for _, item := range lines { 65 66 if len(item) >= 3 && strings.Compare(escape, item[:3]) == 0 { 67 escaped = !escaped 68 } 69 70 // Keep link name, remove gemini:// stuff from it 71 if len(item) > 3 { 72 if item[0] == '=' && item[1] == '>' { 73 74 link := stripLeadingWhiteSpace(item[2:]) 75 76 indexOfSpace := strings.Index(link, " ") 77 indexOfTab := strings.Index(link, "\t") 78 79 indexOfSpaceOrTab := len(link) 80 81 if indexOfSpace != -1 { 82 indexOfSpaceOrTab = indexOfSpace 83 } 84 if indexOfTab != -1 { 85 if indexOfTab < indexOfSpace || indexOfSpace == -1 { 86 indexOfSpaceOrTab = indexOfTab 87 } 88 } 89 90 item = link[indexOfSpaceOrTab:] 91 92 } 93 } 94 95 if !escaped { 96 item = strip(item) 97 results = append(results, strings.Fields(item)...) 98 continue 99 } 100 101 } 102 103 return results 104 } 105 106 func ParseLinks(body string, currentUrl string) []string { 107 108 base, err := url.Parse(currentUrl) 109 110 if err != nil { 111 panic(err) 112 } 113 114 lines := strings.Split(body, "\n") 115 116 links := []string{} 117 118 escaped := false 119 escape := "```" 120 121 for _, item := range lines { 122 123 // must start with escape characters, the rest doesn't matter to us 124 // > Any line whose first three characters are "```" (...) are preformatted toggle lines 125 if len(item) >= 3 && strings.Compare(escape, item[:3]) == 0 { 126 escaped = !escaped 127 } 128 129 if len(item) > 3 && !escaped { 130 if item[0] == '=' && item[1] == '>' { 131 132 // sometimes links end with a \r, but that isn't valid so we won't allow it 133 links = append(links, stripLeadingWhiteSpace(item[2:])) 134 } 135 } 136 137 } 138 139 geminiLinks := []string{} 140 141 for _, item := range links { 142 143 // this is for finding the text associated with the link 144 145 indexOfSpace := strings.Index(item, " ") 146 indexOfTab := strings.Index(item, "\t") 147 148 // default if there aren't any 149 indexOfSpaceOrTab := len(item) 150 151 if indexOfSpace != -1 { 152 indexOfSpaceOrTab = indexOfSpace 153 } 154 155 if indexOfTab != -1 { 156 if indexOfTab < indexOfSpace || indexOfSpace == -1 { 157 indexOfSpaceOrTab = indexOfTab 158 } 159 } 160 161 if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 { 162 geminiLinks = append(geminiLinks, item[0:indexOfSpaceOrTab]) 163 } 164 165 // there are urls that aren't relative that don't have // like like mailto: and monero: 166 if strings.Contains(item, ":") == false { 167 // relative link 168 169 u, err := url.Parse(item[0:indexOfSpaceOrTab]) 170 171 if err != nil { 172 fmt.Printf("Unable to parse link: %s, %s\n", item, err) 173 continue 174 } 175 176 geminiLinks = append(geminiLinks, base.ResolveReference(u).String()) 177 } 178 } 179 180 return geminiLinks 181 }