gemini-search

A simple search engine for Geminispace
git clone git://git.laack.co/gemini-search.git
Log | Files | Refs | README

parse.go (3630B)


      1 package gemtextparser
      2 
      3 import (
      4 	"fmt"
      5 	"net/url"
      6 	"strings"
      7 )
      8 
      9 func stripLeadingWhiteSpace(text string) string {
     10 
     11 	for len(text) > 0 {
     12 		if text[0] == ' ' || text[0] == '\t' {
     13 			if len(text) > 1 {
     14 				text = text[1:]
     15 			} else {
     16 				text = ""
     17 				return text
     18 			}
     19 		} else {
     20 			return text
     21 		}
     22 	}
     23 
     24 	return text
     25 }
     26 
     27 func strip(s string) string {
     28 	var result strings.Builder
     29 	for i := 0; i < len(s); i++ {
     30 		b := s[i]
     31 		if ('a' <= b && b <= 'z') ||
     32 			('A' <= b && b <= 'Z') ||
     33 			('0' <= b && b <= '9') ||
     34 			b == ' ' {
     35 			result.WriteByte(b)
     36 		}
     37 	}
     38 	return result.String()
     39 }
     40 
     41 func StemmedDocument(body string) []string {
     42 	words := ParseWords(body)
     43 	results := []string{}
     44 	for _, word := range words {
     45 		rl := []rune(word)
     46 		word := string(Stem(rl))
     47 		results = append(results, word)
     48 	}
     49 
     50 	return results
     51 }
     52 
     53 // This is only gemtext because it doesn't parse "words" that exist in pre-formatted sections
     54 // because that's frequently ascii art and such.
     55 
     56 func ParseWords(body string) []string {
     57 
     58 	lines := strings.Split(body, "\n")
     59 
     60 	escaped := false
     61 	escape := "```"
     62 	results := []string{}
     63 
     64 	for _, item := range lines {
     65 
     66 		if len(item) >= 3 && strings.Compare(escape, item[:3]) == 0 {
     67 			escaped = !escaped
     68 		}
     69 
     70 		// Keep link name, remove gemini:// stuff from it
     71 		if len(item) > 3 {
     72 			if item[0] == '=' && item[1] == '>' {
     73 
     74 				link := stripLeadingWhiteSpace(item[2:])
     75 
     76 				indexOfSpace := strings.Index(link, " ")
     77 				indexOfTab := strings.Index(link, "\t")
     78 
     79 				indexOfSpaceOrTab := len(link)
     80 
     81 				if indexOfSpace != -1 {
     82 					indexOfSpaceOrTab = indexOfSpace
     83 				}
     84 				if indexOfTab != -1 {
     85 					if indexOfTab < indexOfSpace || indexOfSpace == -1 {
     86 						indexOfSpaceOrTab = indexOfTab
     87 					}
     88 				}
     89 
     90 				item = link[indexOfSpaceOrTab:]
     91 
     92 			}
     93 		}
     94 
     95 		if !escaped {
     96 			item = strip(item)
     97 			results = append(results, strings.Fields(item)...)
     98 			continue
     99 		}
    100 
    101 	}
    102 
    103 	return results
    104 }
    105 
    106 func ParseLinks(body string, currentUrl string) []string {
    107 
    108 	base, err := url.Parse(currentUrl)
    109 
    110 	if err != nil {
    111 		panic(err)
    112 	}
    113 
    114 	lines := strings.Split(body, "\n")
    115 
    116 	links := []string{}
    117 
    118 	escaped := false
    119 	escape := "```"
    120 
    121 	for _, item := range lines {
    122 
    123 		// must start with escape characters, the rest doesn't matter to us
    124 		// > Any line whose first three characters are "```" (...) are preformatted toggle lines
    125 		if len(item) >= 3 && strings.Compare(escape, item[:3]) == 0 {
    126 			escaped = !escaped
    127 		}
    128 
    129 		if len(item) > 3 && !escaped {
    130 			if item[0] == '=' && item[1] == '>' {
    131 
    132 				// sometimes links end with a \r, but that isn't valid so we won't allow it
    133 				links = append(links, stripLeadingWhiteSpace(item[2:]))
    134 			}
    135 		}
    136 
    137 	}
    138 
    139 	geminiLinks := []string{}
    140 
    141 	for _, item := range links {
    142 
    143 		// this is for finding the text associated with the link
    144 
    145 		indexOfSpace := strings.Index(item, " ")
    146 		indexOfTab := strings.Index(item, "\t")
    147 
    148 		// default if there aren't any
    149 		indexOfSpaceOrTab := len(item)
    150 
    151 		if indexOfSpace != -1 {
    152 			indexOfSpaceOrTab = indexOfSpace
    153 		}
    154 
    155 		if indexOfTab != -1 {
    156 			if indexOfTab < indexOfSpace || indexOfSpace == -1 {
    157 				indexOfSpaceOrTab = indexOfTab
    158 			}
    159 		}
    160 
    161 		if len(item) >= 10 && strings.Compare(item[:9], "gemini://") == 0 {
    162 			geminiLinks = append(geminiLinks, item[0:indexOfSpaceOrTab])
    163 		}
    164 
    165 		// there are urls that aren't relative that don't have // like like mailto: and monero:
    166 		if strings.Contains(item, ":") == false {
    167 			// relative link
    168 
    169 			u, err := url.Parse(item[0:indexOfSpaceOrTab])
    170 
    171 			if err != nil {
    172 				fmt.Printf("Unable to parse link: %s, %s\n", item, err)
    173 				continue
    174 			}
    175 
    176 			geminiLinks = append(geminiLinks, base.ResolveReference(u).String())
    177 		}
    178 	}
    179 
    180 	return geminiLinks
    181 }