information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

web_search.py (3313B)


      1 import requests
      2 from smolagents import Tool
      3 
      4 class WebSearchTool(Tool):
      5     name = "web_search"
      6     description = "Performs a web search for a query and returns a string of the top search results formatted as markdown with titles, links, and descriptions."
      7     inputs = {"query": {"type": "string", "description": "The search query to perform."}}
      8     output_type = "string"
      9     def forward(self, query: str) -> str:
     10         src_params = {
     11             'q': query,
     12             'format': 'json'
     13         }
     14         search_url = 'https://searx.laack.co/search'
     15         
     16         try:
     17             response = requests.get(search_url, params=src_params)
     18             response.raise_for_status()
     19             res_list = response.json()['results']
     20         except (requests.RequestException, KeyError) as e:
     21             return f"Search failed: {e}"
     22         
     23         markdown_results = []
     24         for result in res_list:
     25             title = result.get('title', 'No title')
     26             url = result.get('url', '')
     27             content = result.get('content', 'No description')
     28             markdown_results.append(f"### [{title}]({url})\n{content}\n")
     29         
     30         return "\n".join(markdown_results) if markdown_results else "No results found."
     31 
     32 import requests
     33 from smolagents import Tool
     34 from bs4 import BeautifulSoup
     35 from pypdf import PdfReader
     36 from io import BytesIO
     37 
     38 class WebVisitTool(Tool):
     39     name = "visit_webpage"
     40     description = "Visits a webpage or PDF at the given URL and returns its text content. Supports HTML pages and PDF documents."
     41     inputs = {"url": {"type": "string", "description": "The URL of the webpage or PDF to visit."}}
     42     output_type = "string"
     43     
     44     def forward(self, url: str) -> str:
     45         headers = {
     46             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
     47         }
     48         
     49         try:
     50             response = requests.get(url, headers=headers, timeout=15)
     51             response.raise_for_status()
     52             
     53             content_type = response.headers.get('Content-Type', '').lower()
     54             
     55             if 'application/pdf' in content_type or url.lower().endswith('.pdf'):
     56                 return self._parse_pdf(response.content)
     57             
     58             return self._parse_html(response.text)
     59             
     60         except requests.RequestException as e:
     61             return f"Failed to fetch URL: {e}"
     62     
     63     def _parse_pdf(self, content: bytes) -> str:
     64         try:
     65             reader = PdfReader(BytesIO(content))
     66             text_parts = []
     67             for page in reader.pages:
     68                 text_parts.append(page.extract_text() or "")
     69             text = "\n".join(text_parts)
     70             return self._truncate(text)
     71         except Exception as e:
     72             return f"Failed to parse PDF: {e}"
     73     
     74     def _parse_html(self, html: str) -> str:
     75         soup = BeautifulSoup(html, 'html.parser')
     76         for element in soup(['script', 'style', 'nav', 'footer', 'header']):
     77             element.decompose()
     78         text = soup.get_text(separator='\n', strip=True)
     79         return self._truncate(text)
     80     
     81     def _truncate(self, text: str, max_chars: int = 15000) -> str:
     82         if len(text) > max_chars:
     83             return text[:max_chars] + "\n\n[Content truncated...]"
     84         return text if text else "No readable content found."