spider.py (9597B)
1 # TODO: Create another database (it should be unique in case it needs to be flushed, it is basically tmp after all) 2 # that is a priority queue for links to be parsed. This requires consideration for how we determine what should be 3 # searched and when it should be searched. 4 5 # MVP: Create db to store links to follow. Add to the bottom and take from the top. 6 7 # Q: How should we handle the query timelines? 8 # A: We have a day check for selections, but I don't like this. We should instead be 9 # using a queue to remove this additional logic. We can reuse that to populate the queue, but that is 10 # unrelated. 11 12 import urllib.robotparser 13 from urllib.parse import urlparse 14 import urllib.request 15 import requests 16 import os 17 import datetime 18 import uuid 19 from urllib.parse import urljoin, urlparse 20 from concurrent.futures import ThreadPoolExecutor, as_completed 21 from bs4 import BeautifulSoup 22 import sys 23 import sqlite3 24 from prune import process_file 25 import time 26 27 # Layout: 28 # - sites 29 # - date 30 # - each file is one site with a UUID as the filename 31 # - database 32 # - manifest.db 33 # this is the database that maps UUIDs with urls and dates 34 # the urls might not be unique as we could have multiple copies of the same site from different times 35 # no, this is not 3nf, no I don't care, this is faster. 36 # tables: 37 # site 38 # url, filepath, date, indexed 39 # tf 40 # TODO 41 # - urls.db 42 # - this is only for url lookups. this table can be considered ephemeral 43 # tables: 44 # url, priority, (possible add distance from authority here as well, uncertain) 45 46 47 # there seems to be a memory leak somewhere which is limited by max workers, but this really bugs me. 
# Upper bound, in bytes, on the UTF-8 size of a page that will be written to disk.
MAX_SIZE = 2_000_000
# Size of the crawl thread pool and of each batch popped from the link queue.
MAX_WORKERS = 50
# Maximum number of outgoing links collected from a single page.
MAX_URLS_PER_SITE = 100
# Values stored in the `indexed` column of the `site` table.
NOT_INDEXED = 0
INDEXED = 1
# A url with a `site` row newer than this many days is not queued again.
REINDEX_FREQUENCY_DAYS = 7


# if seconds weight == 1 then 3600 for backlink weight means
# each additional backlink equates to a decrease of 1 hour

BACKLINK_WEIGHT = 3600
SECONDS_WEIGHT = 1


def should_queue(url, cur):
    """Return True when `url` has no recent entry in the `site` table.

    Args:
        url: The url to look up.
        cur: A cursor on the database holding the `site` table; its `date`
            column is compared against an epoch-seconds cutoff.

    Returns:
        False if at least one `site` row for `url` has a `date` within the
        last REINDEX_FREQUENCY_DAYS days, True otherwise.
    """
    cutoff = time.time() - (REINDEX_FREQUENCY_DAYS * 86400)
    cur.execute("""
        SELECT 1 FROM site WHERE url = ? AND date > ? LIMIT 1
    """, (url, cutoff))
    return cur.fetchone() is None


def is_allowed(url, user_agent, timeout=1):
    """Check the host's robots.txt to see whether `user_agent` may fetch `url`.

    Args:
        url: The page that is about to be crawled.
        user_agent: The user agent string matched against robots.txt rules.
        timeout: Seconds to wait for the robots.txt download.

    Returns:
        The verdict of the parsed robots.txt. This check is best-effort and
        fails open: if robots.txt cannot be downloaded or processed for any
        reason, True is returned.
    """
    try:
        parsed = urlparse(url)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        with urllib.request.urlopen(robots_url, timeout=timeout) as response:
            raw = response.read()
        # Decode leniently: with a strict decode a single non-UTF-8 byte
        # (e.g. a Latin-1 comment) raises, and the handler below would then
        # discard every rule in the file and report the url as allowed.
        rp.parse(raw.decode('utf-8', errors='replace').splitlines())
        return rp.can_fetch(user_agent, url)
    except Exception:
        # Deliberately broad: an unreachable or missing robots.txt must not
        # stop the crawl.
        return True


# you should always respect robots.txt, but if you are trying to do something with this spider I guess you can
# disable it. please don't do this en-masse though, that's naughty.
# TODO: Check the size with a HEAD request prior to reading into memory.
def _extract_links(soup, url):
    """Collect up to MAX_URLS_PER_SITE http(s) links that leave the page `url`.

    Fragments are stripped before a link is stored, so `page#a` and `page#b`
    count as one target and the crawler does not fetch the same document once
    per anchor.
    """
    current_url_without_fragment = urlparse(url)._replace(fragment='').geturl()
    links = set()

    for anchor in soup.find_all('a', href=True):
        # Nothing more can be added once the cap is hit, so stop scanning.
        if len(links) >= MAX_URLS_PER_SITE:
            break

        href = anchor.get('href')
        if href.startswith('#'):
            continue

        try:
            parsed = urlparse(urljoin(url, href))
        except ValueError:
            # urllib raises ValueError for malformed hrefs (e.g. an unmatched
            # '[' in the host). One bad link must not abort the whole page.
            continue

        if parsed.scheme not in ('http', 'https'):
            continue

        url_without_fragment = parsed._replace(fragment='').geturl()
        if url_without_fragment != current_url_without_fragment:
            links.add(url_without_fragment)

    return links


def search_url(url, filepath, respect_robots_txt=True):
    """Fetch `url`, store its prettified HTML at `filepath` and collect links.

    Args:
        url: The page to download.
        filepath: Where the prettified HTML is written.
        respect_robots_txt: When True, `is_allowed` is consulted first and a
            disallowed url is skipped.

    Returns:
        A tuple `(filepath, url, links)` where `links` is a set of absolute
        http(s) urls found on the page. `("", "", set())` is returned when the
        page is disallowed by robots.txt, the response status is not ok, the
        content type is not HTML, `process_file` reports that it deleted the
        file, or any exception occurs while fetching or writing.
    """
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0'
    if respect_robots_txt and not is_allowed(url, user_agent):
        print(f"Can't crawl {url} due to robots.txt violation")
        return "", "", set()

    headers = {
        'User-Agent': user_agent,
    }

    try:
        source_code = requests.get(url, headers=headers, timeout=1)  # natural limit to file size in memory
        if not source_code.ok:
            print(f'Status code not 2xx for {url}, returning.')
            return "", "", set()

        content_type = source_code.headers.get('Content-Type', '')
        if 'text/html' not in content_type:
            print(f'Content type for {url} not html, returning.')
            return "", "", set()

        soup = BeautifulSoup(source_code.content, 'html.parser')
        content = soup.prettify()

        if len(content.encode('utf-8')) < MAX_SIZE:
            # Write as UTF-8 explicitly so the bytes on disk match the size
            # check above and do not depend on the platform default encoding.
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f'Wrote {url} to {filepath}')
            deleted = process_file(filepath)

            # we don't want to spider from bad sites.
            # process_file does some regexp checks on the site to see if it is short / bad in some other way.
            if deleted:
                return "", "", set()
        else:
            # NOTE(review): filepath is still returned below even though
            # nothing was written, so the caller records a manifest row for a
            # file that does not exist -- confirm whether that is intended.
            print(f'skipping fs write for {url}, too large')
    except Exception as e:
        print(e)
        return "", "", set()

    # find all links < max_urls_per_site that direct to a different page.
    return filepath, url, _extract_links(soup, url)


# pop links so multiple processes can run concurrently
def get_links(num_links, cur_link, con_links, backlink_weight, seconds_weight):
    """Remove and return the highest-priority urls from the `link` queue.

    Priority is `backlink_count * backlink_weight + seconds_weight * age`,
    where age is the number of seconds since the row's `added` timestamp; the
    largest values are popped first.

    Args:
        num_links: Maximum number of urls to pop.
        cur_link: Cursor on the database holding the `link` table.
        con_links: Connection owning `cur_link`; committed after the delete.
        backlink_weight: Multiplier applied to `backlink_count`.
        seconds_weight: Multiplier applied to the age in seconds.

    Returns:
        A set with the popped urls (empty when the queue is empty).
    """
    # this is kind of bad... if there is a concurrent spider running and it picks up the same
    # link that is currently being processed, it will get added to the queue right after
    # being popped, but before being written to the sites table...

    # TODO: Add mutex to fix issue here

    cur_link.execute("""
        DELETE FROM link
        WHERE url IN (
            SELECT url FROM link
            ORDER BY (backlink_count * ?) + ( ? * (? - added)) DESC
            LIMIT ?
        )
        RETURNING url
    """, (backlink_weight, seconds_weight, int(time.time()), num_links))

    urls = {row[0] for row in cur_link.fetchall()}
    con_links.commit()
    return urls


def main():
    """Run the crawl loop until the link queue is exhausted.

    An optional single command-line argument names a seed file with one url
    per line; those urls form the first batch. Afterwards batches of up to
    MAX_WORKERS urls are popped from `database/urls.db`, fetched in a thread
    pool, recorded in `database/manifest.db`, and their outgoing links are
    queued.
    """
    seed_filename = sys.argv[1] if len(sys.argv) == 2 else ""

    # sqlite3.connect() does not create missing parent directories.
    os.makedirs('database', exist_ok=True)

    con = sqlite3.connect('database/manifest.db', timeout=60)
    con_links = None
    try:
        con.execute('PRAGMA journal_mode=WAL')
        cur = con.cursor()

        cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date, indexed, language)")
        cur.execute("CREATE INDEX IF NOT EXISTS idx_site_url ON site(url)")
        cur.execute("CREATE INDEX IF NOT EXISTS idx_site_indexed ON site(indexed)")
        cur.execute("CREATE INDEX IF NOT EXISTS idx_site_filepath ON site(filepath)")
        # This index previously targeted `filepath` (copy-paste slip). Because
        # of IF NOT EXISTS, databases created before the fix keep the old
        # definition until the index is dropped manually.
        cur.execute("CREATE INDEX IF NOT EXISTS idx_site_language ON site(language)")

        con_links = sqlite3.connect('database/urls.db', timeout=60)
        con_links.execute('PRAGMA journal_mode=WAL')
        cur_link = con_links.cursor()
        cur_link.execute("CREATE TABLE IF NOT EXISTS link(url UNIQUE, backlink_count, added)")
        cur_link.execute("CREATE INDEX IF NOT EXISTS idx_link_url ON link(url)")
        cur_link.execute("CREATE INDEX IF NOT EXISTS idx_link_added ON link(added)")
        cur_link.execute("CREATE INDEX IF NOT EXISTS idx_link_backlink_count ON link(backlink_count)")

        urls = set()

        if seed_filename != "":
            with open(seed_filename, 'r') as f:
                # Blank lines would otherwise be queued as the empty url.
                urls = {line.strip() for line in f if line.strip()}
            print(f"Loaded seed file with {len(urls)} urls")
        save_location = 'sites/'

        # TODO: better stopping. only stops when all links have been traversed
        while True:
            if not urls:
                urls = get_links(MAX_WORKERS, cur_link, con_links, BACKLINK_WEIGHT, SECONDS_WEIGHT)
                if not urls:
                    print("NO MORE QUEUED LINKS TO SEARCH, EXITING")
                    break
                print(f"Loaded {len(urls)} urls from queue")

            with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
                now = datetime.datetime.now()
                pth = save_location + now.strftime("%Y-%m-%d") + "/"
                os.makedirs(pth, exist_ok=True)
                futures = {
                    executor.submit(search_url, url, pth + str(uuid.uuid4())): url
                    for url in urls
                }

                for future in as_completed(futures):
                    filepath, url, links = future.result()
                    if filepath == '' or url == '':
                        continue

                    # insert into site list
                    cur.execute("""
                        INSERT INTO site (url, filepath, date, indexed)
                        VALUES (?, ?, ?, ?)
                    """, (url, filepath, int(datetime.datetime.now().timestamp()), NOT_INDEXED))
                    con.commit()

                    for link in links:
                        # TODO: Make priority better
                        # link(url UNIQUE, backlink_count, added)
                        if should_queue(link, cur):
                            cur_link.execute("""
                                INSERT INTO link VALUES (?, 1, ?)
                                ON CONFLICT(url) DO UPDATE SET backlink_count = backlink_count + 1
                            """, (link, int(datetime.datetime.now().timestamp())))
                        else:
                            print(f"Skipping '{link}' for indexing")
                    # One commit per crawled page instead of one per link.
                    con_links.commit()
            urls = set()
    finally:
        # Closing a connection also releases its cursors.
        if con_links is not None:
            con_links.close()
        con.close()


if __name__ == "__main__":
    main()