information-retrieval

Exploration of information retrieval topics
git clone git://git.laack.co/information-retrieval.git
Log | Files | Refs

spider.py (9597B)


      1 # TODO: Create another database (it should be unique in case it needs to be flushed, it is basically tmp after all)
      2 # that is a priority queue for links to be parsed. This requires consideration for how we determine what should be
      3 # searched and when it should be searched.
      4 
      5 # MVP: Create db to store links to follow. Add to the bottom and take from the top.
      6 
      7 # Q: How should we handle the query timelines? 
      8 # A: We have a day check for selections, but I don't like this. We should instead be 
      9 # using a queue to remove this additional logic. We can reuse that to populate the queue, but that is
     10 # unrelated.
     11 
     12 import urllib.robotparser
     13 from urllib.parse import urlparse
     14 import urllib.request
     15 import requests
     16 import os
     17 import datetime
     18 import uuid
     19 from urllib.parse import urljoin, urlparse
     20 from concurrent.futures import ThreadPoolExecutor, as_completed
     21 from bs4 import BeautifulSoup
     22 import sys
     23 import sqlite3
     24 from prune import process_file
     25 import time
     26 
     27 # Layout:
     28 # - sites
     29     # - date
     30         # - each file is one site with a UUID as the filename
     31 # - database
     32     # - manifest.db
     33         # this is the database that maps UUIDs with urls and dates
     34         # the urls might not be unique as we could have multiple copies of the same site from different times
     35             # no, this is not 3nf, no I don't care, this is faster.
     36         # tables:
     37             # site
     38                 # url, filepath, date, indexed
     39             # tf
     40                 # TODO
     41     #   - urls.db
     42             # - this is only for url lookups. this table can be considered ephemeral
     43             # tables:
     44                 # url, priority, (possible add distance from authority here as well, uncertain)
     45 
     46 
     47 # there seems to be a memory leak somewhere which is limited by max workers, but this really bugs me.
     48 
     49 
# bytes
# Upper bound on the UTF-8 size of a page's prettified HTML; search_url only
# writes pages below this size to disk.
MAX_SIZE = 2_000_000
# Thread pool size per crawl batch; also the number of urls popped from the
# link queue for each batch.
MAX_WORKERS = 50
# Cap on the number of outgoing links collected from a single page.
MAX_URLS_PER_SITE = 100
# NOT_INDEXED is what the spider stores in the `indexed` column of the site
# table for a freshly crawled page. INDEXED is presumably set by a separate
# indexer -- it is not used in this file.
NOT_INDEXED = 0
INDEXED = 1
# should_queue refuses to re-queue a url while the site table holds a copy of
# it newer than this many days.
REINDEX_FREQUENCY_DAYS = 7


# Queue priority is (backlink_count * BACKLINK_WEIGHT) + SECONDS_WEIGHT * (seconds spent in the queue).
# if seconds weight == 1 then 3600 for backlink weight means
# each additional backlink equates to a decrease of 1 hour

BACKLINK_WEIGHT = 3600
SECONDS_WEIGHT = 1
     64 
     65 def should_queue(url, cur):
     66     cutoff = time.time() - (REINDEX_FREQUENCY_DAYS * 86400)
     67     cur.execute("""
     68     SELECT 1 FROM site WHERE url = ? AND date > ? LIMIT 1
     69     """, (url, cutoff))
     70     return cur.fetchone() is None
     71 
     72 def is_allowed(url, user_agent, timeout=1):
     73     try:
     74         parsed = urlparse(url)
     75         robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
     76         rp = urllib.robotparser.RobotFileParser()
     77         rp.set_url(robots_url)
     78         with urllib.request.urlopen(robots_url, timeout=timeout) as response:
     79             rp.parse(response.read().decode('utf-8').splitlines())
     80         return rp.can_fetch(user_agent, url)
     81     except Exception:
     82         return True
     83 
     84 # you should always respect robots.txt, but if you are trying to do something with this spider I guess you can
     85 # disable it. please don't do this en-masse though, that's naughty.
     86 # TODO: Check the size with a HEAD request prior to reading into memory. 
     87 def search_url(url, filepath, respect_robots_txt=True):
     88     user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0'
     89     if respect_robots_txt:
     90         if not is_allowed(url,user_agent):
     91             print(f"Can't crawl {url} due to robots.txt violation")
     92             return "", "", set()
     93 
     94     links = set()
     95 
     96     headers = {
     97         'User-Agent': user_agent,
     98     }
     99 
    100     try:
    101         source_code = requests.get(url, headers=headers, timeout=1) # natural limit to file size in memory
    102         if not source_code.ok:
    103             print(f'Status code not 2xx for {url}, returning.')
    104             return "", "", set()
    105         
    106         content_type = source_code.headers.get('Content-Type', '')
    107         if 'text/html' not in content_type:
    108             print(f'Content type for {url} not html, returning.')
    109             return "", "", set()
    110 
    111         soup = BeautifulSoup(source_code.content, 'html.parser')
    112         content = soup.prettify()
    113 
    114         if len(content.encode('utf-8')) < MAX_SIZE:
    115             with open(filepath, 'w') as f:
    116                 f.write(content)
    117                 print(f'Wrote {url} to {filepath}')
    118             deleted = process_file(filepath)
    119 
    120             # we don't want to spider from bad sites.
    121             # process_file does some regexp checks on the site to see if it is short / bad in some other way.
    122             if deleted:
    123                 return "", "", set()
    124         else:
    125             print(f'skipping fs write for {url}, too large')
    126     except Exception as e:
    127         print(e)
    128         return "", "", set()
    129 
    130     current_url_without_fragment = urlparse(url)._replace(fragment='').geturl()
    131 
    132     # find all links < max_urls_per_site that direct to a different page.
    133     for link in soup.find_all('a', href=True):
    134             href = link.get('href')
    135 
    136             if href.startswith('#'):
    137                 continue
    138 
    139             absolute_url = urljoin(url, href)
    140             parsed = urlparse(absolute_url)
    141 
    142             url_without_fragment = parsed._replace(fragment='').geturl()
    143             
    144             if url_without_fragment == current_url_without_fragment:
    145                 continue
    146 
    147             if parsed.scheme in ('http', 'https'):
    148                 if len(links) < MAX_URLS_PER_SITE:
    149                     links.add(absolute_url)
    150 
    151     return filepath, url, links
    152 
    153 # pop links so multiple processes can run concurrently
    154 def get_links(num_links, cur_link, con_links, backlink_weight, seconds_weight):
    155 
    156     # this is kind of bad... if there is a concurrent spider running and it picks up the same
    157     # link that is currently being processed, it will get added to the queue right after
    158     # being popped, but before being written to the sites table...
    159 
    160     # TODO: Add mutex to fix issue here
    161 
    162     cur_link.execute("""
    163         DELETE FROM link 
    164         WHERE url IN (
    165             SELECT url FROM link 
    166             ORDER BY (backlink_count * ?) + ( ? * (? - added)) DESC
    167             LIMIT ?
    168         )
    169         RETURNING url
    170     """, (backlink_weight, seconds_weight, int(time.time()), num_links))
    171 
    172     urls = {row[0] for row in cur_link.fetchall()}
    173     con_links.commit()
    174     return urls
    175 
    176 
    177 if __name__ == "__main__":
    178     seed_filename = ""
    179     if len(sys.argv) == 2:
    180         seed_filename = sys.argv[1]
    181     con = sqlite3.connect('database/manifest.db', timeout=60)
    182     con.execute('PRAGMA journal_mode=WAL')
    183     cur = con.cursor()
    184 
    185     cur.execute("CREATE TABLE IF NOT EXISTS site(url, filepath, date, indexed, language)")
    186     cur.execute("CREATE INDEX IF NOT EXISTS idx_site_url ON site(url)")
    187     cur.execute("CREATE INDEX IF NOT EXISTS idx_site_indexed ON site(indexed)")
    188     cur.execute("CREATE INDEX IF NOT EXISTS idx_site_filepath ON site(filepath)")
    189     cur.execute("CREATE INDEX IF NOT EXISTS idx_site_language ON site(filepath)")
    190 
    191 
    192     con_links = sqlite3.connect('database/urls.db', timeout=60)
    193     con_links.execute('PRAGMA journal_mode=WAL')
    194     cur_link = con_links.cursor()
    195     cur_link.execute("CREATE TABLE IF NOT EXISTS link(url UNIQUE, backlink_count, added)")
    196     cur_link.execute("CREATE INDEX IF NOT EXISTS idx_link_url ON link(url)")
    197     cur_link.execute("CREATE INDEX IF NOT EXISTS idx_link_added ON link(added)")
    198     cur_link.execute("CREATE INDEX IF NOT EXISTS idx_link_backlink_count ON link(backlink_count)")
    199 
    200     urls = set()
    201 
    202     if seed_filename != "":
    203         urlLs = []
    204         with open(seed_filename, 'r') as f:
    205             urlLs = f.readlines()
    206         for i in range(len(urlLs)):
    207             urls.add(urlLs[i].strip())
    208         print(f"Loaded seed file with {len(urls)} urls")
    209     save_location = 'sites/'
    210 
    211     # TODO: better stopping. only stops when all links have been traversed
    212     while True:
    213         if len(urls) == 0:
    214             urls = get_links(MAX_WORKERS, cur_link, con_links, BACKLINK_WEIGHT, SECONDS_WEIGHT)
    215             if len(urls) == 0:
    216                 print("NO MORE QUEUED LINKS TO SEARCH, EXITING")
    217                 break
    218             print(f"Loaded {len(urls)} urls from queue")
    219 
    220         with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    221             now = datetime.datetime.now()
    222             pth = save_location + now.strftime("%Y-%m-%d") + "/" 
    223             os.makedirs(pth, exist_ok=True)
    224             futures = {
    225                 executor.submit(search_url, url, pth+ str(uuid.uuid4())): url
    226                 for url in urls
    227             }
    228             
    229             for future in as_completed(futures):
    230                 filepath, url, links = future.result()
    231                 if filepath != '' and url != '':
    232                     # insert into site list
    233                     cur.execute("""
    234                         INSERT INTO site (url, filepath, date, indexed) 
    235                         VALUES (?, ?, ?, ?)
    236                     """, (url, filepath, int(datetime.datetime.now().timestamp()), NOT_INDEXED))
    237                     con.commit()
    238                 for link in links:
    239                     # TODO: Make priority better, also speed this up with transactions
    240                     # link(url UNIQUE, backlink_count, added)
    241                     if should_queue(link, cur):
    242                         cur_link.execute("""
    243                             INSERT INTO link VALUES (?, 1, ?)
    244                             ON CONFLICT(url) DO UPDATE SET backlink_count = backlink_count + 1
    245                         """, (link, int(datetime.datetime.now().timestamp())))
    246                         con_links.commit()
    247                     else:
    248                         print(f"Skipping '{link}' for indexing")
    249         urls = set()
    250 
    251 
    252     cur_link.close()
    253 
    254     con_links.close()
    255     cur.close()
    256     con.close()