blog

Personal blog
git clone git://git.laack.co/blog.git
Log | Files | Refs

duration.py (1416B)


      1 import pandas as pd
      2 import matplotlib.pyplot as plt
      3 import os
      4 
      5 files = [os.path.join(dp, f) for dp, _, filenames in os.walk("results") for f in filenames if os.path.splitext(f)[1] == '.csv']
      6 durations = []
      7 seen_urls = set()
      8 
      9 for file in files:
     10     print(f'Processing {file}')
     11     df = pd.read_csv(f'{file}', low_memory=False)
     12     
     13     if 'url' in df.columns:
     14         initial_count = len(df)
     15         
     16         df = df[~df['url'].isin(seen_urls)]
     17         
     18         df = df.drop_duplicates(subset=['url'], keep='first')
     19         
     20         new_urls = df['url'].dropna()
     21         seen_urls.update(new_urls)
     22         
     23         print(f"Removed {initial_count - len(df)} duplicate URLs")
     24     else:
     25         print("Warning: 'url' column not found in this file")
     26     
     27     duration = df['duration_seconds']
     28     
     29     duration = duration.dropna()
     30     
     31     duration = pd.to_numeric(duration, errors='coerce')
     32     
     33     duration = duration.dropna()
     34     
     35     duration = duration[duration == duration.astype(int)]
     36     
     37     duration = duration.astype(int)
     38     
     39     duration_list = duration.to_list()
     40     durations.extend(duration_list)
     41     print(f"Processed {len(duration_list)} valid integer rows")
     42 
     43 print(f"Total unique URLs processed: {len(seen_urls)}")
     44 print(f"Mean: {sum(durations) / len(durations)}")
     45 print(f"Total valid durations: {len(durations)}")
     46 plt.hist(durations, range=(0, 1000), bins=100)
     47 plt.savefig("durations_histogram.png")