import os
import time
from contextlib import contextmanager

import aiohttp
import fsspec.implementations.http as fshttp
import intake
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr
from pelicanfs.core import PelicanFileSystem, PelicanMap, OSDFFileSystem

# import dask
# from dask_jobqueue import PBSCluster
# from dask.distributed import Client
# from dask.distributed import performance_report

# Year bounds, kept as strings (not referenced in the visible part of this file).
init_year0 = '1991'
init_year1 = '2020'
final_year0 = '2071'
final_year1 = '2100'

# File paths
rda_url = 'https://data.rda.ucar.edu/'
database_num = 'd345001'
cam6_dart_url = rda_url + database_num
# Intake-ESM catalog files: one lists HTTPS paths, the other OSDF paths.
https_catalog = cam6_dart_url + '/catalogs/https/' + database_num + '-https-zarr.json'
osdf_catalog = cam6_dart_url + '/catalogs/osdf/' + database_num + '-osdf-zarr.json'

# Open intake catalog
# Open both catalogs and list the zarr store paths each one exposes.
df_https_test = intake.open_esm_datastore(https_catalog)
df_https_test.df['path'].values
# Notebook output:
# array(['https://data.rda.ucar.edu/d345001/hourly6/HR.zarr',
#        'https://data.rda.ucar.edu/d345001/hourly6/TSA.zarr',
#        'https://data.rda.ucar.edu/d345001/hourly6/EFLX_LH_TOT.zarr',
#        'https://data.rda.ucar.edu/d345001/hourly6/ER.zarr',
#        'https://data.rda.ucar.edu/d345001/weekly/VS.zarr',
#        'https://data.rda.ucar.edu/d345001/weekly/PS.zarr',
#        'https://data.rda.ucar.edu/d345001/weekly/Q.zarr',
#        'https://data.rda.ucar.edu/d345001/weekly/US.zarr',
#        'https://data.rda.ucar.edu/d345001/weekly/CLDICE.zarr',
#        'https://data.rda.ucar.edu/d345001/weekly/T.zarr',
#        'https://data.rda.ucar.edu/d345001/weekly/CLDLIQ.zarr'],
#       dtype=object)

df_osdf_test = intake.open_esm_datastore(osdf_catalog)
df_osdf_test.df['path'].values
# Notebook output:
# array(['osdf:///ncar/rda/d345001/hourly6/HR.zarr',
#        'osdf:///ncar/rda/d345001/hourly6/TSA.zarr',
#        'osdf:///ncar/rda/d345001/hourly6/EFLX_LH_TOT.zarr',
#        'osdf:///ncar/rda/d345001/hourly6/ER.zarr',
#        'osdf:///ncar/rda/d345001/weekly/VS.zarr',
#        'osdf:///ncar/rda/d345001/weekly/PS.zarr',
#        'osdf:///ncar/rda/d345001/weekly/Q.zarr',
#        'osdf:///ncar/rda/d345001/weekly/US.zarr',
#        'osdf:///ncar/rda/d345001/weekly/CLDICE.zarr',
#        'osdf:///ncar/rda/d345001/weekly/T.zarr',
#        'osdf:///ncar/rda/d345001/weekly/CLDLIQ.zarr'], dtype=object)

# Restrict both catalogs to the single variable used for the speed tests.
data_var = 'PS'
col_subset_https = df_https_test.search(variable=data_var)
col_subset_osdf = df_osdf_test.search(variable=data_var)

dsets_https = col_subset_https.to_dataset_dict(zarr_kwargs={"consolidated": True})
#
print(f"\nDataset dictionary keys:\n {dsets_https.keys()}")
# Load the first dataset and display a summary.
dataset_key = list(dsets_https.keys())[0]
#
ds_https = dsets_https[dataset_key]
# Notebook output: Loading...

dsets_osdf = col_subset_osdf.to_dataset_dict()
# The key taken from the HTTPS dictionary is reused to index the OSDF dictionary.
ds_osdf0 = dsets_osdf[dataset_key]
ds_https0 = dsets_https[dataset_key]
# Notebook output: Loading...

# From here on ds_osdf / ds_https are the PS variable, not the whole dataset.
ds_osdf = ds_osdf0.PS
ds_https = ds_https0.PS
ds_https
# Notebook output: Loading...

# %%time  (Jupyter cell magic; valid only as the first line of a notebook cell)
ds_osdf.isel(member_id=0,time=0).plot()
# Notebook output:
# CPU times: user 3.81 s, sys: 5.02 s, total: 8.83 s
# Wall time: 6.56 s

Data Access Speed tests¶
- We will now test how long it takes to access data (via the OSDF and HTTPS-only protocols) for various sizes, using the array above.
Prepare data subsets¶
# Subsets of increasing size.  Every OSDF subset has an HTTPS twin built with
# the identical selection; the size suffix in each name is nominal.
ds_osdf_1Kb = ds_osdf.isel(lat=0, lon=0, member_id=0).isel(time=np.arange(130))
ds_https_1Kb = ds_https.isel(lat=0, lon=0, member_id=0).isel(time=np.arange(130))
# ds_https_1Kb

ds_osdf_1Mb = ds_osdf.isel(time=0).isel(member_id=1 + np.arange(3))
ds_https_1Mb = ds_https.isel(time=0).isel(member_id=1 + np.arange(3))
# ds_osdf_1Mb

ds_osdf_10Mb = ds_osdf.isel(member_id=4).isel(time=np.arange(24))
ds_https_10Mb = ds_https.isel(member_id=4).isel(time=np.arange(24))
# ds_osdf_10Mb

ds_osdf_100Mb = ds_osdf.isel(member_id=5).isel(time=np.arange(238))
ds_https_100Mb = ds_https.isel(member_id=5).isel(time=np.arange(238))
# ds_osdf_100Mb

ds_osdf_200Mb = ds_osdf.isel(member_id=6)
ds_https_200Mb = ds_https.isel(member_id=6)
# ds_https_200Mb

ds_osdf_400Mb = ds_osdf.isel(member_id=7 + np.arange(2))
ds_https_400Mb = ds_https.isel(member_id=7 + np.arange(2))
# ds_https_400Mb

ds_osdf_600Mb = ds_osdf.isel(member_id=10 + np.arange(3))
ds_https_600Mb = ds_https.isel(member_id=10 + np.arange(3))
# ds_https_600Mb

ds_osdf_800Mb = ds_osdf.isel(member_id=14 + np.arange(4))
ds_https_800Mb = ds_https.isel(member_id=14 + np.arange(4))
# ds_https_800Mb

ds_osdf_1Gb = ds_osdf.isel(member_id=19 + np.arange(6)).isel(time=np.arange(410))
ds_https_1Gb = ds_https.isel(member_id=19 + np.arange(6)).isel(time=np.arange(410))
# ds_osdf_1Gb

# ds_osdf_10Gb = ds_osdf.isel(member_id = 12 + np.arange(52))
# ds_https_10Gb = ds_https.isel(member_id = 12 + np.arange(52))
# ds_osdf_10Gb

# Access data and plot
# Define file path for CSV
csv_file_path = "ncar_benchmark_ap40_nodask.csv"

# Datasets to benchmark, smallest first (the 1Kb subsets are not included).
ds_osdf_list = [ds_osdf_1Mb, ds_osdf_10Mb, ds_osdf_100Mb, ds_osdf_200Mb, ds_osdf_400Mb,
                ds_osdf_600Mb, ds_osdf_800Mb, ds_osdf_1Gb]
ds_https_list = [ds_https_1Mb, ds_https_10Mb, ds_https_100Mb, ds_https_200Mb, ds_https_400Mb,
                 ds_https_600Mb, ds_https_800Mb, ds_https_1Gb]

# Number of data access calls
num_calls = 8 # Modify this as needed
n_workers = 4 # Set this to your preferred number of workers

# DiagnosticTimer class to keep track of runtimes
class DiagnosticTimer:
    """Collect runtimes of code sections together with caller-supplied labels.

    Every completed ``with timer.time(...)`` block appends one record to
    ``self.diagnostics``: the keyword arguments passed to :meth:`time` plus a
    ``"runtime"`` entry holding the elapsed seconds.
    """

    def __init__(self):
        # One dict per completed timed section, in execution order.
        self.diagnostics = []

    @contextmanager
    def time(self, **kwargs):
        """Time the body of a ``with`` block and store the result.

        Args:
            **kwargs: Labels stored alongside the measured runtime.

        If the body raises, the exception propagates and no record is added.
        """
        # perf_counter is monotonic, so a system clock adjustment during the
        # timed section cannot distort (or negate) the measured interval.
        tic = time.perf_counter()
        yield
        kwargs["runtime"] = time.perf_counter() - tic
        self.diagnostics.append(kwargs)

    def dataframe(self):
        """Return all records collected so far as a DataFrame, one row each."""
        return pd.DataFrame(self.diagnostics)
# Initialize the DiagnosticTimer
diag_timer = DiagnosticTimer()

# Function to check existing CSV file and determine missing runs
def load_existing_results(path=None):
    """Load previously recorded benchmark results from a CSV checkpoint.

    Args:
        path: CSV file to read.  Defaults to the module-level ``csv_file_path``.

    Returns:
        The CSV contents as a DataFrame.  When the file does not exist, or
        exists but holds no data at all, an empty DataFrame with the columns
        ``dataset_size``, ``protocol``, ``call_number``, ``runtime`` and
        ``MBps`` is returned instead.
    """
    if path is None:
        path = csv_file_path
    try:
        # Attempt the read directly: no window between an existence check and
        # the read, and a zero-byte file no longer aborts the benchmark.
        return pd.read_csv(path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return pd.DataFrame(
            columns=["dataset_size", "protocol", "call_number", "runtime", "MBps"]
        )
def filter_missing_runs(datasets, protocol_name, existing_df, n_calls=None):
    """List the (dataset, size, call) combinations that still have to be run.

    Args:
        datasets: Objects exposing ``nbytes``; the size in MB is
            ``nbytes / 1024**2``.
        protocol_name: Only rows of ``existing_df`` with this ``protocol``
            value count as already done.
        existing_df: Earlier results with ``dataset_size``, ``protocol`` and
            ``call_number`` columns.
        n_calls: Calls wanted per dataset (numbered from 1).  Defaults to the
            module-level ``num_calls``.

    Returns:
        A list of ``(dataset, dataset_size_mb, call_number)`` tuples, in
        dataset order and then call order.
    """
    if n_calls is None:
        n_calls = num_calls

    # Narrow to this protocol once instead of re-scanning the whole frame for
    # every (dataset, call) pair.
    recorded = existing_df[existing_df["protocol"] == protocol_name]
    recorded_sizes = pd.to_numeric(
        recorded["dataset_size"], errors="coerce"
    ).to_numpy(dtype=float)
    recorded_calls = pd.to_numeric(
        recorded["call_number"], errors="coerce"
    ).to_numpy(dtype=float)

    filtered_datasets = []
    for dataset in datasets:
        dataset_size_mb = dataset.nbytes / (1024 ** 2)
        # Sizes come back from a CSV round trip, so compare with a tight
        # relative tolerance rather than exact float equality.
        same_size = np.isclose(recorded_sizes, dataset_size_mb, rtol=1e-9, atol=0.0)
        done_calls = set(recorded_calls[same_size])
        filtered_datasets.extend(
            (dataset, dataset_size_mb, call_num)
            for call_num in range(1, n_calls + 1)
            if call_num not in done_calls
        )
    return filtered_datasets


def benchmark_protocol(datasets, protocol_name, cluster=None, client=None, timer=None):
    """Time loading each dataset into memory and append the results to the CSV.

    Runs already present in the CSV (see ``load_existing_results`` and
    ``filter_missing_runs``) are skipped, so an interrupted benchmark can be
    resumed.  After every call one row is appended to ``csv_file_path``.

    Args:
        datasets: Objects providing ``nbytes``, ``copy()`` and ``load()``.
        protocol_name: Label written to the ``protocol`` column.
        cluster: Optional Dask cluster.  When given it is scaled to 0 and back
            to ``n_workers`` before every call.
        client: Dask client used to wait for the workers.  Needed only with
            ``cluster``; falls back to a module-level ``client`` if one exists.
        timer: ``DiagnosticTimer`` receiving the measurements.  Defaults to
            the module-level ``diag_timer`` so that inspecting it after the
            benchmark shows the runs; a fresh timer is used if none exists.

    Raises:
        ValueError: ``cluster`` was given but no client is available.
    """
    if cluster is not None and client is None:
        client = globals().get("client")
        if client is None:
            # Fail before any timing starts instead of hitting a NameError
            # in the middle of the benchmark loop.
            raise ValueError(
                "benchmark_protocol: a Dask 'client' is required when 'cluster' is given"
            )

    if timer is None:
        # Previously a fresh local timer shadowed the module-level one, which
        # therefore stayed empty after the benchmark.
        timer = globals().get("diag_timer")
        if timer is None:
            timer = DiagnosticTimer()

    existing_df = load_existing_results()  # Load existing results as a checkpoint
    # Filter for missing runs based on existing results
    missing_runs = filter_missing_runs(datasets, protocol_name, existing_df)

    # Process each dataset and call
    for dataset, dataset_size_mb, call_num in missing_runs:
        # Restart the Dask cluster if provided
        if cluster is not None:
            cluster.scale(0)  # Scale down to release worker memory
            cluster.scale(n_workers)  # Scale up to required number of workers
            client.wait_for_workers(n_workers)  # Wait for workers to be ready

        # Inform the start of processing for this dataset and call
        print(f"Starting processing of dataset for protocol '{protocol_name}' (Size: {dataset_size_mb} MB) in call {call_num}")

        # Only count the time for loading dataset into memory
        dataset_copy = dataset.copy()
        with timer.time(dataset_size=dataset_size_mb, protocol=protocol_name, call_number=call_num):
            dataset_copy.load()  # Load the dataset into memory

        # Build the one-row result from the newest record only, instead of
        # converting the whole diagnostics history on every iteration.
        call_result_df = pd.DataFrame([timer.diagnostics[-1]])
        call_result_df["MBps"] = call_result_df["dataset_size"] / call_result_df["runtime"]

        # Append this call's result to CSV (header only when creating the file)
        call_result_df.to_csv(csv_file_path, mode='a', header=not os.path.exists(csv_file_path), index=False)
        print(f"Appended results for protocol '{protocol_name}', call {call_num} to '{csv_file_path}'")

        # Print statement after finishing each call
        print(f"Finished processing dataset for protocol '{protocol_name}' in call {call_num}")
# Run benchmark for each protocol
# (cluster=None: the Dask cluster restart step inside benchmark_protocol is skipped;
# each call's result is appended to the CSV by benchmark_protocol itself)
benchmark_protocol(ds_https_list, "HTTPS-only",cluster=None)
benchmark_protocol(ds_osdf_list, "OSDF-director",cluster=None)
# Convert diagnostics to a DataFrame for analysis
df_diagnostics = diag_timer.dataframe()
Starting processing of dataset for protocol 'HTTPS-only' (Size: 596.109375 MB) in call 7
Appended results for protocol 'HTTPS-only', call 7 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 7
Starting processing of dataset for protocol 'HTTPS-only' (Size: 596.109375 MB) in call 8
Appended results for protocol 'HTTPS-only', call 8 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 8
Starting processing of dataset for protocol 'HTTPS-only' (Size: 794.8125 MB) in call 1
Appended results for protocol 'HTTPS-only', call 1 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 1
Starting processing of dataset for protocol 'HTTPS-only' (Size: 794.8125 MB) in call 2
Appended results for protocol 'HTTPS-only', call 2 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 2
Starting processing of dataset for protocol 'HTTPS-only' (Size: 794.8125 MB) in call 3
Appended results for protocol 'HTTPS-only', call 3 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 3
Starting processing of dataset for protocol 'HTTPS-only' (Size: 794.8125 MB) in call 4
Appended results for protocol 'HTTPS-only', call 4 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 4
Starting processing of dataset for protocol 'HTTPS-only' (Size: 794.8125 MB) in call 5
Appended results for protocol 'HTTPS-only', call 5 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 5
Starting processing of dataset for protocol 'HTTPS-only' (Size: 794.8125 MB) in call 6
Appended results for protocol 'HTTPS-only', call 6 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 6
Starting processing of dataset for protocol 'HTTPS-only' (Size: 794.8125 MB) in call 7
Appended results for protocol 'HTTPS-only', call 7 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 7
Starting processing of dataset for protocol 'HTTPS-only' (Size: 794.8125 MB) in call 8
Appended results for protocol 'HTTPS-only', call 8 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 8
Starting processing of dataset for protocol 'HTTPS-only' (Size: 1037.8125 MB) in call 1
Appended results for protocol 'HTTPS-only', call 1 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 1
Starting processing of dataset for protocol 'HTTPS-only' (Size: 1037.8125 MB) in call 2
Appended results for protocol 'HTTPS-only', call 2 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 2
Starting processing of dataset for protocol 'HTTPS-only' (Size: 1037.8125 MB) in call 3
Appended results for protocol 'HTTPS-only', call 3 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 3
Starting processing of dataset for protocol 'HTTPS-only' (Size: 1037.8125 MB) in call 4
Appended results for protocol 'HTTPS-only', call 4 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 4
Starting processing of dataset for protocol 'HTTPS-only' (Size: 1037.8125 MB) in call 5
Appended results for protocol 'HTTPS-only', call 5 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 5
Starting processing of dataset for protocol 'HTTPS-only' (Size: 1037.8125 MB) in call 6
Appended results for protocol 'HTTPS-only', call 6 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 6
Starting processing of dataset for protocol 'HTTPS-only' (Size: 1037.8125 MB) in call 7
Appended results for protocol 'HTTPS-only', call 7 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 7
Starting processing of dataset for protocol 'HTTPS-only' (Size: 1037.8125 MB) in call 8
Appended results for protocol 'HTTPS-only', call 8 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'HTTPS-only' in call 8
Starting processing of dataset for protocol 'OSDF-director' (Size: 1.265625 MB) in call 1
Appended results for protocol 'OSDF-director', call 1 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 1
Starting processing of dataset for protocol 'OSDF-director' (Size: 1.265625 MB) in call 2
Appended results for protocol 'OSDF-director', call 2 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 2
Starting processing of dataset for protocol 'OSDF-director' (Size: 1.265625 MB) in call 3
Appended results for protocol 'OSDF-director', call 3 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 3
Starting processing of dataset for protocol 'OSDF-director' (Size: 1.265625 MB) in call 4
Appended results for protocol 'OSDF-director', call 4 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 4
Starting processing of dataset for protocol 'OSDF-director' (Size: 1.265625 MB) in call 5
Appended results for protocol 'OSDF-director', call 5 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 5
Starting processing of dataset for protocol 'OSDF-director' (Size: 1.265625 MB) in call 6
Appended results for protocol 'OSDF-director', call 6 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 6
Starting processing of dataset for protocol 'OSDF-director' (Size: 1.265625 MB) in call 7
Appended results for protocol 'OSDF-director', call 7 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 7
Starting processing of dataset for protocol 'OSDF-director' (Size: 1.265625 MB) in call 8
Appended results for protocol 'OSDF-director', call 8 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 8
Starting processing of dataset for protocol 'OSDF-director' (Size: 10.125 MB) in call 1
Appended results for protocol 'OSDF-director', call 1 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 1
Starting processing of dataset for protocol 'OSDF-director' (Size: 10.125 MB) in call 2
Appended results for protocol 'OSDF-director', call 2 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 2
Starting processing of dataset for protocol 'OSDF-director' (Size: 10.125 MB) in call 3
Appended results for protocol 'OSDF-director', call 3 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 3
Starting processing of dataset for protocol 'OSDF-director' (Size: 10.125 MB) in call 4
Appended results for protocol 'OSDF-director', call 4 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 4
Starting processing of dataset for protocol 'OSDF-director' (Size: 10.125 MB) in call 5
Appended results for protocol 'OSDF-director', call 5 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 5
Starting processing of dataset for protocol 'OSDF-director' (Size: 10.125 MB) in call 6
Appended results for protocol 'OSDF-director', call 6 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 6
Starting processing of dataset for protocol 'OSDF-director' (Size: 10.125 MB) in call 7
Appended results for protocol 'OSDF-director', call 7 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 7
Starting processing of dataset for protocol 'OSDF-director' (Size: 10.125 MB) in call 8
Appended results for protocol 'OSDF-director', call 8 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 8
Starting processing of dataset for protocol 'OSDF-director' (Size: 100.40625 MB) in call 1
Appended results for protocol 'OSDF-director', call 1 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 1
Starting processing of dataset for protocol 'OSDF-director' (Size: 100.40625 MB) in call 2
Appended results for protocol 'OSDF-director', call 2 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 2
Starting processing of dataset for protocol 'OSDF-director' (Size: 100.40625 MB) in call 3
Appended results for protocol 'OSDF-director', call 3 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 3
Starting processing of dataset for protocol 'OSDF-director' (Size: 100.40625 MB) in call 4
Appended results for protocol 'OSDF-director', call 4 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 4
Starting processing of dataset for protocol 'OSDF-director' (Size: 100.40625 MB) in call 5
Appended results for protocol 'OSDF-director', call 5 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 5
Starting processing of dataset for protocol 'OSDF-director' (Size: 100.40625 MB) in call 6
Appended results for protocol 'OSDF-director', call 6 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 6
Starting processing of dataset for protocol 'OSDF-director' (Size: 100.40625 MB) in call 7
Appended results for protocol 'OSDF-director', call 7 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 7
Starting processing of dataset for protocol 'OSDF-director' (Size: 100.40625 MB) in call 8
Appended results for protocol 'OSDF-director', call 8 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 8
Starting processing of dataset for protocol 'OSDF-director' (Size: 198.703125 MB) in call 1
Appended results for protocol 'OSDF-director', call 1 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 1
Starting processing of dataset for protocol 'OSDF-director' (Size: 198.703125 MB) in call 2
Appended results for protocol 'OSDF-director', call 2 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 2
Starting processing of dataset for protocol 'OSDF-director' (Size: 198.703125 MB) in call 3
Appended results for protocol 'OSDF-director', call 3 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 3
Starting processing of dataset for protocol 'OSDF-director' (Size: 198.703125 MB) in call 4
Appended results for protocol 'OSDF-director', call 4 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 4
Starting processing of dataset for protocol 'OSDF-director' (Size: 198.703125 MB) in call 5
Appended results for protocol 'OSDF-director', call 5 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 5
Starting processing of dataset for protocol 'OSDF-director' (Size: 198.703125 MB) in call 6
Appended results for protocol 'OSDF-director', call 6 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 6
Starting processing of dataset for protocol 'OSDF-director' (Size: 198.703125 MB) in call 7
Appended results for protocol 'OSDF-director', call 7 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 7
Starting processing of dataset for protocol 'OSDF-director' (Size: 198.703125 MB) in call 8
Appended results for protocol 'OSDF-director', call 8 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 8
Starting processing of dataset for protocol 'OSDF-director' (Size: 397.40625 MB) in call 1
Appended results for protocol 'OSDF-director', call 1 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 1
Starting processing of dataset for protocol 'OSDF-director' (Size: 397.40625 MB) in call 2
Appended results for protocol 'OSDF-director', call 2 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 2
Starting processing of dataset for protocol 'OSDF-director' (Size: 397.40625 MB) in call 3
Appended results for protocol 'OSDF-director', call 3 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 3
Starting processing of dataset for protocol 'OSDF-director' (Size: 397.40625 MB) in call 4
Appended results for protocol 'OSDF-director', call 4 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 4
Starting processing of dataset for protocol 'OSDF-director' (Size: 397.40625 MB) in call 5
Appended results for protocol 'OSDF-director', call 5 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 5
Starting processing of dataset for protocol 'OSDF-director' (Size: 397.40625 MB) in call 6
Appended results for protocol 'OSDF-director', call 6 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 6
Starting processing of dataset for protocol 'OSDF-director' (Size: 397.40625 MB) in call 7
Appended results for protocol 'OSDF-director', call 7 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 7
Starting processing of dataset for protocol 'OSDF-director' (Size: 397.40625 MB) in call 8
Appended results for protocol 'OSDF-director', call 8 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 8
Starting processing of dataset for protocol 'OSDF-director' (Size: 596.109375 MB) in call 1
Appended results for protocol 'OSDF-director', call 1 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 1
Starting processing of dataset for protocol 'OSDF-director' (Size: 596.109375 MB) in call 2
Appended results for protocol 'OSDF-director', call 2 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 2
Starting processing of dataset for protocol 'OSDF-director' (Size: 596.109375 MB) in call 3
Appended results for protocol 'OSDF-director', call 3 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 3
Starting processing of dataset for protocol 'OSDF-director' (Size: 596.109375 MB) in call 4
Appended results for protocol 'OSDF-director', call 4 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 4
Starting processing of dataset for protocol 'OSDF-director' (Size: 596.109375 MB) in call 5
Appended results for protocol 'OSDF-director', call 5 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 5
Starting processing of dataset for protocol 'OSDF-director' (Size: 596.109375 MB) in call 6
Appended results for protocol 'OSDF-director', call 6 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 6
Starting processing of dataset for protocol 'OSDF-director' (Size: 596.109375 MB) in call 7
Appended results for protocol 'OSDF-director', call 7 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 7
Starting processing of dataset for protocol 'OSDF-director' (Size: 596.109375 MB) in call 8
Appended results for protocol 'OSDF-director', call 8 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 8
Starting processing of dataset for protocol 'OSDF-director' (Size: 794.8125 MB) in call 1
Appended results for protocol 'OSDF-director', call 1 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 1
Starting processing of dataset for protocol 'OSDF-director' (Size: 794.8125 MB) in call 2
Appended results for protocol 'OSDF-director', call 2 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 2
Starting processing of dataset for protocol 'OSDF-director' (Size: 794.8125 MB) in call 3
Appended results for protocol 'OSDF-director', call 3 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 3
Starting processing of dataset for protocol 'OSDF-director' (Size: 794.8125 MB) in call 4
Appended results for protocol 'OSDF-director', call 4 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 4
Starting processing of dataset for protocol 'OSDF-director' (Size: 794.8125 MB) in call 5
Appended results for protocol 'OSDF-director', call 5 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 5
Starting processing of dataset for protocol 'OSDF-director' (Size: 794.8125 MB) in call 6
Appended results for protocol 'OSDF-director', call 6 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 6
Starting processing of dataset for protocol 'OSDF-director' (Size: 794.8125 MB) in call 7
Appended results for protocol 'OSDF-director', call 7 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 7
Starting processing of dataset for protocol 'OSDF-director' (Size: 794.8125 MB) in call 8
Appended results for protocol 'OSDF-director', call 8 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 8
Starting processing of dataset for protocol 'OSDF-director' (Size: 1037.8125 MB) in call 1
Appended results for protocol 'OSDF-director', call 1 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 1
Starting processing of dataset for protocol 'OSDF-director' (Size: 1037.8125 MB) in call 2
Appended results for protocol 'OSDF-director', call 2 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 2
Starting processing of dataset for protocol 'OSDF-director' (Size: 1037.8125 MB) in call 3
Appended results for protocol 'OSDF-director', call 3 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 3
Starting processing of dataset for protocol 'OSDF-director' (Size: 1037.8125 MB) in call 4
Appended results for protocol 'OSDF-director', call 4 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 4
Starting processing of dataset for protocol 'OSDF-director' (Size: 1037.8125 MB) in call 5
Appended results for protocol 'OSDF-director', call 5 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 5
Starting processing of dataset for protocol 'OSDF-director' (Size: 1037.8125 MB) in call 6
Appended results for protocol 'OSDF-director', call 6 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 6
Starting processing of dataset for protocol 'OSDF-director' (Size: 1037.8125 MB) in call 7
Appended results for protocol 'OSDF-director', call 7 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 7
Starting processing of dataset for protocol 'OSDF-director' (Size: 1037.8125 MB) in call 8
Appended results for protocol 'OSDF-director', call 8 to 'ncar_benchmark_ap40_nodask.csv'
Finished processing dataset for protocol 'OSDF-director' in call 8
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[29], line 9
6 df_diagnostics = diag_timer.dataframe()
8 # Calculate MB/s for each run
----> 9 df_diagnostics['MBps'] = df_diagnostics['dataset_size'] / df_diagnostics['runtime']
10 df_diagnostics
File ~/miniconda3/lib/python3.12/site-packages/pandas/core/frame.py:4102, in DataFrame.__getitem__(self, key)
4100 if self.columns.nlevels > 1:
4101 return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
4103 if is_integer(indexer):
4104 indexer = [indexer]
File ~/miniconda3/lib/python3.12/site-packages/pandas/core/indexes/range.py:417, in RangeIndex.get_loc(self, key)
415 raise KeyError(key) from err
416 if isinstance(key, Hashable):
--> 417 raise KeyError(key)
418 self._check_indexing_error(key)
419 raise KeyError(key)
KeyError: 'dataset_size'
# Calculate MB/s for each run
# df_diagnostics['MBps'] = df_diagnostics['dataset_size'] / df_diagnostics['runtime']
# Read the results back from the CSV that benchmark_protocol appends to.
df_diagnostics = pd.read_csv(csv_file_path)
df_diagnostics
# Notebook output: Loading...
# df_diagnostics = diag_timer.dataframe()
# df_diagnostics

# Plotting MBps vs data size for each protocol and call type
# Define different alpha values for each protocol
alpha_values = {"HTTPS-only": 0.8, "OSDF-director": 0.6} # Adjust transparency as needed
marker_style = {"HTTPS-only": "o", "OSDF-director": "x"} # Define different markers for each protocol
edgecolors = {"HTTPS-only": "black", "OSDF-director": None}
#
# Define markers for each protocol
protocol_markers = {
    "HTTPS-only": 'o',  # Circle marker
    "OSDF-director": 'x'  # Cross marker
}
# Define alpha values for different access types
alpha_first_access = 0.8
alpha_subsequent_access = 0.5

# Scatter plot: first call per size, and the mean of all later calls per size.
fig, ax = plt.subplots(figsize=(10, 6))
for protocol in ["HTTPS-only", "OSDF-director"]:
    protocol_rows = df_diagnostics[df_diagnostics['protocol'] == protocol]

    # First access (call_number == 1)
    first_access = protocol_rows[protocol_rows['call_number'] == 1]
    ax.scatter(first_access['dataset_size'], first_access['MBps'],
               label=f"{protocol} - First Access",
               marker=protocol_markers[protocol],
               alpha=alpha_first_access,
               edgecolor=edgecolors[protocol])

    # Subsequent access (call_number > 1)
    subsequent_access = protocol_rows[protocol_rows['call_number'] > 1]
    subsequent_access_avg = subsequent_access.groupby('dataset_size')['MBps'].mean()

    # Add a slight jitter to dataset size to avoid overlapping
    jitter = 0.1 * subsequent_access_avg.index.to_series().std()  # Adjust jitter dynamically
    if not np.isfinite(jitter):
        # With fewer than two distinct sizes the standard deviation is NaN,
        # which is not a usable range for the random offsets; plot unjittered.
        jitter = 0.0
    jittered_index = subsequent_access_avg.index + np.random.uniform(
        -jitter, jitter, size=len(subsequent_access_avg.index))
    ax.scatter(jittered_index, subsequent_access_avg.values,
               label=f"{protocol} - Subsequent Access (Avg)",
               marker=protocol_markers[protocol],
               alpha=alpha_subsequent_access,
               edgecolor=edgecolors[protocol])

# Customize plot appearance
ax.set_xlabel("Data Size (MB)")
ax.set_ylabel("Data Access Speed (MBps)")
ax.set_title("NCAR --> AP40, Dask: No, 8 requests")
ax.legend()
plt.show()
# Box plot of access speed per dataset size, one box per protocol.
# Sizes are rounded to whole MB and stored as a categorical column (this
# overwrites df_diagnostics['dataset_size'] in place).
#df_diagnostics['dataset_size'] = df_diagnostics['dataset_size'].astype("category")
rounded_sizes = df_diagnostics['dataset_size'].round(0).astype(int)
df_diagnostics['dataset_size'] = rounded_sizes.astype("category")

# Ascending size order for the x axis
size_order = sorted(df_diagnostics['dataset_size'].unique())

# Draw on a new figure; seaborn returns the axes it plotted on
plt.figure(figsize=(12, 6))
box_ax = sns.boxplot(data=df_diagnostics, x="dataset_size", y="MBps",
                     hue="protocol", order=size_order)

# Labels, title and legend
box_ax.set_xlabel("Data Size (MB)")
box_ax.set_ylabel("Data Access Speed (MBps)")
# box_ax.set_yscale('log')
box_ax.set_title("NCAR --> AP40, Dask: No, 8 requests")
box_ax.legend(title="Protocol")
plt.show()
# Same box plot, but with whiskers at the 5th and 95th percentiles.
# NOTE(review): if the earlier box-plot code already ran, 'dataset_size' is
# categorical at this point and the rounding below is applied to that column
# again -- confirm this behaves as intended on the pandas version in use.
#df_diagnostics['dataset_size'] = df_diagnostics['dataset_size'].astype("category")
whole_mb_sizes = df_diagnostics['dataset_size'].round(0).astype(int)
df_diagnostics['dataset_size'] = whole_mb_sizes.astype("category")

# Ascending size order for the x axis
size_order = sorted(df_diagnostics['dataset_size'].unique())

# Draw on a new figure; seaborn returns the axes it plotted on
plt.figure(figsize=(12, 6))
whis_ax = sns.boxplot(data=df_diagnostics, x="dataset_size", y="MBps",
                      hue="protocol", order=size_order, whis=[5, 95])

# Labels, title and legend
whis_ax.set_xlabel("Data Size (MB)")
whis_ax.set_ylabel("Data Access Speed (MBps)")
# whis_ax.set_yscale('log')
whis_ax.set_title("NCAR --> AP40, Dask: No, 8 requests")
whis_ax.legend(title="Protocol")
plt.show()