# Display output of plots directly in Notebook
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import intake
import numpy as np
import pandas as pd
import xarray as xr
# import s3fs
# import seaborn as sns
import re
# import nest_asyncio
# nest_asyncio.apply()
import aiohttp
import fsspec.implementations.http as fshttp
from pelicanfs.core import PelicanFileSystem, PelicanMap, OSDFFileSystem

import dask
from dask_jobqueue import PBSCluster
from dask.distributed import Client
from dask.distributed import performance_report

# Year bounds (as strings) for the two analysis periods; the init_* pair is
# used further down to slice the time axis of the historical data.
init_year0 = '1991'
init_year1 = '2020'
final_year0 = '2071'
final_year1 = '2100'

# This overwrites the default scheduler with a single-threaded scheduler
dask.config.set(scheduler='synchronous')
# Output: <dask.config.set at 0x1492f7245bb0>

# File paths
# Scratch area; the Dask cluster setup below derives its spill and log directories from it.
rda_scratch = '/gpfs/csfs1/collections/rda/scratch/harshah'
#
# Locations of the modified (OSDF) intake catalog: JSON description and CSV table.
rda_data = '/gpfs/csfs1/collections/rda/data/'
new_intake_path = rda_data + 'harshah/intake_catalogs/cesm2-lens-osdf/aws-cesm2-le.json'
new_intake_csvpath = rda_data + 'harshah/intake_catalogs/cesm2-lens-osdf/aws-cesm2-le.csv'

# Section: Set up osdf url to use with PelicanFS
- We should use one of the two pelicanFS FSSpec protocols (‘osdf’ or ‘pelican’) instead of the https protocol.
- We will use the ‘osdf’ protocol and modify the existing CESM2-LENS catalog
- So, the urls will look like: osdf_discovery_url + namespace prefix + path to file or object
- In this case, the urls will look like osdf:///aws-opendata/us-west-2/ncar-cesm2-lens + path to individual zarr stores
s3_link = 's3://'
# osdf_url = 'https://osdf-director.osg-htc.org/aws-opendata/us-west-2/'
osdf_url = 'osdf:///aws-opendata/us-west-2/'
#
rda_url = 'https://data.rda.ucar.edu/'
new_intake_url = rda_url + 'harshah/intake_catalogs/cesm2-lens-osdf/aws-cesm2-le.json'%%time
pelfs = OSDFFileSystem(direct_reads =True) # OSDFFileSystem is already aware of the osdf discovery url
#pelfs = PelicanFileSystem("pelican://osg-htc.org")
# pelfs.ls('/ncar/rda/')CPU times: user 430 μs, sys: 0 ns, total: 430 μs
Wall time: 433 μs
%%time
# pelfs = PelicanFileSystem("https://osdf-director.osg-htc.org/")
# pelfs.ls('/aws-opendata/us-west-2/ncar-cesm2-lens/')CPU times: user 4 μs, sys: 0 ns, total: 4 μs
Wall time: 8.11 μs
# Try to create a file url following Emma's example
cesm2_lens_path = '/aws-opendata/us-west-2/ncar-cesm2-lens/'
#osdf_director = 'https://osdf-director.osg-htc.org/'
zarr_path = cesm2_lens_path + 'atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr'
#
# Store object for zarr_path on the pelfs filesystem; it is handed to
# xr.open_zarr in the next cell.
pel_zarr = PelicanMap(zarr_path, pelfs)
print(zarr_path)
# Output: /aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr
- Note the extra '/' in between the (pelican_director url + cesm2_lens_path). This is required.
%%time
# Now, try to access the zarr store using open_zarr
test_zarr = xr.open_zarr(pel_zarr).TREFHT
test_zarrLoading...
%%time
test_zarr.isel(member_id=0,time=0).plot()CPU times: user 432 ms, sys: 214 ms, total: 647 ms
Wall time: 31.6 s

%%time
#Using HTTPS protocol
# test = xr.open_dataset('https://osdf-director.osg-htc.org/aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr',\
# engine='zarr').TREFHT
#Using pelicanfs' OSDF protocol
test = xr.open_dataset('osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr',\
engine='zarr').TREFHT
testLoading...
# %%time
# test.valuesComments¶
- PelicanFs doesn’t seem to support the ‘ls’ command
- However, we can load the zarr store using the full url/ pelfs
- So, let us try using an intake catalog
- The original intake catalog can be found at ‘https://raw.githubusercontent.com/NCAR/cesm2-le-aws/main/intake-catalogs/aws-cesm2-le.json’
- Let us modify this catalog by replacing the ‘s3://’ prefix of each path to the ncar-cesm2-lens objects with the osdf protocol + namespace prefix
- The size of this catalog is > 100 MB. So, let us not upload it to a github repo
- Let us save the catalog to a folder on NCAR’s disk storage
Modify the intake catalog¶
# Open collection description file using intake
catalog = intake.open_esm_datastore(
    'https://raw.githubusercontent.com/NCAR/cesm2-le-aws/main/intake-catalogs/aws-cesm2-le.json'
)
catalog
# Output: Loading...

# The cells below are kept commented out: they rewrite the catalog's path
# column from s3 to osdf paths and save the result as CSV.
# df_s3 = catalog.df
# df_s3['path']

# # Change s3 paths to osdf paths
# df_s3['path'] = df_s3['path'].str.replace(s3_link, '')
# df_s3['path'] = osdf_url + df_s3['path']
# df_s3['path'].head().values

# %%time
# df_s3.to_csv(new_intake_csvpath)

# Section: Test the new intake catalog after spinning up a cluster
# Create a PBS cluster object
cluster = PBSCluster(
    job_name='dask-wk24-hpc',
    cores=1,
    memory='4GiB',
    processes=1,
    local_directory=rda_scratch + '/dask/spill',
    resource_spec='select=1:ncpus=1:mem=4GB',
    queue='casper',
    walltime='5:00:00',
    log_directory=rda_scratch + '/dask/logs',
    #interface = 'ib0'
    interface='ext',
)

# Attach a distributed client to the cluster.
client = Client(cluster)
client
# Output: Loading...

# Scale the cluster to 4.
cluster.scale(4)
cluster
# Output: Loading...
Access the data from the AWS bucket using intake to compare¶
# Open the modified catalog (the one whose paths use the osdf protocol).
osdf_catalog = intake.open_esm_datastore(new_intake_url)
osdf_catalog
# Output: Loading...

# Check that the catalog paths carry the osdf:/// prefix.
osdf_catalog.df['path'].head().values
# Output:
# array(['osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLNS.zarr',
#        'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLNSC.zarr',
#        'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLUT.zarr',
#        'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FSNS.zarr',
#        'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FSNSC.zarr'],
#       dtype=object)

# Restrict the catalog to the daily TREFHTMX entries.
osdf_catalog_temp = osdf_catalog.search(variable='TREFHTMX', frequency='daily')
osdf_catalog_temp
# Output: Loading...
%%time
#dsets = osdf_catalog_temp.to_dataset_dict(storage_options={'anon':True})
dsets = osdf_catalog_temp.to_dataset_dict()Loading...
%%time
dsets.keys()CPU times: user 3 μs, sys: 0 ns, total: 3 μs
Wall time: 5.01 μs
dict_keys(['atm.historical.daily.cmip6', 'atm.historical.daily.smbb', 'atm.ssp370.daily.cmip6', 'atm.ssp370.daily.smbb'])historical_smbb = dsets['atm.historical.daily.smbb']
historical_smbbLoading...
%%time
historical_smbb.TREFHTMX.isel(member_id=0,time=0)Loading...
%%time
historical_smbb.TREFHTMX.isel(member_id=0,time=0).plot()CPU times: user 154 ms, sys: 40.6 ms, total: 194 ms
Wall time: 8.17 s

%%time
historical_smbb.TREFHTMX.isel(member_id=0,time=0).valuesCPU times: user 4.14 s, sys: 1.56 s, total: 5.7 s
Wall time: 5min 2s
array([[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
...,
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan],
[nan, nan, nan, ..., nan, nan, nan]], dtype=float32)historical_smbb_init = historical_smbb.TREFHTMX.sel(time=slice(init_year0, init_year1))
historical_smbb_initLoading...
Data Access Speed tests¶
- We will now test how long it takes to access data (via OSDF) for various sizes using one of the above arrays
# Test 0 : Single data point, Mem: 4 bytes. Takes 1 min 30s when .compute() is called on this data array (which loads data into memory)
historical_smbb_init0 = historical_smbb_init.isel(lat=0, lon=0, time=0, member_id=0)

#Test 1: Whole globe +all member_ids for 1 time step, Mem: 10.55 MiB ~ 10.55 MB
historical_smbb_init1 = historical_smbb_init.isel(time=0)
historical_smbb_init1
# Output: Loading...
#Try using a specific cache
sdsc_cache='https://sdsc-cache.nationalresearchplatform.org:8443/aws-opendata/us-west-2/ncar-cesm2-lens/atm/monthly/'+\
'cesm2LE-historical-smbb-TREFHTMX.zarr'%%time
test_1 = xr.open_zarr(sdsc_cache).TREFHTMX.isel(time=0)
test_1Loading...
%%time
test_1.compute()Loading...
%%time
# Now, try to access the same array without specifying the cache
historical_smbb_init1.compute()Loading...
# Load the same object via the NCAR origin
ncar_glade_test = xr.open_zarr('osdf:///ncar/rda-transfer/chifan_AWS/ncar-cesm2-lens/atm/monthly/cesm2LE-historical-smbb-TREFHTMX.zarr').TREFHTMX.isel(time=0)
ncar_glade_test%%time
ncar_glade_test.compute()