Skip to article frontmatterSkip to article content

Access the CESM2-LENS data hosted on AWS through the AWS Open Data origin (via OSDF)

# Display output of plots directly in Notebook
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import intake
import numpy as np
import pandas as pd
import xarray as xr
# import s3fs
# import seaborn as sns
import re
# import nest_asyncio
# nest_asyncio.apply()
import aiohttp
import fsspec.implementations.http as fshttp
from pelicanfs.core import PelicanFileSystem, PelicanMap, OSDFFileSystem 
import dask 
from dask_jobqueue import PBSCluster
from dask.distributed import Client
from dask.distributed import performance_report
# Year bounds, kept as strings, for two 30-year windows.
# The "init" pair is used further down as the bounds of a time slice;
# the "final" pair is presumably the matching end-of-century window.
init_year0, init_year1 = '1991', '2020'
final_year0, final_year1 = '2071', '2100'
# This overwrites the default scheduler with a single-threaded scheduler
# NOTE(review): a dask.distributed Client is created later in this notebook;
# presumably it takes over as the default scheduler once it exists, so this
# setting would only matter for the cells that run before it — confirm.
dask.config.set(scheduler='synchronous')  
<dask.config.set at 0x1492f7245bb0>
# Locations on the GPFS file system.
# rda_scratch is the scratch area (used later for Dask spill and log
# directories); rda_data is the data root and keeps its trailing slash so
# that relative paths can be appended to it directly.
rda_scratch = '/gpfs/csfs1/collections/rda/scratch/harshah'
rda_data = '/gpfs/csfs1/collections/rda/data/'

# Where the modified (OSDF) intake catalog lives: JSON description + CSV table.
new_intake_path = f'{rda_data}harshah/intake_catalogs/cesm2-lens-osdf/aws-cesm2-le.json'
new_intake_csvpath = f'{rda_data}harshah/intake_catalogs/cesm2-lens-osdf/aws-cesm2-le.csv'

Set up osdf url to use with PelicanFS

  • We should use one of the two PelicanFS fsspec protocols (‘osdf’ or ‘pelican’) instead of the https protocol.
  • We will use the ‘osdf’ protocol and modify the existing CESM2-LENS catalog
  • So, the urls will look like: osdf_discovery_url + namespace prefix + path to file or object
  • In this case, the urls will look like osdf:///aws-opendata/us-west-2/ncar-cesm2-lens + path to individual zarr stores
# Prefixes used to rewrite catalog paths from S3 to OSDF.
# s3_link is the scheme to strip from the original paths; osdf_url is the
# protocol + namespace prefix that replaces it.
# (HTTPS alternative for osdf_url:
#  'https://osdf-director.osg-htc.org/aws-opendata/us-west-2/')
s3_link = 's3://'
osdf_url = 'osdf:///aws-opendata/us-west-2/'

# Web location of the modified intake catalog.
rda_url = 'https://data.rda.ucar.edu/'
new_intake_url = f'{rda_url}harshah/intake_catalogs/cesm2-lens-osdf/aws-cesm2-le.json'
%%time
# Filesystem object used below to build a mapper onto a Zarr store.
# NOTE(review): the exact effect of direct_reads=True is not visible here —
# confirm against the pelicanfs documentation.
pelfs = OSDFFileSystem(direct_reads =True) # OSDFFileSystem is already aware of the osdf discovery url
# Alternative: name the federation explicitly instead of relying on the OSDF default.
#pelfs = PelicanFileSystem("pelican://osg-htc.org")
# pelfs.ls('/ncar/rda/')
CPU times: user 430 μs, sys: 0 ns, total: 430 μs
Wall time: 433 μs
%%time
# pelfs = PelicanFileSystem("https://osdf-director.osg-htc.org/")
# pelfs.ls('/aws-opendata/us-west-2/ncar-cesm2-lens/')
CPU times: user 4 μs, sys: 0 ns, total: 4 μs
Wall time: 8.11 μs
# Build the object path of one Zarr store (following Emma's example) and
# wrap it in a PelicanMap so xarray can open it through `pelfs`.
# The path starts with the namespace prefix; no director URL is prepended
# (the HTTPS director would be 'https://osdf-director.osg-htc.org/').
cesm2_lens_path = '/aws-opendata/us-west-2/ncar-cesm2-lens/'
zarr_path = f'{cesm2_lens_path}atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr'

pel_zarr = PelicanMap(zarr_path, pelfs)
print(zarr_path)
/aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr
  • Note the extra ‘/’ between the Pelican director URL and cesm2_lens_path (for example, the third slash in osdf:///aws-opendata/...). This is required.
%%time
# Open the Zarr store through the PelicanMap mapper and keep only the
# TREFHT variable; the last line displays it.
test_zarr = xr.open_zarr(pel_zarr)['TREFHT']
test_zarr
Loading...
%%time
test_zarr.isel(member_id=0,time=0).plot()
CPU times: user 432 ms, sys: 214 ms, total: 647 ms
Wall time: 31.6 s
<Figure size 640x480 with 2 Axes>
%%time
# Open the same store directly from its URL instead of through a mapper.
# The 'osdf' protocol is used here; the HTTPS equivalent would be
#   xr.open_dataset('https://osdf-director.osg-htc.org/aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr',
#                   engine='zarr').TREFHT
test = xr.open_dataset(
    'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-TREFHT.zarr',
    engine='zarr',
)['TREFHT']
test
Loading...
# %%time
# test.values

Comments

  • PelicanFs doesn’t seem to support the ‘ls’ command
  • However, we can load the zarr store either from its full ‘osdf://’ URL or through a PelicanMap built on pelfs
  • So, let us try using an intake catalog
  • The original intake catalog can be found at ‘https://raw.githubusercontent.com/NCAR/cesm2-le-aws/main/intake-catalogs/aws-cesm2-le.json’
  • Let us modify this catalog by replacing the ‘s3://’ prefix of each path (which points into the ncar-cesm2-lens bucket) with the OSDF protocol + namespace prefix ‘osdf:///aws-opendata/us-west-2/’
  • The size of this catalog is > 100 MB. So, let us not upload it to a github repo
  • Let us save the catalog to a folder on NCAR’s disk storage

Modify the intake catalog

# Open collection description file using intake
# This is the original (unmodified) catalog from the NCAR/cesm2-le-aws GitHub
# repository. NOTE(review): its paths are presumably s3:// URLs, which the
# commented-out cell below rewrites to osdf:// — confirm.
catalog = intake.open_esm_datastore(
    'https://raw.githubusercontent.com/NCAR/cesm2-le-aws/main/intake-catalogs/aws-cesm2-le.json'
)
catalog
Loading...
# df_s3 = catalog.df
# df_s3['path']
# # Change s3 paths to osdf paths
# df_s3['path'] = df_s3['path'].str.replace(s3_link, '')
# df_s3['path'] = osdf_url + df_s3['path'] 
# df_s3['path'].head().values
# %%time
# df_s3.to_csv(new_intake_csvpath)

Test the new intake catalog after spinning up a cluster

# Describe the PBS job that backs each Dask worker, then connect a client.
cluster = PBSCluster(
    # Job identity and scheduling
    job_name='dask-wk24-hpc',
    queue='casper',
    walltime='5:00:00',
    # One single-core, single-process worker per job
    cores=1,
    processes=1,
    memory='4GiB',
    resource_spec='select=1:ncpus=1:mem=4GB',
    # Worker spill space and job logs live under the scratch area
    local_directory=f'{rda_scratch}/dask/spill',
    log_directory=f'{rda_scratch}/dask/logs',
    # Network interface for Dask communication ('ib0' is the alternative)
    interface='ext',
)
client = Client(cluster)
client
Loading...
cluster.scale(4)
cluster
Loading...

Access the data from the AWS bucket using intake to compare

# Open the modified catalog (paths use the osdf:// protocol) from its web
# location and display it.
osdf_catalog = intake.open_esm_datastore(new_intake_url)
osdf_catalog
Loading...
osdf_catalog.df['path'].head().values
array(['osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLNS.zarr', 'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLNSC.zarr', 'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FLUT.zarr', 'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FSNS.zarr', 'osdf:///aws-opendata/us-west-2/ncar-cesm2-lens/atm/daily/cesm2LE-historical-cmip6-FSNSC.zarr'], dtype=object)
osdf_catalog_temp = osdf_catalog.search(variable ='TREFHTMX', frequency ='daily')
osdf_catalog_temp
Loading...
%%time
#dsets = osdf_catalog_temp.to_dataset_dict(storage_options={'anon':True})
dsets = osdf_catalog_temp.to_dataset_dict()
Loading...
%%time
dsets.keys()
CPU times: user 3 μs, sys: 0 ns, total: 3 μs
Wall time: 5.01 μs
dict_keys(['atm.historical.daily.cmip6', 'atm.historical.daily.smbb', 'atm.ssp370.daily.cmip6', 'atm.ssp370.daily.smbb'])
historical_smbb = dsets['atm.historical.daily.smbb']
historical_smbb
Loading...
%%time
historical_smbb.TREFHTMX.isel(member_id=0,time=0)
Loading...
%%time
historical_smbb.TREFHTMX.isel(member_id=0,time=0).plot()
CPU times: user 154 ms, sys: 40.6 ms, total: 194 ms
Wall time: 8.17 s
<Figure size 640x480 with 2 Axes>
%%time
historical_smbb.TREFHTMX.isel(member_id=0,time=0).values
CPU times: user 4.14 s, sys: 1.56 s, total: 5.7 s
Wall time: 5min 2s
array([[nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], ..., [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan], [nan, nan, nan, ..., nan, nan, nan]], dtype=float32)
historical_smbb_init = historical_smbb.TREFHTMX.sel(time=slice(init_year0, init_year1))
historical_smbb_init
Loading...

Data Access Speed tests

  • We will now test how long it takes to access data (via OSDF) for various sizes using one of the above arrays
# Test 0: a single data point (4 bytes in memory). Takes about 1 min 30 s when
# .compute() is called on this data array (which loads the data into memory).
historical_smbb_init0 = historical_smbb_init.isel(lat=0,lon=0,time=0,member_id=0)
# Test 1: whole globe + all member_ids for one time step.
# Mem: 10.55 MiB (about 11.06 MB).
historical_smbb_init1 = historical_smbb_init.isel(time=0)
historical_smbb_init1
Loading...
# Address one specific cache (SDSC) directly over HTTPS instead of letting
# the osdf:// protocol choose one.
# NOTE(review): this path points at the atm/monthly store, while the catalog
# dataset it is compared with below is 'atm.historical.daily.smbb' — confirm
# that the monthly store is the intended comparison.
sdsc_cache = (
    'https://sdsc-cache.nationalresearchplatform.org:8443'
    '/aws-opendata/us-west-2/ncar-cesm2-lens/atm/monthly/'
    'cesm2LE-historical-smbb-TREFHTMX.zarr'
)
%%time
test_1 = xr.open_zarr(sdsc_cache).TREFHTMX.isel(time=0)
test_1
Loading...
%%time
test_1.compute()
Loading...
%%time
# Now, try to access the same array without specifying the cache
historical_smbb_init1.compute()
Loading...
# Load the same object via the NCAR origin (osdf:///ncar/... namespace)
# and select the first time step of TREFHTMX; the last line displays it.
ncar_glade_test = xr.open_zarr(
    'osdf:///ncar/rda-transfer/chifan_AWS/ncar-cesm2-lens/atm/monthly/'
    'cesm2LE-historical-smbb-TREFHTMX.zarr'
)['TREFHTMX'].isel(time=0)
ncar_glade_test
%%time
ncar_glade_test.compute()