
DART-CAM6 reanalysis diagnostic plots

Input Data Access

  • This notebook illustrates how to make diagnostic plots from the DART reanalysis hosted on NCAR’s GDEX
  • https://gdex.ucar.edu/datasets/d345001/#
  • The data is open access and can be reached via two protocols (see the sketch after this list):
    1. HTTPS (if you have access to NCAR’s HPC)
    2. OSDF, using an intake-ESM catalog.
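
Both protocols serve the same intake-ESM catalog; only the endpoint URL differs. Here is a minimal sketch of switching between them (the CATALOG_URLS mapping and the open_catalog helper are hypothetical names; both URLs appear in the cells below):

import intake

# Both endpoints serve the same intake-ESM catalog for dataset d345001.
CATALOG_URLS = {
    'https': 'https://stratus.gdex.ucar.edu/d345001/catalogs/d345001-https-zarr.json',
    'osdf':  'https://osdf-data.gdex.ucar.edu/ncar-gdex/d345001/catalogs/d345001-osdf-zarr.json',
}

def open_catalog(protocol='https'):
    '''Open the DART reanalysis catalog via the chosen access protocol.'''
    return intake.open_esm_datastore(CATALOG_URLS[protocol])
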
# Imports
import intake
import numpy as np
import pandas as pd
import xarray as xr
import seaborn as sns
import re
import os
import matplotlib.pyplot as plt
import fsspec.implementations.http as fshttp
from pelicanfs.core import PelicanFileSystem, PelicanMap, OSDFFileSystem 
import dask 
from dask_jobqueue import PBSCluster
from dask.distributed import Client
from dask.distributed import performance_report
# Set up your scratch folder path
username       = os.environ["USER"]
glade_scratch  = "/glade/derecho/scratch/" + username
print(glade_scratch)
/glade/derecho/scratch/harshah
# catalog_url = 'https://osdf-data.gdex.ucar.edu/ncar-gdex/d345001/catalogs/d345001-osdf-zarr.json'
catalog_url = 'https://stratus.gdex.ucar.edu/d345001/catalogs/d345001-https-zarr.json'  # NCAR's object store
print(catalog_url)
https://stratus.gdex.ucar.edu/d345001/catalogs/d345001-https-zarr.json

Create a PBS cluster

# Create a PBS cluster object
cluster = PBSCluster(
    job_name = 'dask-wk24-hpc',
    cores = 1,
    memory = '8GiB',
    processes = 1,
    local_directory = glade_scratch+'/dask/spill',
    log_directory = glade_scratch + '/dask/logs/',
    resource_spec = 'select=1:ncpus=1:mem=8GB',
    queue = 'casper',
    walltime = '5:00:00',
    #interface = 'ib0'
    interface = 'ext'
)
# Create the client to load the Dashboard
client = Client(cluster)
n_workers = 8
cluster.scale(n_workers)
client.wait_for_workers(n_workers = n_workers)
cluster
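
As an alternative to the fixed cluster.scale(n_workers) call above, dask_jobqueue clusters can also scale adaptively; a minimal sketch (the worker bounds are illustrative):

# Let Dask submit and cancel PBS jobs with demand, within these bounds.
cluster.adapt(minimum=2, maximum=n_workers)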

Load DART Reanalysis data from GDEX using an intake catalog

col = intake.open_esm_datastore(catalog_url)
col

Load data into xarray using catalog

data_var = 'PS'

col_subset = col.search(variable=data_var)
col_subset
col_subset.df['path'].values
array(['https://stratus.gdex.ucar.edu/d345001/weekly/PS.zarr'], dtype=object)
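
The catalog path above is just the URL of a Zarr store, so as a sanity check you can also open it directly with xarray and skip intake-ESM entirely (assuming zarr and an HTTP-capable fsspec backend, e.g. aiohttp, are installed):

# Open the same store directly from the object store (illustrative).
ds_direct = xr.open_zarr('https://stratus.gdex.ucar.edu/d345001/weekly/PS.zarr', consolidated=True)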
col_subset.df

Convert the catalog subset to a dictionary of xarray datasets, and use the first one.

dsets = col_subset.to_dataset_dict(zarr_kwargs={"consolidated": True})
print(f"\nDataset dictionary keys:\n {dsets.keys()}")

# Load the first dataset and display a summary.
dataset_key = list(dsets.keys())[0]
store_name = dataset_key + ".zarr"

ds = dsets[dataset_key]
ds

Define Plot Functions

Get consistently shaped data slices for both 2D and 3D variables.

def getSlice(ds, data_var):
    '''If the data has vertical levels, choose the level closest
       to the Earth's surface for 2-D diagnostic plots.
    '''
    data_slice = ds[data_var]

    if 'lev' in data_slice.dims:
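        # CAM orders 'lev' from model top to surface, so the last value is the lowest level.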
        lastLevel = ds.lev.values[-1]
        data_slice = data_slice.sel(lev = lastLevel)
        data_slice = data_slice.squeeze()

    return data_slice
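
For the 2-D variable used in this notebook ('PS'), getSlice returns the field unchanged; only 3-D fields with a 'lev' dimension are subset. A quick illustrative check:

ps_slice = getSlice(ds, data_var)
print(ps_slice.dims)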

Get lat/lon dimension names

def getSpatialDimensionNames(data_slice):
    '''Get the spatial dimension names for this data slice.
    '''
    # Determine lat/lon conventions for this slice: staggered wind
    # components on CAM's finite-volume grid use 'slat'/'slon' dimensions.
    lat_dim = 'lat' if 'lat' in data_slice.dims else 'slat'
    lon_dim = 'lon' if 'lon' in data_slice.dims else 'slon'
    
    return [lat_dim, lon_dim]
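
For 'PS' this reports ['lat', 'lon']; staggered wind components (e.g. 'US' or 'VS', if present in the catalog) would report 'slat' or 'slon' instead. An illustrative check:

print(getSpatialDimensionNames(getSlice(ds, data_var)))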

Produce Time Series Spaghetti Plot of Ensemble Members

def plot_timeseries(ds, data_var, store_name):
    '''Create a spaghetti plot for a given variable.
    '''
    figWidth = 25
    figHeight = 20
    linewidth = 0.5
    
    # Plot the aggregate statistics across time.
    fig, axs = plt.subplots(3, 1, figsize=(figWidth, figHeight))

    data_slice = getSlice(ds, data_var)
    spatial_dims = getSpatialDimensionNames(data_slice)

    unit_string = ds[data_var].attrs['units']

    # Persist the slice so it's read from disk only once.
    # This is faster when data values are reused many times.
    data_slice = data_slice.persist()

    max_vals = data_slice.max(dim = spatial_dims).transpose()
    mean_vals = data_slice.mean(dim = spatial_dims).transpose()
    min_vals = data_slice.min(dim = spatial_dims).transpose()

    
    rangeMaxs = max_vals.max(dim = 'member_id')
    rangeMins = max_vals.min(dim = 'member_id')
    axs[0].set_facecolor('lightgrey')
    axs[0].fill_between(ds.time, rangeMins, rangeMaxs, linewidth=linewidth, color='white')
    axs[0].plot(ds.time, max_vals, linewidth=linewidth, color='red', alpha=0.1)
    axs[0].set_title('Ensemble Member Maxima Over Time', fontsize=20)
    axs[0].set_ylabel(unit_string)

    rangeMaxs = mean_vals.max(dim = 'member_id')
    rangeMins = mean_vals.min(dim = 'member_id')
    axs[1].set_facecolor('lightgrey')
    axs[1].fill_between(ds.time, rangeMins, rangeMaxs, linewidth=linewidth, color='white')
    axs[1].plot(ds.time, mean_vals, linewidth=linewidth, color='red', alpha=0.1)
    axs[1].set_title('Ensemble Member Means Over Time', fontsize=20)
    axs[1].set_ylabel(unit_string)

    rangeMaxs = min_vals.max(dim = 'member_id')
    rangeMins = min_vals.min(dim = 'member_id')
    axs[2].set_facecolor('lightgrey')
    axs[2].fill_between(ds.time, rangeMins, rangeMaxs, linewidth=linewidth, color='white')
    axs[2].plot(ds.time, min_vals, linewidth=linewidth, color='red', alpha=0.1)
    axs[2].set_title('Ensemble Member Minima Over Time', fontsize=20)
    axs[2].set_ylabel(unit_string)

    plt.suptitle(store_name, fontsize=25)
    
    return fig

Create the Spaghetti Plot Showing All Ensemble Members

%%time

store_name = f'{data_var}.zarr'
fig = plot_timeseries(ds, data_var, store_name)
CPU times: user 8.85 s, sys: 557 ms, total: 9.41 s
Wall time: 36.1 s
Figure: three-panel spaghetti plot of the ensemble maxima, means, and minima over time.
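
To keep the figure, you can write it out, e.g. to your scratch space (the filename is illustrative):

# Save the figure to scratch (illustrative filename).
fig.savefig(f'{glade_scratch}/{data_var}_spaghetti.png', dpi=150, bbox_inches='tight')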

Release Dask workers

cluster.close()
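
Closing the client as well shuts down its connection to the scheduler cleanly:

client.close()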