Precipitation plots from ERA5 - OSDF usage examples

Access ERA5 data from NCAR’s GDEX and plot mean total precipitation¶

import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import intake
import intake_esm
import pandas as pd
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import os

Required Packages¶

Please make sure to installed the packages before moving forward

matplotlib
xarray
dask
cartopy
zarr < 3

# Set up your sratch folder path
username       = os.environ["USER"]
glade_scratch  = "/glade/derecho/scratch/" + username
print(glade_scratch)
#
era5_catalog_url ='https://osdf-director.osg-htc.org/ncar/gdex/d633000/catalogs/d633000-osdf.json'

/glade/derecho/scratch/harshah

USE_PBS_SCHEDULER = True

# Create a PBS cluster object
def get_pbs_cluster():
    """ Create cluster through dask_jobqueue.   
    """
    from dask_jobqueue import PBSCluster
    cluster = PBSCluster(
        job_name = 'dask-osdf-24',
        cores = 1,
        memory = '4GiB',
        processes = 1,
        local_directory = glade_scratch + '/dask/spill',
        log_directory = glade_scratch + '/dask/logs/',
        resource_spec = 'select=1:ncpus=1:mem=4GB',
        queue = 'casper',
        walltime = '3:00:00',
        #interface = 'ib0'
        interface = 'ext'
    )
    return cluster

def get_gateway_cluster():
    """ Create cluster through dask_gateway
    """
    from dask_gateway import Gateway

    gateway = Gateway()
    cluster = gateway.new_cluster()
    cluster.adapt(minimum=2, maximum=4)
    return cluster

def get_local_cluster():
    """ Create cluster using the Jupyter server's resources
    """
    from distributed import LocalCluster, performance_report
    cluster = LocalCluster()    

    cluster.scale(5)
    return cluster

# Obtain dask cluster in one of three ways
if USE_PBS_SCHEDULER:
    cluster = get_pbs_cluster()
elif USE_DASK_GATEWAY:
    cluster = get_gateway_cluster()
else:
    cluster = get_local_cluster()

# Connect to cluster
from distributed import Client
client = Client(cluster)

/glade/u/home/harshah/.conda/envs/osdf/lib/python3.11/site-packages/distributed/node.py:188: UserWarning: Port 8787 is already in use.
Perhaps you already have a cluster running?
Hosting the HTTP server on port 34923 instead
  warnings.warn(

# Scale the cluster and display cluster dashboard URL
n_workers =5
cluster.scale(n_workers)
client.wait_for_workers(n_workers = n_workers)
cluster

%%time
era5_cat = intake.open_esm_datastore(era5_catalog_url)
era5_cat

CPU times: user 125 ms, sys: 7.47 ms, total: 133 ms
Wall time: 5.66 s

era5_cat.df

era5_cat.df[['variable','long_name']].drop_duplicates()

cat_subset = era5_cat.search(variable='MTPR')
cat_subset.df

##############################################################

# Define the xarray_open_kwargs with a compatible engine, for example, 'scipy'
xarray_open_kwargs = {
    'engine': 'h5netcdf',
    'chunks': {},  # Specify any chunking if needed
    'backend_kwargs': {}  # Any additional backend arguments if required
}

%%time
# dset_temp = temp_cat.to_dataset_dict(xarray_open_kwargs=xarray_open_kwargs)
dset_subset = cat_subset.to_dataset_dict(xarray_open_kwargs={'engine':'zarr','backend_kwargs':{'consolidated': True,'zarr_format': 2}})


--> The keys in the returned dictionary of datasets are constructed as follows:
	'variable.short_name'

CPU times: user 532 ms, sys: 63.9 ms, total: 596 ms
Wall time: 10.3 s

dset_subset

{'MTPR.mtpr': <xarray.Dataset> Size: 3TB
 Dimensions:                (forecast_initial_time: 61666, forecast_hour: 12,
                             latitude: 721, longitude: 1440)
 Coordinates:
   * forecast_initial_time  (forecast_initial_time) datetime64[ns] 493kB 1940-...
   * forecast_hour          (forecast_hour) int32 48B 1 2 3 4 5 6 7 8 9 10 11 12
   * latitude               (latitude) float64 6kB 90.0 89.75 ... -89.75 -90.0
   * longitude              (longitude) float64 12kB 0.0 0.25 0.5 ... 359.5 359.8
 Data variables:
     MTPR                   (forecast_initial_time, forecast_hour, latitude, longitude) float32 3TB dask.array<chunksize=(2, 12, 721, 1440), meta=np.ndarray>
     utc_date               (forecast_initial_time) int32 247kB dask.array<chunksize=(61666,), meta=np.ndarray>
 Attributes: (12/17)
     CONVERSION_DATE:                 Wed Nov  2 10:03:48 MDT 2022
     CONVERSION_PLATFORM:             Linux r3i0n28 4.12.14-95.51-default #1 S...
     Conventions:                     CF-1.6
     DATA_SOURCE:                     ECMWF: https://cds.climate.copernicus.eu...
     NCO:                             netCDF Operators version 5.0.3 (Homepage...
     NETCDF_COMPRESSION:              NCO: Precision-preserving compression to...
     ...                              ...
     intake_esm_attrs:variable:       MTPR
     intake_esm_attrs:_data_format_:  reference
     intake_esm_attrs:short_name:     mtpr
     intake_esm_attrs:long_name:      Mean total precipitation rate
     intake_esm_attrs:units:          kg m**-2 s**-1
     intake_esm_dataset_key:          MTPR.mtpr}

mtpr = dset_subset['MTPR.mtpr']
mtpr

da = mtpr.MTPR.isel(forecast_hour=10).isel(forecast_initial_time=10)

# 2) Make a Cartopy map axis
proj = ccrs.PlateCarree()  
fig, ax = plt.subplots(figsize=(12, 6), subplot_kw={"projection": proj})

# 3) Plot onto that axis
im = da.plot(
    ax=ax,  
    transform=ccrs.PlateCarree(),
    cmap = 'Blues',
    x="longitude",
    y="latitude",
    #robust=True,                
    cbar_kwargs={"label": getattr(da, "units", "")},
)

ax.coastlines(color="black", linewidth=1.0)

<cartopy.mpl.feature_artist.FeatureArtist at 0x151ab39f1650>

cluster.close()