Stream: python-questions

Topic: intake-esm open dataset error


view this post on Zulip Brian Bonnlander (Mar 16 2020 at 17:42):

I'm getting this stack trace on Casper using NCAR_JobQueue. It looks like opening an intake-esm catalog is failing.

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 17:42):

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/core.py in to_dataset_dict(self, zarr_kwargs, cdf_kwargs, preprocess, aggregate, storage_options, progressbar)
    390             self.progressbar = progressbar
    391
--> 392         return self._open_dataset()
    393
    394     def _open_dataset(self):

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/core.py in _open_dataset(self)
    501                 progress(futures)
    502
--> 503             dsets = client.gather(futures)
    504
    505         self._ds = {group_id: ds for (group_id, ds) in dsets}

~/miniconda3/envs/analysis/lib/python3.7/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
   1891                 direct=direct,
   1892                 local_worker=local_worker,
-> 1893                 asynchronous=asynchronous,
   1894             )
   1895

~/miniconda3/envs/analysis/lib/python3.7/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    778         else:
    779             return sync(
--> 780                 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
    781             )
    782

~/miniconda3/envs/analysis/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    346     if error[0]:
    347         typ, exc, tb = error[0]
--> 348         raise exc.with_traceback(tb)
    349     else:
    350         return result[0]

~/miniconda3/envs/analysis/lib/python3.7/site-packages/distributed/utils.py in f()
    330             if callback_timeout is not None:
    331                 future = asyncio.wait_for(future, callback_timeout)
--> 332             result[0] = yield future
    333         except Exception as exc:
    334             error[0] = sys.exc_info()

~/miniconda3/envs/analysis/lib/python3.7/site-packages/tornado/gen.py in run(self)
    733
    734                     try:
--> 735                         value = future.result()
    736                     except Exception:
    737                         exc_info = sys.exc_info()

~/miniconda3/envs/analysis/lib/python3.7/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
   1750                             exc = CancelledError(key)
   1751                         else:
-> 1752                             raise exception.with_traceback(traceback)
   1753                         raise exc
   1754                     if errors == "skip":

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/core.py in _load_group_dataset()
    582         zarr_kwargs,
    583         cdf_kwargs,
--> 584         preprocess,
    585     )
    586

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in _aggregate()
    162             return ds
    163
--> 164     return apply_aggregation(v)
    165
    166

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in apply_aggregation()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in <listcomp>()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in apply_aggregation()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in <listcomp>()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in apply_aggregation()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in <listcomp>()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in apply_aggregation()
     91                 zarr_kwargs=zarr_kwargs,
     92                 cdf_kwargs=cdf_kwargs,
---> 93                 preprocess=preprocess,
     94             )
     95             ds.attrs['intake_esm_varname'] = varname

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in _open_asset()
    194         except Exception as e:
    195             logger.error(f'Failed to open netCDF/HDF dataset.')
--> 196             raise e
    197
    198     if preprocess is None:

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in _open_asset()
    191         logger.info(f'Opening netCDF/HDF dataset: {root} - protocol: {protocol}')
    192         try:
--> 193             ds = xr.open_dataset(path, **cdf_kwargs)
    194         except Exception as e:
    195             logger.error(f'Failed to open netCDF/HDF dataset.')

~/miniconda3/envs/analysis/lib/python3.7/site-packages/xarray/backends/api.py in open_dataset()
    527                 "with engine='scipy' or 'h5netcdf'"
    528             )
--> 529         engine = _get_engine_from_magic_number(filename_or_obj)
    530         if engine == "scipy":
    531             store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)

~/miniconda3/envs/analysis/lib/python3.7/site-packages/xarray/backends/api.py in _get_engine_from_magic_number()
    115         magic_number = filename_or_obj[:8]
    116     else:
--> 117         if filename_or_obj.tell() != 0:
    118             raise ValueError(
    119                 "file-like object read/write pointer not at zero "

AttributeError: 'FSMap' object has no attribute 'tell'

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 17:44):

@Brian Bonnlander, that was a bug (introduced by me a few weeks ago :slight_smile:). I fixed it earlier this weekend...

python -m pip install git+https://github.com/NCAR/intake-esm.git

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 17:45):

Update to the HEAD of intake-esm's master branch
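
For context, a minimal sketch of the kind of workflow that was hitting this bug; the catalog path is the GLADE one discussed later in this thread, and the search terms are illustrative, not from the original post:

import intake

# Open an intake-esm catalog (path as used later in this thread)
catalog = intake.open_esm_datastore(
    "/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json"
)

# Narrow the catalog, then load the matching assets into xarray datasets.
# to_dataset_dict() is the call at the top of the traceback above; the
# query values here are illustrative.
subset = catalog.search(experiment_id="historical", variable_id="tas")
dsets = subset.to_dataset_dict(cdf_kwargs={"chunks": {"time": 36}})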

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 17:47):

Well, thanks for fixing it! :)
Trying to write some LENS ocean data to Zarr today...

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 17:48):

@Anderson Banihirwe Do I have to restart my kernel after the pip install? I am still getting the same error using the same kernel.

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 17:49):

Yes, you will need to restart your kernel

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 17:49):

unless you are using the autoreload magic

%load_ext autoreload
%autoreload 2

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 17:58):

Thanks for your help so far today. Is it bad when the Dask status tabs stop working in a JupyterHub session? In the middle of starting the first Zarr store save, the tabs went blank.

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 17:59):

You are welcome!

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:00):

I wouldn't worry about the dashboard freezing..

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:01):

It tends to happen when a huge amount of information is being sent to the bokeh server on which the dashboard is running

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:02):

If you wait a little bit, things may normalize and the dashboard may go back to working as usual

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:02):

OK, but I am also not seeing my Zarr folder growing in size. After about 5 minutes, it is stuck at around 250K bytes. Can it sometimes take a while for the in-memory work to finish before starting a write?
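
When writing with a distributed client, one way to tell "still computing" apart from "stalled" is to build the write lazily and watch its progress explicitly. A minimal sketch, assuming ds is a chunked xarray Dataset and client is a connected distributed client (names are illustrative):

from dask.distributed import progress

# compute=False builds the write graph without executing it and
# returns a dask.delayed object instead of writing immediately
delayed_write = ds.to_zarr("ocean_data.zarr", mode="w", compute=False)

# Submit to the cluster and show a progress bar; bytes only start
# landing in the store as individual chunk-write tasks complete.
future = client.compute(delayed_write)
progress(future)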

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:03):

It sounds like something went wrong

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:03):

I would restart everything

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:03):

tornado.websocket.WebSocketClosedError

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:03):

Restart the entire hub?

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:04):

tornado.websocket.WebSocketClosedError

That's the bokeh server

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:04):

Restart the entire hub?

Just the kernel

view this post on Zulip Deepak Cherian (Mar 16 2020 at 18:04):

How much data are you writing? What is len(dataset.__dask_graph__())? That's the number of tasks; if it's more than 2 million or so, it's too big
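
A self-contained sketch of that check, with a synthetic dataset standing in for the real one:

import numpy as np
import xarray as xr

# Synthetic stand-in for a large model dataset (shapes are illustrative)
ds = xr.Dataset(
    {"TEMP": (("time", "lat", "lon"), np.zeros((1200, 192, 288)))}
).chunk({"time": 12})

# Each chunk contributes tasks to the Dask graph; by the rule of thumb
# above, rechunk to fewer, larger chunks if this reaches the millions.
print(len(ds.__dask_graph__()))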

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:05):

So, I go back to the tab to kill everything, and now the Dask tabs are working again!

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:07):

But the number of Dask workers is staying at 2. Everything looks really slow. Restart?

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:07):

As Deepak pointed out, I would also check the number of tasks before writing to zarr

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:08):

Aha, I have to interrupt the Zarr write to find this out, right? Change the code to print this?

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:09):

But the number of Dask workers is staying at 2. Everything looks really slow. Restart?

This is normal. You are competing for resources with other users... If the queue is full, dask workers have to wait to get in
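
The slow ramp-up reflects how jobqueue-style clusters work: scale() only requests batch jobs, and workers appear as the scheduler grants them. A minimal sketch, assuming the ncar-jobqueue package mentioned at the top of this thread:

from dask.distributed import Client
from ncar_jobqueue import NCARCluster  # assumes the ncar-jobqueue package

cluster = NCARCluster()   # queue/project/walltime come from its config
cluster.scale(jobs=10)    # requests 10 worker jobs from the batch system
client = Client(cluster)

# The dashboard's worker count grows only as the queue starts the jobs
print(client.dashboard_link)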

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:10):

OK, number of workers just jumped to around 10.

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:10):

Things should scale as you get more workers

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:11):

Do you have the dashboard open?

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:11):

Yes, dashboard is open

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:12):

If so, look at the dashboard's /status page or the progress frame; you should be able to see the number of tasks there

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:13):

Dask progress tab says total: 27,621. Is this the tasks?

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:13):

Yes

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:13):

OK, it looks like progress is being made.

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 19:01):

I can't remember how to authenticate with Globus File Transfer. Did we use a special account, or our own Google identity, or something else?

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 19:02):

Ah, I meant to make that a private message...sorry.

view this post on Zulip Kevin Paul (Mar 16 2020 at 19:19):

You can delete messages. FYI.

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 19:21):

I expected to find a little trash can icon in the upper right menu, but it's not there.

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 19:22):

There is a 10-minute time limit to delete, by default.

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 19:23):

I think an "organization admin" is needed to change that.

view this post on Zulip Danica Lombardozzi (Oct 12 2020 at 21:10):

I've been using the following code successfully for several months:
catalog = intake.open_esm_datastore("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json")

I am now getting an error message FileNotFoundError: [Errno 2] No such file or directory: 'glade-cmip6.csv.gz'

I've checked that the file exists in the directory, and I also updated as suggested above: pip install git+https://github.com/NCAR/intake-esm.git and restarted the kernel. This fix does not work, and I get the same error message. Did something else change or get updated that I'm not aware of?

view this post on Zulip Anderson Banihirwe (Oct 12 2020 at 23:25):

@Danica Lombardozzi ,

I am unable to reproduce the issue:

In [1]: import intake

In [2]: catalog = intake.open_esm_datastore("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json")

In [3]: catalog.df.head()
Out[3]:
  activity_id institution_id  ...     time_range                                               path
0  AerChemMIP            BCC  ...  201501-205512  /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
1  AerChemMIP            BCC  ...  201501-205512  /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
2  AerChemMIP            BCC  ...  201501-205512  /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
3  AerChemMIP            BCC  ...  201501-205512  /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
4  AerChemMIP            BCC  ...  201501-205512  /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...

and I can confirm that the necessary files are available:

abanihi at casper26 in /glade/collections/cmip/catalog/intake-esm-datastore/catalogs on master [!]
$ ls glade-cmip*
glade-cmip5.csv.gz  glade-cmip5.json  glade-cmip6.csv.gz  glade-cmip6.json

view this post on Zulip Anderson Banihirwe (Oct 12 2020 at 23:27):

When you get a chance, could you post here the full traceback of the error you are getting?

view this post on Zulip Danica Lombardozzi (Oct 12 2020 at 23:34):

Really? Well, I wonder what's going on. I'm having the same issue in two of my scripts. I've been using the CMIP6 2019.10a kernel. Below is more information on the problem I am encountering.

I import several packages:

import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import esmlab

#Tried the install below; unsuccessful at fixing the intake error
#pip install git+https://github.com/NCAR/intake-esm.git

import intake

Then I attempt to import the data:

catalog = intake.open_esm_datastore("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json")
catalog.df.head()

Which results in the error message below:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-3-1231c631f29f> in <module>
----> 1 catalog = intake.open_esm_datastore("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json")
      2 catalog.df.head()

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/intake_esm/core.py in __init__(self, esmcol_path, **kwargs)
     68         self.esmcol_path = esmcol_path
     69         self._col_data = _fetch_and_parse_file(esmcol_path)
---> 70         self.df = self._fetch_catalog()
     71         self._entries = {}
     72         self.urlpath = ''

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/intake_esm/core.py in _fetch_catalog(self)
    113         """Get the catalog file and cache it.
    114         """
--> 115         return pd.read_csv(self._col_data['catalog_file'])
    116
    117     def nunique(self):

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    683         )
    684
--> 685         return _read(filepath_or_buffer, kwds)
    686
    687     parser_f.__name__ = name

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    455
    456     # Create the parser.
--> 457     parser = TextFileReader(fp_or_buf, **kwds)
    458
    459     if chunksize or iterator:

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    893             self.options["has_index_names"] = kwds["has_index_names"]
    894
--> 895         self._make_engine(self.engine)
    896
    897     def close(self):

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1133     def _make_engine(self, engine="c"):
   1134         if engine == "c":
-> 1135             self._engine = CParserWrapper(self.f, **self.options)
   1136         else:
   1137             if engine == "python":

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1915         kwds["usecols"] = self.usecols
   1916
-> 1917         self._reader = parsers.TextReader(src, **kwds)
   1918         self.unnamed_cols = self._reader.unnamed_cols
   1919

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/gzip.py in __init__(self, filename, mode, compresslevel, fileobj, mtime)
    161             mode += 'b'
    162         if fileobj is None:
--> 163             fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
    164         if filename is None:
    165             filename = getattr(fileobj, 'name', '')

FileNotFoundError: [Errno 2] No such file or directory: 'glade-cmip6.csv.gz'

view this post on Zulip Anderson Banihirwe (Oct 12 2020 at 23:45):

Thank you, Danica! I now know what's going on :slight_smile:

1) The environment (cmip6-201910a) you are using is outdated, and unfortunately pip install .... doesn't have any effect toward fixing the issue

2) You actually want to use the cmip6-201910 environment instead of cmip6-201910a (notice the a at the end). The cmip6-201910 environment has an up-to-date version of intake-esm that knows how to resolve the absolute path of the glade-cmip6.csv.gz file when given the path to the JSON.
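
For older intake-esm versions that read the catalog_file entry relative to the current working directory, one possible workaround (an assumption on my part, not suggested in the thread) is to change into the catalogs directory before opening the datastore:

import os
import intake

# Older intake-esm releases passed the JSON's relative catalog_file entry
# ('glade-cmip6.csv.gz') straight to pandas, so it only resolved if the
# CSV sat in the current working directory. Changing directory first is
# a workaround sketch, not advice from the original thread.
os.chdir("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs")
catalog = intake.open_esm_datastore("glade-cmip6.json")
print(catalog.df.head())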

view this post on Zulip Danica Lombardozzi (Oct 12 2020 at 23:51):

Aha! That did the trick! Thanks for the update. I thought that cmip6-201910a was the more up-to-date version, so it's good to know that I shouldn't use it.

view this post on Zulip Anderson Banihirwe (Oct 13 2020 at 00:05):

Awesome!

I thought that cmip6-201910a was the more up-to-date version

That was the original idea; however, for administrative reasons the two environments ended up diverging over time (as far as package versions are concerned).
I happen to have admin permissions on the cmip6-201910 environment, so I try to keep it up to date. However, I don't have control over the cmip6-201910a environment.

