Stream: python-questions

Topic: intake-esm open dataset error


view this post on Zulip Brian Bonnlander (Mar 16 2020 at 17:42):

I'm getting this stack trace on Casper using NCAR_JobQueue. It looks like opening an intake-esm catalog is failing.

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 17:42):

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/core.py in to_dataset_dict(self, zarr_kwargs, cdf_kwargs, preprocess, aggregate, storage_options, progressbar)
    390             self.progressbar = progressbar
    391
--> 392         return self._open_dataset()
    393
    394     def _open_dataset(self):

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/core.py in _open_dataset(self)
    501                 progress(futures)
    502
--> 503             dsets = client.gather(futures)
    504
    505         self._ds = {group_id: ds for (group_id, ds) in dsets}

~/miniconda3/envs/analysis/lib/python3.7/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
   1891                 direct=direct,
   1892                 local_worker=local_worker,
-> 1893                 asynchronous=asynchronous,
   1894             )
   1895

~/miniconda3/envs/analysis/lib/python3.7/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    778         else:
    779             return sync(
--> 780                 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
    781             )
    782

~/miniconda3/envs/analysis/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    346     if error[0]:
    347         typ, exc, tb = error[0]
--> 348         raise exc.with_traceback(tb)
    349     else:
    350         return result[0]

~/miniconda3/envs/analysis/lib/python3.7/site-packages/distributed/utils.py in f()
    330             if callback_timeout is not None:
    331                 future = asyncio.wait_for(future, callback_timeout)
--> 332             result[0] = yield future
    333         except Exception as exc:
    334             error[0] = sys.exc_info()

~/miniconda3/envs/analysis/lib/python3.7/site-packages/tornado/gen.py in run(self)
    733
    734                     try:
--> 735                         value = future.result()
    736                     except Exception:
    737                         exc_info = sys.exc_info()

~/miniconda3/envs/analysis/lib/python3.7/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
   1750                             exc = CancelledError(key)
   1751                         else:
-> 1752                             raise exception.with_traceback(traceback)
   1753                         raise exc
   1754                     if errors == "skip":

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/core.py in _load_group_dataset()
    582         zarr_kwargs,
    583         cdf_kwargs,
--> 584         preprocess,
    585     )
    586

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in _aggregate()
    162             return ds
    163
--> 164     return apply_aggregation(v)
    165
    166

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in apply_aggregation()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in <listcomp>()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in apply_aggregation()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in <listcomp>()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in apply_aggregation()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in <listcomp>()
    109             dsets = [
    110                 apply_aggregation(value, agg_column, key=key, level=level + 1)
--> 111                 for key, value in v.items()
    112             ]
    113             keys = list(v.keys())

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in apply_aggregation()
     91                 zarr_kwargs=zarr_kwargs,
     92                 cdf_kwargs=cdf_kwargs,
---> 93                 preprocess=preprocess,
     94             )
     95             ds.attrs['intake_esm_varname'] = varname

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in _open_asset()
    194         except Exception as e:
    195             logger.error(f'Failed to open netCDF/HDF dataset.')
--> 196             raise e
    197
    198     if preprocess is None:

~/miniconda3/envs/analysis/lib/python3.7/site-packages/intake_esm/merge_util.py in _open_asset()
    191         logger.info(f'Opening netCDF/HDF dataset: {root} - protocol: {protocol}')
    192         try:
--> 193             ds = xr.open_dataset(path, **cdf_kwargs)
    194         except Exception as e:
    195             logger.error(f'Failed to open netCDF/HDF dataset.')

~/miniconda3/envs/analysis/lib/python3.7/site-packages/xarray/backends/api.py in open_dataset()
    527                 "with engine='scipy' or 'h5netcdf'"
    528             )
--> 529         engine = _get_engine_from_magic_number(filename_or_obj)
    530         if engine == "scipy":
    531             store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)

~/miniconda3/envs/analysis/lib/python3.7/site-packages/xarray/backends/api.py in _get_engine_from_magic_number()
    115         magic_number = filename_or_obj[:8]
    116     else:
--> 117         if filename_or_obj.tell() != 0:
    118             raise ValueError(
    119                 "file-like object read/write pointer not at zero "

AttributeError: 'FSMap' object has no attribute 'tell'

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 17:44):

@Brian Bonnlander, that was a bug (introduced by me a few weeks ago :slight_smile:). I fixed it earlier this weekend...

python -m pip install git+https://github.com/NCAR/intake-esm.git

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 17:45):

Update to the HEAD of intake-esm's master branch
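
For context, a minimal sketch of the kind of workflow that was hitting this bug; the catalog path is the GLADE one discussed later in this thread, and the search terms are illustrative, not from the original post:

import intake

# Open an intake-esm catalog (path as used later in this thread)
catalog = intake.open_esm_datastore(
    "/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json"
)

# Narrow the catalog, then load the matching assets into xarray datasets.
# to_dataset_dict() is the call at the top of the traceback above; the
# query values here are illustrative.
subset = catalog.search(experiment_id="historical", variable_id="tas")
dsets = subset.to_dataset_dict(cdf_kwargs={"chunks": {"time": 36}})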

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 17:47):

Well, thanks for fixing it! :)
Trying to write some LENS ocean data to Zarr today...

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 17:48):

@Anderson Banihirwe Do I have to restart my kernel after the pip install? I am still getting the same error using the same kernel.

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 17:49):

Yes, you will need to restart your kernel

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 17:49):

unless you are using the autoreload magic

%load_ext autoreload
%autoreload 2

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 17:58):

Thanks for your help so far today. Is it bad when the Dask status tabs stop working in a JupyterHub session? In the middle of starting the first Zarr store save, the tabs went blank.

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 17:59):

You are welcome!

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:00):

I wouldn't worry about the dashboard freezing..

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:01):

It tends to happen when a huge amount of information is being sent to the bokeh server on which the dashboard is running

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:02):

If you wait a little bit, things may normalize and the dashboard may go back to working as usual

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:02):

OK, but I am also not seeing my Zarr folder growing in size. After about 5 minutes, it is stuck at around 250K bytes. Can it sometimes take a while for the in-memory work to finish before starting a write?
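
When writing with a distributed client, one way to tell "still computing" apart from "stalled" is to build the write lazily and watch its progress explicitly. A minimal sketch, assuming ds is a chunked xarray Dataset and client is a connected distributed client (names are illustrative):

from dask.distributed import progress

# compute=False builds the write graph without executing it and
# returns a dask.delayed object instead of writing immediately
delayed_write = ds.to_zarr("ocean_data.zarr", mode="w", compute=False)

# Submit to the cluster and show a progress bar; bytes only start
# landing in the store as individual chunk-write tasks complete.
future = client.compute(delayed_write)
progress(future)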

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:03):

It sounds like something went wrong

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:03):

I would restart everything

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:03):

tornado.websocket.WebSocketClosedError

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:03):

Restart the entire hub?

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:04):

tornado.websocket.WebSocketClosedError

That's the bokeh server

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:04):

Restart the entire hub?

Just the kernel

view this post on Zulip Deepak Cherian (Mar 16 2020 at 18:04):

How much data are you writing? What is len(dataset.__dask_graph__())? That's the number of tasks; if it's more than 2 million or so, it's too big
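
A self-contained sketch of that check, with a synthetic dataset standing in for the real one:

import numpy as np
import xarray as xr

# Synthetic stand-in for a large model dataset (shapes are illustrative)
ds = xr.Dataset(
    {"TEMP": (("time", "lat", "lon"), np.zeros((1200, 192, 288)))}
).chunk({"time": 12})

# Each chunk contributes tasks to the Dask graph; by the rule of thumb
# above, rechunk to fewer, larger chunks if this reaches the millions.
print(len(ds.__dask_graph__()))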

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:05):

So, I go back to the tab to kill everything, and now the Dask tabs are working again!

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:07):

But the number of Dask workers is staying at 2. Everything looks really slow. Restart?

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:07):

As Deepak pointed out, I would also check the number of tasks before writing to zarr

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:08):

Aha, I have to interrupt the Zarr write to find this out, right? Change the code to print this?

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:09):

But the number of Dask workers is staying at 2. Everything looks really slow. Restart?

This is normal. You are competing for resources with other users... If the queue is full, dask workers have to wait to get in
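
The slow ramp-up reflects how jobqueue-style clusters work: scale() only requests batch jobs, and workers appear as the scheduler grants them. A minimal sketch, assuming the ncar-jobqueue package mentioned at the top of this thread:

from dask.distributed import Client
from ncar_jobqueue import NCARCluster  # assumes the ncar-jobqueue package

cluster = NCARCluster()   # queue/project/walltime come from its config
cluster.scale(jobs=10)    # requests 10 worker jobs from the batch system
client = Client(cluster)

# The dashboard's worker count grows only as the queue starts the jobs
print(client.dashboard_link)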

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:10):

OK, number of workers just jumped to around 10.

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:10):

Things should scale as you get more workers

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:11):

Do you have the dashboard open?

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:11):

Yes, dashboard is open

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:12):

If so, look at the dashboard's /status page or the progress frame; you should be able to see the number of tasks there

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:13):

Dask progress tab says total: 27,621. Is this the tasks?

view this post on Zulip Anderson Banihirwe (Mar 16 2020 at 18:13):

Yes

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 18:13):

OK, it looks like progress is being made.

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 19:01):

I can't remember how to authenticate with Globus File Transfer. Did we use a special account, or our own Google identity, or something else?

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 19:02):

Ah, I meant to make that a private message...sorry.

view this post on Zulip Kevin Paul (Mar 16 2020 at 19:19):

You can delete messages. FYI.

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 19:21):

I expected to find a little trash can icon in the upper right menu, but it's not there.

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 19:22):

There is a 10-minute time limit to delete, by default.

view this post on Zulip Brian Bonnlander (Mar 16 2020 at 19:23):

I think an "organization admin" is needed to change that.

view this post on Zulip Danica Lombardozzi (Oct 12 2020 at 21:10):

I've been using the following code successfully for several months:
catalog = intake.open_esm_datastore("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json")

I am now getting an error message FileNotFoundError: [Errno 2] No such file or directory: 'glade-cmip6.csv.gz'

I've checked that the file exists in the directory, and I also updated as suggested above: pip install git+https://github.com/NCAR/intake-esm.git and restarted the kernel. This fix does not work, and I get the same error message. Did something else change or get updated that I'm not aware of?

view this post on Zulip Anderson Banihirwe (Oct 12 2020 at 23:25):

@Danica Lombardozzi ,

I am unable to reproduce the issue:

In [1]: import intake

In [2]: catalog = intake.open_esm_datastore("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json")

In [3]: catalog.df.head()
Out[3]:
  activity_id institution_id  ...     time_range                                               path
0  AerChemMIP            BCC  ...  201501-205512  /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
1  AerChemMIP            BCC  ...  201501-205512  /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
2  AerChemMIP            BCC  ...  201501-205512  /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
3  AerChemMIP            BCC  ...  201501-205512  /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
4  AerChemMIP            BCC  ...  201501-205512  /glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...

and I can confirm that the necessary files are available:

abanihi at casper26 in /glade/collections/cmip/catalog/intake-esm-datastore/catalogs on master [!]
$ ls glade-cmip*
glade-cmip5.csv.gz  glade-cmip5.json  glade-cmip6.csv.gz  glade-cmip6.json

view this post on Zulip Anderson Banihirwe (Oct 12 2020 at 23:27):

When you get a chance, could you post here the full traceback of the error you are getting?

view this post on Zulip Danica Lombardozzi (Oct 12 2020 at 23:34):

Really? Well, I wonder what's going on. I'm having the same issue in two of my scripts. I've been using the CMIP6 2019.10a kernel. Below is more information on the problem I am encountering.

I import several packages:

import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import esmlab

#Tried the install below; unsuccessful at fixing the intake error
#pip install git+https://github.com/NCAR/intake-esm.git

import intake

Then I attempt to import the data:

catalog = intake.open_esm_datastore("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json")
catalog.df.head()

Which results in the error message below:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-3-1231c631f29f> in <module>
----> 1 catalog = intake.open_esm_datastore("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cmip6.json")
      2 catalog.df.head()

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/intake_esm/core.py in __init__(self, esmcol_path, **kwargs)
     68         self.esmcol_path = esmcol_path
     69         self._col_data = _fetch_and_parse_file(esmcol_path)
---> 70         self.df = self._fetch_catalog()
     71         self._entries = {}
     72         self.urlpath = ''

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/intake_esm/core.py in _fetch_catalog(self)
    113         """Get the catalog file and cache it.
    114         """
--> 115         return pd.read_csv(self._col_data['catalog_file'])
    116
    117     def nunique(self):

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    683         )
    684
--> 685         return _read(filepath_or_buffer, kwds)
    686
    687     parser_f.__name__ = name

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    455
    456     # Create the parser.
--> 457     parser = TextFileReader(fp_or_buf, **kwds)
    458
    459     if chunksize or iterator:

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    893             self.options["has_index_names"] = kwds["has_index_names"]
    894
--> 895         self._make_engine(self.engine)
    896
    897     def close(self):

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1133     def _make_engine(self, engine="c"):
   1134         if engine == "c":
-> 1135             self._engine = CParserWrapper(self.f, **self.options)
   1136         else:
   1137             if engine == "python":

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1915         kwds["usecols"] = self.usecols
   1916
-> 1917         self._reader = parsers.TextReader(src, **kwds)
   1918         self.unnamed_cols = self._reader.unnamed_cols
   1919

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

/ncar/usr/jupyterhub/envs/cmip6-201910a/lib/python3.7/gzip.py in __init__(self, filename, mode, compresslevel, fileobj, mtime)
    161             mode += 'b'
    162         if fileobj is None:
--> 163             fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
    164         if filename is None:
    165             filename = getattr(fileobj, 'name', '')

FileNotFoundError: [Errno 2] No such file or directory: 'glade-cmip6.csv.gz'

view this post on Zulip Anderson Banihirwe (Oct 12 2020 at 23:45):

Thank you, Danica! I now know what's going on :slight_smile:

1) The environment (cmip6-201910a) you are using is outdated, and unfortunately pip install .... doesn't have any effect toward fixing the issue

2) You actually want to use the cmip6-201910 environment instead of cmip6-201910a (notice the a at the end). The cmip6-201910 environment has an up-to-date version of intake-esm that knows how to resolve the absolute path of the glade-cmip6.csv.gz file when given the path to the JSON.
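
For older intake-esm versions that read the catalog_file entry relative to the current working directory, one possible workaround (an assumption on my part, not suggested in the thread) is to change into the catalogs directory before opening the datastore:

import os
import intake

# Older intake-esm releases passed the JSON's relative catalog_file entry
# ('glade-cmip6.csv.gz') straight to pandas, so it only resolved if the
# CSV sat in the current working directory. Changing directory first is
# a workaround sketch, not advice from the original thread.
os.chdir("/glade/collections/cmip/catalog/intake-esm-datastore/catalogs")
catalog = intake.open_esm_datastore("glade-cmip6.json")
print(catalog.df.head())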

view this post on Zulip Danica Lombardozzi (Oct 12 2020 at 23:51):

Aha! That did the trick! Thanks for the update. I thought that cmip6-201910a was the more up-to-date version, so it's good to know that I shouldn't use it.

view this post on Zulip Anderson Banihirwe (Oct 13 2020 at 00:05):

Awesome!

I thought that cmip6-201910a was the more up-to-date version

That was the original idea; however, for administrative reasons the two environments ended up diverging over time (as far as package versions are concerned).
I happen to have admin permissions on the cmip6-201910 environment, so I try to keep it up to date. However, I don't have control over the cmip6-201910a environment.

