From 283126dd57313e245a53ec91c4f3a190612ebbc2 Mon Sep 17 00:00:00 2001 From: Anderson Banihirwe Date: Tue, 8 Jan 2019 11:00:22 -0700 Subject: [PATCH 1/3] Add binder configuration --- .circleci/config.yml | 23 ---------------- binder/environment-dev.yml | 31 +++++++++++++++++++++ conda/meta.yaml | 56 -------------------------------------- 3 files changed, 31 insertions(+), 79 deletions(-) create mode 100644 binder/environment-dev.yml delete mode 100644 conda/meta.yaml diff --git a/.circleci/config.yml b/.circleci/config.yml index 3ddef26..2df2dc7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,14 +5,6 @@ workflows: default: jobs: - "python-3.6" - - "deploy-conda": - requires: - - "python-3.6" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ jobs: @@ -78,18 +70,3 @@ jobs: - store_artifacts: path: docs/_build/html destination: html - - - "deploy-conda": - docker: - - image: continuumio/miniconda:latest - - steps: - # Get our data and merge with upstream - - checkout - - - run: - name: Deploy package release to Anaconda.org - command: | - conda install conda-build anaconda-client --yes --quiet - ./ci/upload-anaconda.sh diff --git a/binder/environment-dev.yml b/binder/environment-dev.yml new file mode 100644 index 0000000..7823e31 --- /dev/null +++ b/binder/environment-dev.yml @@ -0,0 +1,31 @@ +name: intake-cmip-dev +channels: + - conda-forge + - intake + - defaults +dependencies: + - python=3.6 + - numpy + - intake + - intake-xarray + - xarray + - netcdf4 + - dask + - distributed + - autopep8 + - flake8 + - black + - isort + - pytest + - coverage + - pytest-cov + - codecov + - sphinx>=1.6 + - sphinx_rtd_theme + - recommonmark + - numpydoc + - nbsphinx + - pandoc + - ipykernel + - pip: + - sphinx_copybutton diff --git a/conda/meta.yaml b/conda/meta.yaml deleted file mode 100644 index 89c6d72..0000000 --- a/conda/meta.yaml +++ /dev/null @@ -1,56 +0,0 @@ -{% set name = "intake-cmip" %} -{% set version = "2019.1.0" %} - -package: - name: {{ 
name }} - version: {{ version }} - -source: - url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz - sha256: 711ec3d30f784268065d73b95e16538f6584fa1dcb1f4fba64c1337ac61247fb - - -build: - script: "{{ PYTHON }} -m pip install . --no-deps -vv" - noarch: python - number: 0 - -requirements: - - host: - - python - - pip - - - run: - - python - - netcdf4 - - xarray - - intake - - intake-xarray - - dask - -test: - source_files: - - tests - requires: - - pytest - imports: - - intake_cmip - commands: - - pytest --verbose - -about: - - home: https://github.com/NCAR/intake-cmip - license: Apache 2.0 - license_file: LICENSE - summary: | - Intake-cmip provides a plugin for reading CMIP5, and CMIP6 data using intake. - dev_url: https://github.com/NCAR/intake-cmip - doc_url: https://intake-cmip.readthedocs.io/en/latest/ - -extra: - recipe-maintainers: - - andersy005 - - kmpaul \ No newline at end of file From b3030f5bf32df0c21fe1a3acec31d3d59d8305bb Mon Sep 17 00:00:00 2001 From: Anderson Banihirwe Date: Tue, 8 Jan 2019 11:08:43 -0700 Subject: [PATCH 2/3] Rename env file --- binder/{environment-dev.yml => environment.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename binder/{environment-dev.yml => environment.yml} (100%) diff --git a/binder/environment-dev.yml b/binder/environment.yml similarity index 100% rename from binder/environment-dev.yml rename to binder/environment.yml From eb12eba2c8d384f650a038f8223473882d87d934 Mon Sep 17 00:00:00 2001 From: Anderson Banihirwe Date: Tue, 8 Jan 2019 13:50:40 -0700 Subject: [PATCH 3/3] Add initial version of `search` --- intake_cmip/cmip5.py | 209 ++++++++++++++++++++++--------------------- tests/test_cmip5.py | 44 +++++++-- 2 files changed, 145 insertions(+), 108 deletions(-) diff --git a/intake_cmip/cmip5.py b/intake_cmip/cmip5.py index d6e6086..76c7198 100644 --- a/intake_cmip/cmip5.py +++ b/intake_cmip/cmip5.py @@ -24,17 +24,8 @@ class 
CMIP5DataSource(intake_xarray.base.DataSourceMixin): partition_access = True name = "cmip5" - def __init__( - self, - database, - model, - experiment, - frequency, - realm, - ensemble, - varname=None, - metadata=None, - ): + def __init__(self, database, metadata=None): + """ Parameters @@ -43,6 +34,53 @@ def __init__( database : string or file handle File path or object for cmip5 database. For users with access to NCAR's glade file system, this argument can be set to 'glade'. + """ + + # store important kwargs + self.database = self._read_database(database) + self.urlpath = "" + self.query = {} + self.query_results = None + self._ds = None + super(CMIP5DataSource, self).__init__(metadata=metadata) + + def _read_database(self, database): + if database == "glade": + database = glade_cmip5_db + if os.path.exists(database): + return pd.read_csv(database) + else: + raise FileNotFoundError(f"{database}") + + def _open_dataset(self): + + ens_filepaths = self._get_ens_filepaths() + + ds_list = [xr.open_mfdataset(paths) for paths in ens_filepaths.values()] + ens_list = list(ens_filepaths.keys()) + self._ds = xr.concat(ds_list, dim="ensemble") + self._ds["ensemble"] = ens_list + + def to_xarray(self, dask=True): + """Return dataset as an xarray instance""" + if dask: + return self.to_dask() + return self.read() + + def search( + self, + model=None, + experiment=None, + frequency=None, + realm=None, + ensemble=None, + varname=None, + ): + + """ + Parameters + ----------- + model : str identifies the model used (e.g. HADCM3, HADCM3-233). 
experiment : str @@ -82,92 +120,63 @@ def __init__( """ - # store important kwargs - self.database = self._read_database(database) - self.model = model - self.experiment = experiment - self.frequency = frequency - self.realm = realm - self.ensemble = ensemble - self.varname = varname - self.urlpath = "" - self._ds = None - super(CMIP5DataSource, self).__init__(metadata=metadata) - - def _read_database(self, database): - if database == "glade": - database = glade_cmip5_db - if os.path.exists(database): - return pd.read_csv(database) - else: - raise FileNotFoundError(f"{database}") - - def _open_dataset(self): - ens_filepaths = get_ens_filepaths( - self.database, - self.model, - self.experiment, - self.frequency, - self.realm, - self.ensemble, - self.varname, - ) - - ds_list = [xr.open_mfdataset(paths) for paths in ens_filepaths.values()] - ens_list = list(ens_filepaths.keys()) - self._ds = xr.concat(ds_list, dim="ensemble") - self._ds["ensemble"] = ens_list - - def to_xarray(self, dask=True): - """Return dataset as an xarray instance""" - if dask: - return self.to_dask() - return self.read() - - -def get_ens_filepaths(database, model, experiment, frequency, realm, ensemble, varname): - query = { - "model": model, - "experiment": experiment, - "frequency": frequency, - "realm": realm, - "ensemble": ensemble, - "varname": varname, - } - - condition = np.ones(len(database), dtype=bool) - - for key, val in query.items(): - if val is not None: - - condition = condition & (database[key] == val) - - database_subset = database.loc[condition] - - if database_subset.empty: - - raise ValueError( - f"No dataset found for:\n \ - \tmodel = {model} \n \ - \texperiment = {experiment} \n \ - \tfrequency = {frequency} \n \ - \trealm = {realm} \n \ - \tensemble = {ensemble} \n \ - \tvarname = {varname}" - ) - - # -- realm is optional arg so check that the same varname is not in multiple realms - realm_list = database_subset.realm.unique() - if len(realm_list) != 1: - raise ValueError( 
-            f"{varname} found in multiple realms:\n \
-            '\t{realm_list}. Please specify the realm to use"
-        )
-
-    ds_dict = OrderedDict()
-    for ens in database_subset["ensemble"].unique():
-        ens_match = database_subset["ensemble"] == ens
-        paths = database_subset.loc[ens_match]["file_fullpath"].tolist()
-        ds_dict[ens] = paths
-
-    return ds_dict
+        self.query = {
+            "model": model,
+            "experiment": experiment,
+            "frequency": frequency,
+            "realm": realm,
+            "ensemble": ensemble,
+            "varname": varname,
+        }
+        database = self.database
+        condition = np.ones(len(database), dtype=bool)
+
+        for key, val in self.query.items():
+            if val is not None:
+
+                condition = condition & (database[key] == val)
+
+        self.query_results = database.loc[condition]
+        return self
+
+    def results(self):
+        return self.query_results
+
+    def _get_ens_filepaths(self):
+        if self.query_results.empty:
+            raise ValueError(
+                f"No dataset found for:\n \
+                \tmodel = {self.query['model']}\n \
+                \texperiment = {self.query['experiment']} \n \
+                \tfrequency = {self.query['frequency']} \n \
+                \trealm = {self.query['realm']} \n \
+                \tensemble = {self.query['ensemble']} \n \
+                \tvarname = {self.query['varname']}"
+            )
+
+        models = self.query_results.model.nunique() > 1
+        experiments = self.query_results.experiment.nunique() > 1
+        frequencies = self.query_results.frequency.nunique() > 1
+
+        if models or experiments or frequencies:
+
+            raise ValueError(
+                f"Invalid results for search query = {self.query}.\n\
+                Please specify unique model, experiment, and frequency to use"
+            )
+
+        # Check that the same varname is not in multiple realms
+        realm_list = self.query_results.realm.unique()
+        if len(realm_list) != 1:
+            raise ValueError(
+                f"{self.query['varname']} found in multiple realms:\
+                \t{realm_list}. 
Please specify the realm to use" + ) + + ds_dict = OrderedDict() + for ens in self.query_results["ensemble"].unique(): + ens_match = self.query_results["ensemble"] == ens + paths = self.query_results.loc[ens_match]["file_fullpath"].tolist() + ds_dict[ens] = paths + + return ds_dict diff --git a/tests/test_cmip5.py b/tests/test_cmip5.py index 7b6f7bd..71a3198 100644 --- a/tests/test_cmip5.py +++ b/tests/test_cmip5.py @@ -69,15 +69,16 @@ def test_source(): setup() create_cmip5_database(CMIP5_TEST_DIR, DB_DIR) db_file = f"{DB_DIR}/cmip5.csv" - source = CMIP5DataSource( - database=db_file, + source = CMIP5DataSource(database=db_file) + assert isinstance(source, CMIP5DataSource) + + source.search( model="CanESM2", experiment="rcp85", frequency="mon", realm="atmos", ensemble="r2i1p1", ) - assert isinstance(source, CMIP5DataSource) ds = source.to_xarray() ds_2 = source.to_xarray(dask=False) @@ -86,15 +87,42 @@ def test_source(): teardown() -def test_glade_db(): - source = CMIP5DataSource( - database="glade", +def test_source_exception(): + setup() + create_cmip5_database(CMIP5_TEST_DIR, DB_DIR) + db_file = f"{DB_DIR}/cmip5.csv" + + with pytest.raises(FileNotFoundError): + source = CMIP5DataSource(database="intake.csv") + + with pytest.raises(ValueError): + source = CMIP5DataSource(database=db_file) + + source.search() + source.to_xarray() + + teardown() + + +def test_search(): + setup() + create_cmip5_database(CMIP5_TEST_DIR, DB_DIR) + db_file = f"{DB_DIR}/cmip5.csv" + source = CMIP5DataSource(database=db_file) + + source.search() + results = source.search( model="CanESM2", experiment="rcp85", frequency="mon", realm="atmos", ensemble="r2i1p1", - varname="ua", - ) + ).results() + assert isinstance(results, pd.DataFrame) + teardown() + + +def test_glade_db(): + source = CMIP5DataSource(database="glade") assert isinstance(source, CMIP5DataSource)