|
| 1 | +from pathlib import Path |
| 2 | + |
| 3 | +import cf_units |
| 4 | +import iris |
| 5 | +import yaml |
| 6 | + |
# Directory containing this file; data and config paths below are
# resolved relative to it.
base_dir = Path(__file__).parent

# Module-level progress flag; flipped to True when run as a script.
VERBOSE = False

# Load the dataset configuration that ships next to this module.
with open(base_dir / 'datasets.yml', 'r') as f:
    config = yaml.safe_load(f)

# The config lists datasets with dot-separated components; convert them
# to slash-separated relative-path strings ('a.b.c' -> 'a/b/c') so they
# can be compared against directory paths in filter_ignored_datasets.
ignore_list = [fn.replace('.', '/') for fn in config['ignore']]
| 16 | + |
def strip_attributes(cube: 'iris.Cube') -> None:
    """Remove attributes in-place that cause issues with merging and
    concatenation.

    Parameters
    ----------
    cube : iris.Cube
        Cube whose ``attributes`` mapping is modified in-place.
    """
    # These attributes typically differ between the files of one dataset,
    # which would prevent iris from concatenating the resulting cubes.
    for attr in ('creation_date', 'tracking_id', 'history'):
        # pop() with a default removes the key if present and is a no-op
        # otherwise — no separate membership test needed.
        cube.attributes.pop(attr, None)
| 23 | + |
| 24 | + |
def simplify_time(cube: 'iris.Cube') -> None:
    """Simplifies the time coordinate in-place.

    The time coordinate is converted to a common reference epoch
    (days since 1850-01-01) while keeping the coordinate's own calendar.
    """
    time_coord = cube.coord('time')
    target_units = cf_units.Unit(
        'days since 1850-1-1 00:00:00',
        calendar=time_coord.units.calendar,
    )
    time_coord.convert_units(target_units)
| 31 | + |
| 32 | + |
def load_cubes_from_input_dirs(input_dirs: list) -> 'Iterator[iris.Cube]':
    """Generator that loads all *.nc files from each input dir into a cube.

    Parameters
    ----------
    input_dirs : list of pathlib.Path
        Directories to scan; they are processed in sorted order.

    Yields
    ------
    iris.Cube
        One cube per input directory, concatenated from all *.nc files
        found in that directory.
    """
    for i, input_dir in enumerate(sorted(input_dirs)):
        if VERBOSE:
            print(f'Loading #{i:02d}:', input_dir)

        files = input_dir.glob('*.nc')
        cubes = iris.load(str(file) for file in files)
        for cube in cubes:
            # Normalise per-file metadata and time units so the cubes
            # can be concatenated along the time dimension.
            strip_attributes(cube)
            simplify_time(cube)

        # Combine into a single cube; concatenate_cube errors out if the
        # loaded cubes cannot be merged into exactly one.
        cube = cubes.concatenate_cube()

        if VERBOSE:
            print(' ', cube.shape, cube.coord('time').units.calendar)

        yield cube
| 51 | + |
| 52 | + |
def filter_ignored_datasets(dirs, root):
    """Yield every directory from `dirs` whose path relative to `root`
    is not listed in the module-level ignore list."""
    for candidate in dirs:
        rel_path = str(candidate.relative_to(root))
        if rel_path in ignore_list:
            if VERBOSE:
                print('Ignored:', rel_path)
            continue
        yield candidate
| 60 | + |
| 61 | + |
def load_timeseries_cubes(mip_table: str = 'Amon') -> list:
    """Returns a list of iris cubes with timeseries data.

    The data are: ta / Amon / historical / r1i1p1f1, any grid, 1950 - onwards.
    All dimensions were reduced to a few steps except for the time dimension.

    Parameters
    ----------
    mip_table: str
        select monthly (`Amon`) or daily (`day`) data.

    Returns
    -------
    list of iris.cube
    """
    timeseries_dir = base_dir / 'data' / 'timeseries'

    # Collect the unique parent directories of every matching .nc file.
    nc_paths = timeseries_dir.glob(f'**/{mip_table}/**/*.nc')
    candidate_dirs = list({nc_path.parent for nc_path in nc_paths})

    # Drop datasets listed in the module-level ignore list.
    kept_dirs = list(filter_ignored_datasets(candidate_dirs, timeseries_dir))

    return list(load_cubes_from_input_dirs(kept_dirs))
| 88 | + |
| 89 | + |
if __name__ == '__main__':
    # Running as a script: enable progress output in the helpers above.
    VERBOSE = True

    # Load (and regrid) the timeseries data for both supported MIP tables.
    for mip_table in (
        'Amon',
        'day',
    ):
        print()
        print(f'Loading `{mip_table}`')
        ts = load_timeseries_cubes(mip_table)

        # Regrid every cube onto the grid of the first one.
        # NOTE(review): the return value of regrid() is discarded, so
        # this loop only exercises the regridding code path rather than
        # keeping the regridded data — confirm that is intentional.
        first_cube = ts[0]
        for i, cube in enumerate(ts):
            print(i)
            cube.regrid(grid=first_cube, scheme=iris.analysis.Linear())
0 commit comments