diff --git a/virtualizarr-kerchunk-cubed.ipynb b/virtualizarr-kerchunk-cubed.ipynb new file mode 100644 index 000000000..b8e89fdc4 --- /dev/null +++ b/virtualizarr-kerchunk-cubed.ipynb @@ -0,0 +1,1739 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bbd04165-9cf2-4bf5-a7fd-806cc500aec6", + "metadata": {}, + "source": [ + "# VirtualiZarr + Kerchunk + Cubed" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "04157803-9ddb-46fa-9bdb-71701bda091f", + "metadata": {}, + "outputs": [], + "source": [ + "import cubed\n", + "import xarray as xr\n", + "import virtualizarr" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "64707039-e349-4852-a420-bd7598793033", + "metadata": {}, + "outputs": [], + "source": [ + "! rm -rf combined* rechunked" + ] + }, + { + "cell_type": "markdown", + "id": "d96b430e-1134-4756-904d-d523a8d732e4", + "metadata": {}, + "source": [ + "### Open tutorial data" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "7cfa2d26-c00f-446d-9c13-82a72ce9cd68", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 31MB\n",
+       "Dimensions:  (lat: 25, time: 2920, lon: 53)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0\n",
+       "  * lon      (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0\n",
+       "  * time     (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00\n",
+       "Data variables:\n",
+       "    air      (time, lat, lon) float64 31MB ...\n",
+       "Attributes:\n",
+       "    Conventions:  COARDS\n",
+       "    title:        4x daily NMC reanalysis (1948)\n",
+       "    description:  Data is from NMC initialized reanalysis\\n(4x/day).  These a...\n",
+       "    platform:     Model\n",
+       "    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...
" + ], + "text/plain": [ + " Size: 31MB\n", + "Dimensions: (lat: 25, time: 2920, lon: 53)\n", + "Coordinates:\n", + " * lat (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0\n", + " * lon (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0\n", + " * time (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00\n", + "Data variables:\n", + " air (time, lat, lon) float64 31MB ...\n", + "Attributes:\n", + " Conventions: COARDS\n", + " title: 4x daily NMC reanalysis (1948)\n", + " description: Data is from NMC initialized reanalysis\\n(4x/day). These a...\n", + " platform: Model\n", + " references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly..." + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = xr.tutorial.open_dataset('air_temperature')\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "id": "8ea042dd-f5bf-41e4-9fe6-0671507cf562", + "metadata": {}, + "source": [ + "### Split into two NetCDF files" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "04883065-a97b-4c84-86d0-900f8802977f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/8t/gslp67x10vgfjgv68qj92n0m0000gn/T/ipykernel_867/2127577491.py:3: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", + " ds1.to_netcdf('air1.nc')\n", + "/var/folders/8t/gslp67x10vgfjgv68qj92n0m0000gn/T/ipykernel_867/2127577491.py:4: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", + " ds2.to_netcdf('air2.nc')\n" + ] + } + ], + "source": [ + "ds1 = ds.isel(time=slice(None, 1460))\n", + "ds2 = ds.isel(time=slice(1460, None))\n", + "ds1.to_netcdf('air1.nc')\n", + "ds2.to_netcdf('air2.nc')" + ] + }, + { + "cell_type": "markdown", + "id": "3e7a9b7d-e7df-46e3-9c69-1d4656911997", + 
"metadata": {}, + "source": [ + "### Combine two NetCDFs using VirtualiZarr" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c6f078d6-6e3a-4890-aa9e-731bbb3d19dd", + "metadata": {}, + "outputs": [], + "source": [ + "vds1 = virtualizarr.open_virtual_dataset('air1.nc', indexes={})\n", + "vds2 = virtualizarr.open_virtual_dataset('air2.nc', indexes={})\n", + "combined_vds = xr.concat([vds1, vds2], dim='time', coords='minimal', compat='override')" + ] + }, + { + "cell_type": "markdown", + "id": "42df4df6-7e2c-45b6-915f-d429f630466d", + "metadata": {}, + "source": [ + "### Write references to Kerchunk" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f90b51b9-cf31-4f8d-ba63-ea07cfc0cedf", + "metadata": {}, + "outputs": [], + "source": [ + "combined_vds.virtualize.to_kerchunk('combined.json', format='json')" + ] + }, + { + "cell_type": "markdown", + "id": "01ac6ce3-e782-4835-976a-37e0c946404a", + "metadata": {}, + "source": [ + "### Open in Xarray using Cubed arrays" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3fea37e3-2c1e-4b19-92f5-6ced3a2da189", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 31MB\n",
+       "Dimensions:  (time: 2920, lat: 25, lon: 53)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0\n",
+       "  * lon      (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0\n",
+       "  * time     (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00\n",
+       "Data variables:\n",
+       "    air      (time, lat, lon) float64 31MB cubed.Array<chunksize=(1460, 25, 53)>\n",
+       "Attributes:\n",
+       "    Conventions:  COARDS\n",
+       "    description:  Data is from NMC initialized reanalysis\\n(4x/day).  These a...\n",
+       "    platform:     Model\n",
+       "    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...\n",
+       "    title:        4x daily NMC reanalysis (1948)
" + ], + "text/plain": [ + " Size: 31MB\n", + "Dimensions: (time: 2920, lat: 25, lon: 53)\n", + "Coordinates:\n", + " * lat (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0\n", + " * lon (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0\n", + " * time (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00\n", + "Data variables:\n", + " air (time, lat, lon) float64 31MB cubed.Array\n", + "Attributes:\n", + " Conventions: COARDS\n", + " description: Data is from NMC initialized reanalysis\\n(4x/day). These a...\n", + " platform: Model\n", + " references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...\n", + " title: 4x daily NMC reanalysis (1948)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Bind to a new name so the tutorial dataset held in `ds` is not overwritten.\n", + "spec = cubed.Spec(allowed_mem=\"2GB\", executor_name=\"single-threaded\")\n", + "combined_ds = xr.open_dataset(\n", + " 'combined.json',\n", + " engine=\"kerchunk\",\n", + " chunked_array_type='cubed',\n", + " from_array_kwargs={'spec': spec},\n", + " chunks={}\n", + ")\n", + "combined_ds" + ] + }, + { + "cell_type": "markdown", + "id": "05f33cd8-5caf-43b3-8258-ee9da0ac1afc", + "metadata": {}, + "source": [ + "### Rechunk" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f598d59a-d11a-4e1f-a0b7-6a01913540c3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
&lt;xarray.Dataset&gt; Size: 31MB\n",
+       "Dimensions:  (time: 2920, lat: 25, lon: 53)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0\n",
+       "  * lon      (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0\n",
+       "  * time     (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00\n",
+       "Data variables:\n",
+       "    air      (time, lat, lon) float64 31MB cubed.Array&lt;chunksize=(2920, 5, 5)&gt;\n",
+       "Attributes:\n",
+       "    Conventions:  COARDS\n",
+       "    description:  Data is from NMC initialized reanalysis\\n(4x/day).  These a...\n",
+       "    platform:     Model\n",
+       "    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...\n",
+       "    title:        4x daily NMC reanalysis (1948)
" + ], + "text/plain": [ + " Size: 31MB\n", + "Dimensions: (time: 2920, lat: 25, lon: 53)\n", + "Coordinates:\n", + " * lat (lat) float32 100B 75.0 72.5 70.0 67.5 65.0 ... 22.5 20.0 17.5 15.0\n", + " * lon (lon) float32 212B 200.0 202.5 205.0 207.5 ... 325.0 327.5 330.0\n", + " * time (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00\n", + "Data variables:\n", + " air (time, lat, lon) float64 31MB cubed.Array\n", + "Attributes:\n", + " Conventions: COARDS\n", + " description: Data is from NMC initialized reanalysis\\n(4x/day). These a...\n", + " platform: Model\n", + " references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...\n", + " title: 4x daily NMC reanalysis (1948)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Bind to a new name: `ds2` already holds the second half of the split saved to air2.nc.\n", + "# TODO: why do we need to pass in chunked_array_type and from_array_kwargs again?\n", + "rechunked_ds = combined_ds.chunk({\"time\": 2920, \"lat\": 5, \"lon\": 5}, chunked_array_type=\"cubed\", from_array_kwargs={'spec': spec})\n", + "rechunked_ds" + ] + }, + { + "cell_type": "markdown", + "id": "a5943126-e7cf-4fc2-86c9-62f5768b4afb", + "metadata": {}, + "source": [ + "### Save rechunked arrays as Zarr" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "7db3927d-95be-4b5a-bf28-3cc9303f7ab0", + "metadata": {}, + "outputs": [], + "source": [ + "cubed.to_zarr(rechunked_ds[\"air\"].data, \"rechunked\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bfd1ae02-8a86-4a9a-ab79-886916bb6e35", + "metadata": {}, + "outputs": [], + "source": [ + "#combined_ds.to_zarr(\"rechunked\", safe_chunks=False, chunkmanager_store_kwargs={'spec': spec})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af77d2aa-1dcc-4048-9518-ce5ef907e005", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": 
{ + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}