{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n# Working with netCDF groups\n\nThis is an example about how to work with netCDF\ngroups. Xarray does not natively read netCDF group\nfiles, but it does have the ability to read the\ndata with a few independent calls.\n\nAuthor: Ken Kehoe\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\nimport numpy as np\nimport xarray as xr\nfrom arm_test_data import DATASETS\n\nfrom act.io.arm import read_arm_netcdf\nfrom act.plotting import TimeSeriesDisplay\n\n# This data file is a bit complicated with the group organization. Each group\n# will need to be treated as a different netCDF file for reading. We can read each group\n# independently and merge into a single Dataset to use with standard Xarray or ACT methods.\n\n# Top Level:\n# data\n#\n# Groups:\n# - data\n# - light_absorption\n# - instrument\n# - particle_concentration\n# - instrument\n# - light_scattering\n# - instrument\n\nfilename = DATASETS.fetch(\n 'ESRL-GMD-AEROSOL_v1.0_HOUR_MLO_s20200101T000000Z_e20210101T000000Z_c20210214T053835Z.nc'\n)\n\n# We start by reading the location information from the top level of the netCDF file.\n# This is a standard Xarary call without a group keyword. Only the top level data is read.\n# The returned Dataset will contain all top level variables and top level global attributes.\n# We will use the direct Xarray call to read data since there is no time dimension at the top\n# level so we don't need ACT to do anything for us.\nds_location = xr.open_mfdataset(filename)\n\n# Second, we read the 'data' group. This has a time dimension so we can use ACT to manage\n# correctly reading and formatting that information. We need to specify the 'data' group\n# to be read. This will read the data group but not the sub-group.\nds_data = read_arm_netcdf(filename, group='data')\n\n# Third, we read the 'light_scattering' sub-group. We can read sub-groups\n# by using standard Linux directory path notation. Notice the light_scattering group is a\n# sub-group to 'data'.\nds_ls = xr.open_mfdataset(filename, group='data/light_scattering')\n\n# We can also read the third group 'instrument'. This only contains a small amount of information\n# about the instrument used to collect the data for 'light_scattering' and has no\n# dimensions, only scalar values.\nds = xr.open_mfdataset(filename, group='data/light_scattering/instrument')\n\n# Since the dimensionality aligns we can merge the three Datasets into a single Dataset.\nds = xr.merge([ds_data, ds_ls, ds_location])\n\n# Since the data file contains data for a full year we can subset the Dataset to be easier\n# to work with and process faster.\nds = ds.sel(time=slice('2020-01-01T00:00:00', '2020-01-04T23:59:59'))\n\n# The data is written with dimensionality in reverse order from what ACT expects. We need to\n# reverse the dimensionality order.\nds = ds.transpose()\n\n# Since we want to only plot the data from one of the cut sizes we can subset the Dataset\n# to only have data where the cut_size variable is set to a single value. This will reduce\n# the dimensionality from 3 to 2 or 2 to 1 dimensions on the variables. Take note that\n# the value of cut size is not the um value, it is the index. Since the index values\n# are 0 or 1, 0 = 1 um and 1 = 10 um. We are subsetting for less than 10 um.\nds = ds.isel(cut_size=1)\n\n# netCDF has a default _FillValue that is used to indicate when the values are missing. There\n# is a known issue with float precision that does not correctly set the _FillValue to NaN\n# when reading. We need to set the obviously incorrect values to NaN for the data variables\n# that can have this problem.\nfor var_name in ds.data_vars:\n try:\n data = ds[var_name].values\n data[ds[var_name].values >= 9e36] = np.nan\n ds[var_name].values = data\n except np._core._exceptions._UFuncNoLoopError:\n pass\n\n# This is a querk of reading the data. If we want to plot the day/night background correctly\n# we need to delete these global attribute.\ndel ds.attrs['_file_dates']\ndel ds.attrs['_file_times']\n\n# Since the data variable has a second dimension of wavelength and we want to plot each one as a\n# line plot, we will pass force_line_plot=True to force the second dimension to be removed from\n# the time series plot. We will want to set the labels to view on the plot by extracting the values\n# from the wavelenght dimension and pass into the plotting call.\nlabels = [f\"{int(wl)} {ds['wavelength'].attrs['units']}\" for wl in ds['wavelength'].values]\n\ndisplay = TimeSeriesDisplay({'ESRL ML': ds})\ndisplay.plot(\n 'scattering_coefficient', day_night_background=True, force_line_plot=True, labels=labels\n)\nplt.show()\n\n# A second option is to extract the wavelength dimension form the variable and create a new variable\n# to plot. We are selecting the wavelength by using index so a value of 1 = 550 um. We need to\n# provide a new variable name not already in use and correctly describes the data.\nds['scattering_coefficient_450'] = ds['scattering_coefficient'].isel(indexers={'wavelength': 0})\nds['scattering_coefficient_550'] = ds['scattering_coefficient'].isel(indexers={'wavelength': 1})\nds['scattering_coefficient_700'] = ds['scattering_coefficient'].isel(indexers={'wavelength': 2})\n\ndisplay = TimeSeriesDisplay({'ESRL ML': ds})\ntitle = 'ESRL ML Scatter Coefficient from 2020-01-01 to 2020-01-04'\ndisplay.plot('scattering_coefficient_450', label=labels[0], day_night_background=True)\ndisplay.plot('scattering_coefficient_550', label=labels[1])\ndisplay.plot('scattering_coefficient_700', label=labels[2], set_title=title)\nplt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.3" } }, "nbformat": 4, "nbformat_minor": 0 }