Source code for act.io.noaagml

"""
Modules for reading in NOAA GML data

"""
import re
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr

from .text import read_csv


def read_gml(filename, datatype=None, remove_time_vars=True, convert_missing=True, **kwargs):
    """
    Function to call or guess which NOAA GML data reading routine to use. It
    tries to guess the correct reading function to call based on the filename.
    It mostly works, but you may want to specify the datatype for best results.

    Parameters
    ----------
    filename : str or pathlib.Path
        Data file full path name. In theory it should work with a list of
        filenames, but it is not working as well with that as expected.
    datatype : str
        Data file type that bypasses the guessing from the filename format and
        goes directly to the reading routine. Options include
        [MET, RADIATION, OZONE, CO2, HALO].
    remove_time_vars : bool
        Some variables are converted into coordinate variables in the Xarray
        Dataset and are not needed after conversion. This will remove those
        variables.
    convert_missing : bool
        Convert missing value indicators in the CSV to NaN in the Xarray
        Dataset.
    **kwargs : keywords
        Keywords to pass through to the instrument specific reading routine.

    Returns
    -------
    ds : xarray.Dataset
        Standard ARM Xarray dataset with the data cleaned up to have units,
        long_name, correct type and some other stuff.

    """
    if datatype is not None:
        if datatype.upper() == 'MET':
            return read_gml_met(filename, convert_missing=convert_missing, **kwargs)
        elif datatype.upper() == 'RADIATION':
            return read_gml_radiation(
                filename,
                remove_time_vars=remove_time_vars,
                convert_missing=convert_missing,
                **kwargs,
            )
        elif datatype.upper() == 'OZONE':
            return read_gml_ozone(filename, **kwargs)
        elif datatype.upper() == 'CO2':
            return read_gml_co2(filename, convert_missing=convert_missing, **kwargs)
        elif datatype.upper() == 'HALO':
            return read_gml_halo(filename, **kwargs)
        else:
            raise ValueError('datatype is unknown')

    else:
        test_filename = filename
        if isinstance(test_filename, (list, tuple)):
            test_filename = filename[0]

        test_filename = str(Path(test_filename).name)

        if test_filename.startswith('met_') and test_filename.endswith('.txt'):
            return read_gml_met(filename, convert_missing=convert_missing, **kwargs)

        if test_filename.startswith('co2_') and test_filename.endswith('.txt'):
            return read_gml_co2(filename, convert_missing=convert_missing, **kwargs)

        result = re.match(r'([a-z]{3})([\d]{5})\.dat', test_filename)
        if result is not None:
            return read_gml_radiation(
                filename,
                remove_time_vars=remove_time_vars,
                convert_missing=convert_missing,
                **kwargs,
            )

        ozone_pattern = [
            r'[a-z]{3}_[\d]{4}_[\d]{2}_hour\.dat',
            r'[a-z]{3}_[\d]{2}_[\d]{4}_hour\.dat',
            r'[a-z]{3}_[\d]{4}_all_minute\.dat',
            r'[a-z]{3}_[\d]{2}_[\d]{4}_5minute\.dat',
            r'[a-z]{3}_[\d]{2}_[\d]{4}_min\.dat',
            r'[a-z]{3}_o3_6m_hour_[\d]{2}_[\d]{4}\.dat',
            r'[a-z]{3}_ozone_houry__[\d]{4}',
        ]
        for pattern in ozone_pattern:
            result = re.match(pattern, test_filename)
            if result is not None:
                return read_gml_ozone(filename, **kwargs)

        # These patterns match halocarbon files, not ozone files.
        halo_pattern = [
            r'[a-z]{3}_CCl4_Day\.dat',
            r'[a-z]{3}_CCl4_All\.dat',
            r'[a-z]{3}_CCl4_MM\.dat',
            r'[a-z]{3}_MC_MM\.dat',
        ]
        for pattern in halo_pattern:
            result = re.match(pattern, test_filename)
            if result is not None:
                return read_gml_halo(filename, **kwargs)
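
# Example usage (an illustrative sketch; the file names below are hypothetical
# but follow the naming conventions the guessing logic above keys on):
#
#     ds = read_gml('met_brw_insitu_1_obop_hour_2020.txt')  # dispatches to read_gml_met
#     ds = read_gml('some_odd_name.txt', datatype='MET')    # bypasses the guessing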
def read_gml_halo(filename, **kwargs):
    """
    Function to read Halocarbon data from NOAA GML.

    Parameters
    ----------
    filename : str or pathlib.Path
        Data file full path name.
    **kwargs : keywords
        Keywords to pass through to ACT read_csv() routine.

    Returns
    -------
    ds : xarray.Dataset
        Standard ARM Xarray dataset with the data cleaned up to have units,
        long_name, correct type and some other stuff.

    """
    ds = None
    if filename is None:
        return ds

    variables = {
        'CCl4catsBRWm': {
            'long_name': 'Carbon Tetrachloride (CCl4) daily median',
            'units': 'ppt',
            '_FillValue': np.nan,
            '__type': np.float32,
            '__rename': 'CCl4',
        },
        'CCl4catsBRWmsd': {
            'long_name': 'Carbon Tetrachloride (CCl4) standard deviation',
            'units': 'ppt',
            '_FillValue': np.nan,
            '__type': np.float32,
            '__rename': 'CCl4_std_dev',
        },
        'CCl4catsBRWsd': {
            'long_name': 'Carbon Tetrachloride (CCl4) standard deviation',
            'units': 'ppt',
            '_FillValue': np.nan,
            '__type': np.float32,
            '__rename': 'CCl4_std_dev',
        },
        'CCl4catsBRWn': {
            'long_name': 'Number of samples',
            'units': 'count',
            '__type': np.int16,
            '__rename': 'number_of_samples',
        },
        'CCl4catsBRWunc': {
            'long_name': 'Carbon Tetrachloride (CCl4) uncertainty',
            'units': 'ppt',
            '_FillValue': np.nan,
            '__type': np.float32,
            '__rename': 'CCl4_uncertainty',
        },
        'MCcatsBRWm': {
            'long_name': 'Methyl Chloroform (CH3CCl3)',
            'units': 'ppt',
            '_FillValue': np.nan,
            '__type': np.float32,
            '__rename': 'methyl_chloroform',
        },
        'MCcatsBRWunc': {
            'long_name': 'Methyl Chloroform (CH3CCl3) uncertainty',
            'units': 'ppt',
            '_FillValue': np.nan,
            '__type': np.float32,
            '__rename': 'methyl_chloroform_uncertainty',
        },
        'MCcatsBRWsd': {
            'long_name': 'Methyl Chloroform (CH3CCl3) standard deviation',
            'units': 'ppt',
            '_FillValue': np.nan,
            '__type': np.float32,
            '__rename': 'methyl_chloroform_std_dev',
        },
        'MCcatsBRWmsd': {
            'long_name': 'Methyl Chloroform (CH3CCl3) standard deviation',
            'units': 'ppt',
            '_FillValue': np.nan,
            '__type': np.float32,
            '__rename': 'methyl_chloroform_std_dev',
        },
        'MCcatsBRWn': {
            'long_name': 'Number of samples',
            'units': 'count',
            '__type': np.int16,
            '__rename': 'number_of_samples',
        },
        'MCritsBRWm': {
            'long_name': 'Methyl Chloroform (CH3CCl3)',
            'units': 'ppt',
            '_FillValue': np.nan,
            '__type': np.float32,
            '__rename': 'methyl_chloroform',
        },
        'MCritsBRWsd': {
            'long_name': 'Methyl Chloroform (CH3CCl3) standard deviation',
            'units': 'ppt',
            '_FillValue': np.nan,
            '__type': np.float32,
            '__rename': 'methyl_chloroform_std_dev',
        },
        'MCritsBRWn': {
            'long_name': 'Number of samples',
            'units': 'count',
            '__type': np.int16,
            '__rename': 'number_of_samples',
        },
    }

    test_filename = filename
    if isinstance(test_filename, (list, tuple)):
        test_filename = test_filename[0]

    # Count comment lines to determine which line holds the column names.
    with open(test_filename) as fc:
        header = 0
        while True:
            line = fc.readline().strip()
            if not line.startswith('#'):
                break
            header += 1

    ds = read_csv(
        filename, sep=r'\s+', header=header, na_values=['Nan', 'NaN', 'nan', 'NAN'], **kwargs
    )
    var_names = list(ds.data_vars)

    # Find the time component columns, which are prefixed with the
    # dataset/site name and vary from file to file.
    year_name, month_name, day_name, hour_name, min_name = None, None, None, None, None
    for var_name in var_names:
        if var_name.endswith('yr'):
            year_name = var_name
        elif var_name.endswith('mon'):
            month_name = var_name
        elif var_name.endswith('day'):
            day_name = var_name
        elif var_name.endswith('hour'):
            hour_name = var_name
        elif var_name.endswith('min'):
            min_name = var_name

    # Build timestamps from the finest time resolution available.
    timestamp = np.full(ds[var_names[0]].size, np.nan, dtype='datetime64[ns]')
    for ii in range(0, len(timestamp)):
        if min_name is not None:
            ts = datetime(
                ds[year_name].values[ii],
                ds[month_name].values[ii],
                ds[day_name].values[ii],
                ds[hour_name].values[ii],
                ds[min_name].values[ii],
            )
        elif hour_name is not None:
            ts = datetime(
                ds[year_name].values[ii],
                ds[month_name].values[ii],
                ds[day_name].values[ii],
                ds[hour_name].values[ii],
            )
        elif day_name is not None:
            ts = datetime(
                ds[year_name].values[ii],
                ds[month_name].values[ii],
                ds[day_name].values[ii],
            )
        else:
            ts = datetime(ds[year_name].values[ii], ds[month_name].values[ii], 1)

        timestamp[ii] = np.datetime64(ts, 'ns')

    for var_name in [year_name, month_name, day_name, hour_name, min_name]:
        try:
            del ds[var_name]
        except KeyError:
            pass

    ds = ds.rename({'index': 'time'})
    ds = ds.assign_coords(time=timestamp)
    ds['time'].attrs['long_name'] = 'Time'

    for var_name, value in variables.items():
        if var_name not in var_names:
            continue

        for att_name, att_value in value.items():
            if att_name == '__type':
                values = ds[var_name].values
                values = values.astype(att_value)
                ds[var_name].values = values
            elif att_name == '__rename':
                ds = ds.rename({var_name: att_value})
            else:
                ds[var_name].attrs[att_name] = att_value

    return ds
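
# Example usage (sketch; 'brw_CCl4_Day.dat' is a hypothetical file name
# following the halocarbon pattern matched in read_gml()):
#
#     ds = read_gml_halo('brw_CCl4_Day.dat')
#     print(ds['CCl4'].attrs['units'])  # 'ppt'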
def read_gml_co2(filename=None, convert_missing=True, **kwargs):
    """
    Function to read carbon dioxide data from NOAA GML.

    Parameters
    ----------
    filename : str or pathlib.Path
        Data file full path name.
    convert_missing : boolean
        Option to convert missing values to NaN. If turned off will set
        variable attribute to the expected missing value. This works well to
        best preserve the data type for writing to a netCDF file.
    **kwargs : keywords
        Keywords to pass through to ACT read_csv() routine.

    Returns
    -------
    ds : xarray.Dataset
        Standard ARM Xarray dataset with the data cleaned up to have units,
        long_name, correct type and some other stuff.

    """
    ds = None
    if filename is None:
        return ds

    variables = {
        'site_code': None,
        'year': None,
        'month': None,
        'day': None,
        'hour': None,
        'minute': None,
        'second': None,
        'time_decimal': None,
        'value': {
            'long_name': 'Carbon dioxide in dry air',
            'units': 'ppm',
            '_FillValue': -999.99,
            'comment': (
                'Mole fraction reported in units of micromol mol-1 '
                '(10-6 mol per mol of dry air); abbreviated as ppm (parts per million).'
            ),
            '__type': np.float32,
            '__rename': 'co2',
        },
        'value_std_dev': {
            'long_name': 'Carbon dioxide in dry air, standard deviation',
            'units': 'ppm',
            '_FillValue': -99.99,
            'comment': (
                'This is the standard deviation of the reported mean value '
                'when nvalue is greater than 1. See provider_comment if available.'
            ),
            '__type': np.float32,
            '__rename': 'co2_std_dev',
        },
        'nvalue': {
            'long_name': 'Number of measurements contributing to reported value',
            'units': '1',
            '_FillValue': -9,
            '__type': np.int16,
            '__rename': 'number_of_measurements',
        },
        'latitude': {
            'long_name': 'Latitude at which air sample was collected',
            'units': 'degrees_north',
            '_FillValue': -999.999,
            'standard_name': 'latitude',
            '__type': np.float32,
        },
        'longitude': {
            'long_name': 'Longitude at which air sample was collected',
            'units': 'degrees_east',
            '_FillValue': -999.999,
            'standard_name': 'longitude',
            '__type': np.float32,
        },
        'altitude': {
            'long_name': 'Sample altitude',
            'units': 'm',
            '_FillValue': -999.999,
            'standard_name': 'altitude',
            'comment': (
                'Altitude for this dataset is the sum of surface elevation '
                '(masl) and sample intake height (magl)'
            ),
            '__type': np.float32,
        },
        'intake_height': {
            'long_name': 'Sample intake height above ground level',
            'units': 'm',
            '_FillValue': -999.999,
            '__type': np.float32,
        },
    }

    test_filename = filename
    if isinstance(test_filename, (list, tuple)):
        test_filename = test_filename[0]

    # The first header line states how many header lines the file contains.
    with open(test_filename) as fc:
        skiprows = int(fc.readline().strip().split()[-1]) - 1

    ds = read_csv(filename, sep=r'\s+', skiprows=skiprows, **kwargs)

    timestamp = np.full(ds['year'].size, np.nan, dtype='datetime64[ns]')
    for ii in range(0, len(timestamp)):
        ts = datetime(
            ds['year'].values[ii],
            ds['month'].values[ii],
            ds['day'].values[ii],
            ds['hour'].values[ii],
            ds['minute'].values[ii],
            ds['second'].values[ii],
        )
        timestamp[ii] = np.datetime64(ts, 'ns')

    ds = ds.rename({'index': 'time'})
    ds = ds.assign_coords(time=timestamp)
    ds['time'].attrs['long_name'] = 'Time'

    for var_name, value in variables.items():
        if value is None:
            del ds[var_name]
        else:
            for att_name, att_value in value.items():
                if att_name == '__type':
                    values = ds[var_name].values
                    values = values.astype(att_value)
                    ds[var_name].values = values
                elif att_name == '__rename':
                    ds = ds.rename({var_name: att_value})
                else:
                    ds[var_name].attrs[att_name] = att_value

            if convert_missing:
                try:
                    var_name = variables[var_name]['__rename']
                except KeyError:
                    pass

                try:
                    missing_value = ds[var_name].attrs['_FillValue']
                    values = ds[var_name].values.astype(float)
                    values[np.isclose(missing_value, values)] = np.nan
                    ds[var_name].values = values
                    ds[var_name].attrs['_FillValue'] = np.nan
                except KeyError:
                    pass

    # Convert the two-character PI-provided qcflag column into ACT-style
    # quality control tests on the co2 variable.
    values = ds['qcflag'].values
    bad_index = []
    suspect_index = []
    for ii, value in enumerate(values):
        pts = list(value)
        if pts[0] != '.':
            bad_index.append(ii)
        if pts[1] != '.':
            suspect_index.append(ii)

    var_name = 'co2'
    qc_var_name = ds.qcfilter.create_qc_variable(var_name)
    ds.qcfilter.add_test(
        var_name,
        index=bad_index,
        test_assessment='Bad',
        test_meaning='Obvious problems during collection or analysis',
    )
    ds.qcfilter.add_test(
        var_name,
        index=suspect_index,
        test_assessment='Indeterminate',
        test_meaning=(
            'Likely valid but does not meet selection criteria determined by '
            'the goals of a particular investigation'
        ),
    )
    ds[qc_var_name].attrs[
        'comment'
    ] = 'This quality control flag is provided by the contributing PIs'
    del ds['qcflag']

    return ds
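
# Example usage (sketch; the file name is hypothetical). Because the PI qcflag
# column is converted to ACT QC tests above, flagged samples can be masked
# with the standard ACT qcfilter accessor:
#
#     ds = read_gml_co2('co2_brw_surface-insitu_1_ccgg_HourlyData.txt')
#     good_co2 = ds.qcfilter.get_masked_data('co2', rm_assessments=['Bad'])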
def read_gml_ozone(filename=None, **kwargs):
    """
    Function to read ozone data from NOAA GML.

    Parameters
    ----------
    filename : str or pathlib.Path
        Data file full path name.
    **kwargs : keywords
        Keywords to pass through to ACT read_csv() routine.

    Returns
    -------
    ds : xarray.Dataset
        Standard ARM Xarray dataset with the data cleaned up to have units,
        long_name, correct type and some other stuff.

    """
    ds = None
    if filename is None:
        return ds

    test_filename = filename
    if isinstance(test_filename, (list, tuple)):
        test_filename = test_filename[0]

    with open(test_filename) as fc:
        skiprows = 0
        while True:
            line = fc.readline().strip().split()
            try:
                if len(line) == 6 and line[0] == 'STN':
                    break
            except IndexError:
                pass
            skiprows += 1

    ds = read_csv(filename, sep=r'\s+', skiprows=skiprows, **kwargs)
    ds.attrs['station'] = str(ds['STN'].values[0]).lower()

    timestamp = np.full(ds['YEAR'].size, np.nan, dtype='datetime64[ns]')
    for ii in range(0, len(timestamp)):
        ts = datetime(
            ds['YEAR'].values[ii],
            ds['MON'].values[ii],
            ds['DAY'].values[ii],
            ds['HR'].values[ii],
        )
        timestamp[ii] = np.datetime64(ts, 'ns')

    ds = ds.rename({'index': 'time'})
    ds = ds.assign_coords(time=timestamp)
    ds['time'].attrs['long_name'] = 'Time'

    for var_name in ['STN', 'YEAR', 'MON', 'DAY', 'HR']:
        del ds[var_name]

    var_name = 'ozone'
    ds = ds.rename({'O3(PPB)': var_name})
    ds[var_name].attrs['long_name'] = 'Ozone'
    ds[var_name].attrs['units'] = 'ppb'
    ds[var_name].attrs['_FillValue'] = np.nan
    ds[var_name].values = ds[var_name].values.astype(np.float32)

    return ds
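
# Example usage (sketch; a hypothetical file name following one of the hourly
# ozone patterns handled by read_gml()):
#
#     ds = read_gml_ozone('brw_2020_01_hour.dat')
#     print(ds.attrs['station'], ds['ozone'].attrs['units'])  # e.g. brw ppb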
def read_gml_radiation(filename=None, convert_missing=True, remove_time_vars=True, **kwargs):
    """
    Function to read radiation data from NOAA GML.

    Parameters
    ----------
    filename : str or pathlib.Path
        Data file full path name.
    convert_missing : boolean
        Option to convert missing values to NaN. If turned off will set
        variable attribute to the expected missing value. This works well to
        best preserve the data type for writing to a netCDF file.
    remove_time_vars : boolean
        Some column names in the CSV file are used for creating the time
        coordinate variable in the returned Xarray Dataset. Once used, the
        variables are not needed and will be removed from the Dataset.
    **kwargs : keywords
        Keywords to pass through to ACT read_csv() routine.

    Returns
    -------
    ds : xarray.Dataset
        Standard ARM Xarray dataset with the data cleaned up to have units,
        long_name, correct type and some other stuff.

    """
    ds = None
    if filename is None:
        return ds

    column_names = {
        'year': None,
        'jday': None,
        'month': None,
        'day': None,
        'hour': None,
        'minute': None,
        'decimal_time': None,
        'solar_zenith_angle': {
            'units': 'degree',
            'long_name': 'Solar zenith angle',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'downwelling_global_solar': {
            'units': 'W/m^2',
            'long_name': 'Downwelling global solar',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'upwelling_global_solar': {
            'units': 'W/m^2',
            'long_name': 'Upwelling global solar',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'direct_normal_solar': {
            'units': 'W/m^2',
            'long_name': 'Direct-normal solar',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'downwelling_diffuse_solar': {
            'units': 'W/m^2',
            'long_name': 'Downwelling diffuse solar',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'downwelling_thermal_infrared': {
            'units': 'W/m^2',
            'long_name': 'Downwelling thermal infrared',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'downwelling_infrared_case_temp': {
            'units': 'degK',
            'long_name': 'Downwelling infrared case temp',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'downwelling_infrared_dome_temp': {
            'units': 'degK',
            'long_name': 'Downwelling infrared dome temp',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'upwelling_thermal_infrared': {
            'units': 'W/m^2',
            'long_name': 'Upwelling thermal infrared',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'upwelling_infrared_case_temp': {
            'units': 'degK',
            'long_name': 'Upwelling infrared case temp',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'upwelling_infrared_dome_temp': {
            'units': 'degK',
            'long_name': 'Upwelling infrared dome temp',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'global_UVB': {
            'units': 'mW/m^2',
            'long_name': 'Global ultraviolet-B',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'par': {
            'units': 'W/m^2',
            'long_name': 'Photosynthetically active radiation',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'net_solar': {
            'units': 'W/m^2',
            'long_name': 'Net solar (downwelling_global_solar - upwelling_global_solar)',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'net_infrared': {
            'units': 'W/m^2',
            'long_name': (
                'Net infrared (downwelling_thermal_infrared - '
                'upwelling_thermal_infrared)'
            ),
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'net_radiation': {
            'units': 'W/m^2',
            'long_name': 'Net radiation (net_solar + net_infrared)',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'air_temperature_10m': {
            'units': 'degC',
            'long_name': '10-meter air temperature',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'relative_humidity': {
            'units': '%',
            'long_name': 'Relative humidity',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'wind_speed': {
            'units': 'm/s',
            'long_name': 'Wind speed',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'wind_direction': {
            'units': 'degree',
            'long_name': 'Wind direction (clockwise from north)',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
        'station_pressure': {
            'units': 'millibar',
            'long_name': 'Station atmospheric pressure',
            '_FillValue': -9999.9,
            '__type': np.float32,
        },
    }

    # Add additional column names for the NOAA SPASH campaign.
    test_filename = filename
    if isinstance(test_filename, (list, tuple)):
        test_filename = test_filename[0]

    if str(Path(test_filename).name).startswith(('cbc', 'ckp')):
        column_names['SPN1_total'] = {
            'units': 'W/m^2',
            'long_name': 'SPN1 total average',
            '_FillValue': -9999.9,
            '__type': np.float32,
        }
        column_names['SPN1_diffuse'] = {
            'units': 'W/m^2',
            'long_name': 'SPN1 diffuse average',
            '_FillValue': -9999.9,
            '__type': np.float32,
        }

    # Interleave a qc_ column name after each data column, since the file
    # stores a quality control flag next to each measurement.
    names = list(column_names.keys())
    skip_vars = [
        'year',
        'jday',
        'month',
        'day',
        'hour',
        'minute',
        'decimal_time',
        'solar_zenith_angle',
    ]
    num = 1
    for ii, name in enumerate(column_names.keys()):
        if name in skip_vars:
            continue
        names.insert(ii + num, 'qc_' + name)
        num += 1

    ds = read_csv(filename, sep=r'\s+', header=None, skiprows=2, column_names=names, **kwargs)

    if isinstance(filename, (list, tuple)):
        filename = filename[0]

    if ds is not None:
        # The two header lines hold the station name and its latitude,
        # longitude and altitude.
        with open(filename) as fc:
            lat = None
            lon = None
            alt = None
            alt_unit = None
            station = None
            for ii in [0, 1]:
                line = fc.readline().strip().split()
                if len(line) == 1:
                    station = line[0]
                else:
                    lat = np.array(line[0], dtype=np.float32)
                    lon = np.array(line[1], dtype=np.float32)
                    alt = np.array(line[2], dtype=np.float32)
                    alt_unit = str(line[3])

        ds['lat'] = xr.DataArray(
            lat,
            attrs={
                'long_name': 'Latitude',
                'units': 'degree_north',
                'standard_name': 'latitude',
            },
        )
        ds['lon'] = xr.DataArray(
            lon,
            attrs={
                'long_name': 'Longitude',
                'units': 'degree_east',
                'standard_name': 'longitude',
            },
        )
        ds['alt'] = xr.DataArray(
            alt,
            attrs={
                'long_name': 'Altitude',
                'units': alt_unit,
                'standard_name': 'altitude',
            },
        )
        ds.attrs['location'] = station

        timestamp = np.full(ds['year'].size, np.nan, dtype='datetime64[ns]')
        for ii in range(0, len(timestamp)):
            ts = datetime(
                ds['year'].values[ii],
                ds['month'].values[ii],
                ds['day'].values[ii],
                ds['hour'].values[ii],
                ds['minute'].values[ii],
            )
            timestamp[ii] = np.datetime64(ts, 'ns')

        ds = ds.rename({'index': 'time'})
        ds = ds.assign_coords(time=timestamp)
        ds['time'].attrs['long_name'] = 'Time'

        for var_name, value in column_names.items():
            if value is None:
                continue

            for att_name, att_value in value.items():
                if att_name == '__type':
                    values = ds[var_name].values
                    values = values.astype(att_value)
                    ds[var_name].values = values
                else:
                    ds[var_name].attrs[att_name] = att_value

            if convert_missing:
                try:
                    missing_value = ds[var_name].attrs['_FillValue']
                    values = ds[var_name].values.astype(float)
                    index = np.isclose(values, missing_value)
                    values[index] = np.nan
                    ds[var_name].values = values
                    ds[var_name].attrs['_FillValue'] = np.nan
                except KeyError:
                    pass

        for var_name in ds.data_vars:
            if not var_name.startswith('qc_'):
                continue

            data_var_name = var_name.replace('qc_', '', 1)
            attrs = {
                'long_name': f"Quality control variable for: {ds[data_var_name].attrs['long_name']}",
                'units': '1',
                'standard_name': 'quality_flag',
                'flag_values': [0, 1, 2],
                'flag_meanings': [
                    'Not failing any tests',
                    'Knowingly bad value',
                    'Should be used with scrutiny',
                ],
                'flag_assessments': ['Good', 'Bad', 'Indeterminate'],
            }
            ds[var_name].attrs = attrs
            ds[data_var_name].attrs['ancillary_variables'] = var_name

        if remove_time_vars:
            remove_var_names = [
                'year',
                'jday',
                'month',
                'day',
                'hour',
                'minute',
                'decimal_time',
            ]
            ds = ds.drop_vars(remove_var_names)

    return ds
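
# Example usage (sketch; 'brw21001.dat' is a hypothetical station + year +
# Julian-day file name matching the radiation pattern used by read_gml()):
#
#     ds = read_gml_radiation('brw21001.dat', convert_missing=True)
#     print(ds['downwelling_global_solar'].attrs['units'])  # 'W/m^2'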
def read_gml_met(filename=None, convert_missing=True, **kwargs):
    """
    Function to read meteorological data from NOAA GML.

    Parameters
    ----------
    filename : str or pathlib.Path
        Data file full path name.
    convert_missing : boolean
        Option to convert missing values to NaN. If turned off will set
        variable attribute to the expected missing value. This works well to
        best preserve the data type for writing to a netCDF file.
    **kwargs : keywords
        Keywords to pass through to ACT read_csv() routine.

    Returns
    -------
    ds : xarray.Dataset
        Standard ARM Xarray dataset with the data cleaned up to have units,
        long_name, correct type and some other stuff.

    """
    ds = None
    if filename is None:
        return ds

    column_names = {
        'station': None,
        'year': None,
        'month': None,
        'day': None,
        'hour': None,
        'minute': None,
        'wind_direction': {
            'units': 'degree',
            'long_name': 'Average wind direction from which the wind is blowing',
            '_FillValue': -999,
            '__type': np.int16,
        },
        'wind_speed': {
            'units': 'm/s',
            'long_name': 'Average wind speed',
            '_FillValue': -999.9,
            '__type': np.float32,
        },
        'wind_steadiness_factor': {
            'units': '1',
            'long_name': '100 times the ratio of the vector wind speed to the '
            'average wind speed for the hour',
            '_FillValue': -9,
            '__type': np.int16,
        },
        'barometric_pressure': {
            'units': 'hPa',
            'long_name': 'Station barometric pressure',
            '_FillValue': -999.90,
            '__type': np.float32,
        },
        'temperature_2m': {
            'units': 'degC',
            'long_name': 'Temperature at 2 meters above ground level',
            '_FillValue': -999.9,
            '__type': np.float32,
        },
        'temperature_10m': {
            'units': 'degC',
            'long_name': 'Temperature at 10 meters above ground level',
            '_FillValue': -999.9,
            '__type': np.float32,
        },
        'temperature_tower_top': {
            'units': 'degC',
            'long_name': 'Temperature at top of instrument tower',
            '_FillValue': -999.9,
            '__type': np.float32,
        },
        'relative_humidity': {
            'units': 'percent',
            'long_name': 'Relative humidity',
            '_FillValue': -99,
            '__type': np.int16,
        },
        'precipitation_intensity': {
            'units': 'mm/hour',
            'long_name': 'Amount of precipitation per hour',
            '_FillValue': -99,
            '__type': np.int16,
            'comment': (
                'The precipitation amount is measured with an unheated '
                'tipping bucket rain gauge.'
            ),
        },
    }

    minutes = True
    test_filename = filename
    if isinstance(test_filename, (list, tuple)):
        test_filename = test_filename[0]

    # Hourly files do not contain a minute column.
    if '_hour_' in Path(test_filename).name:
        minutes = False
        del column_names['minute']

    ds = read_csv(filename, sep=r'\s+', header=None, column_names=column_names.keys(), **kwargs)

    if ds is not None:
        timestamp = np.full(ds['year'].size, np.nan, dtype='datetime64[ns]')
        for ii in range(0, len(timestamp)):
            if minutes:
                ts = datetime(
                    ds['year'].values[ii],
                    ds['month'].values[ii],
                    ds['day'].values[ii],
                    ds['hour'].values[ii],
                    ds['minute'].values[ii],
                )
            else:
                ts = datetime(
                    ds['year'].values[ii],
                    ds['month'].values[ii],
                    ds['day'].values[ii],
                    ds['hour'].values[ii],
                )
            timestamp[ii] = np.datetime64(ts, 'ns')

        ds = ds.rename({'index': 'time'})
        ds = ds.assign_coords(time=timestamp)
        ds['time'].attrs['long_name'] = 'Time'

        for var_name, value in column_names.items():
            if value is None:
                del ds[var_name]
            else:
                for att_name, att_value in value.items():
                    if att_name == '__type':
                        values = ds[var_name].values
                        values = values.astype(att_value)
                        ds[var_name].values = values
                    else:
                        ds[var_name].attrs[att_name] = att_value

                if convert_missing:
                    try:
                        missing_value = ds[var_name].attrs['_FillValue']
                        values = ds[var_name].values.astype(float)
                        index = np.isclose(values, missing_value)
                        values[index] = np.nan
                        ds[var_name].values = values
                        ds[var_name].attrs['_FillValue'] = np.nan
                    except KeyError:
                        pass

    return ds
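
# Example usage (sketch; a hypothetical hourly MET file name, which read_gml()
# also recognizes by its 'met_' prefix and '.txt' suffix):
#
#     ds = read_gml_met('met_brw_insitu_1_obop_hour_2020.txt')
#     print(ds['temperature_2m'].attrs['long_name'])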
def read_surfrad(filename, **kwargs):
    """
    Function to read in NOAA SURFRAD data.

    Parameters
    ----------
    filename : list
        Data file full path names or URLs to files.
    **kwargs : keywords
        Keywords to pass through to instrument specific reading routine.

    Returns
    -------
    ds : xarray.Dataset
        Standard ARM Xarray dataset with the data cleaned up to have units,
        long_name, correct type and some other stuff.

    """
    names = [
        'year',
        'jday',
        'month',
        'day',
        'hour',
        'minute',
        'dec_time',
        'solar_zenith_angle',
        'downwelling_global',
        'qc_downwelling_global',
        'upwelling_global',
        'qc_upwelling_global',
        'direct_normal',
        'qc_direct_normal',
        'downwelling_diffuse',
        'qc_downwelling_diffuse',
        'downwelling_ir',
        'qc_downwelling_ir',
        'downwelling_ir_casetemp',
        'qc_downwelling_ir_casetemp',
        'downwelling_ir_dometemp',
        'qc_downwelling_ir_dometemp',
        'upwelling_ir',
        'qc_upwelling_ir',
        'upwelling_ir_casetemp',
        'qc_upwelling_ir_casetemp',
        'upwelling_ir_dometemp',
        'qc_upwelling_ir_dometemp',
        'global_uvb',
        'qc_global_uvb',
        'par',
        'qc_par',
        'net_radiation',
        'qc_net_radiation',
        'net_ir',
        'qc_net_ir',
        'total_net',
        'qc_total_net',
        'temperature',
        'qc_temperature',
        'relative_humidity',
        'qc_relative_humidity',
        'wind_speed',
        'qc_wind_speed',
        'wind_direction',
        'qc_wind_direction',
        'pressure',
        'qc_pressure',
    ]

    for i, f in enumerate(filename):
        new_df = pd.read_csv(f, names=names, skiprows=2, delimiter=r'\s+', header=None)
        if i == 0:
            df = new_df
        else:
            df = pd.concat([df, new_df])

    # Create time variable and add as the coordinate
    ds = df.to_xarray()
    year = ds['year'].values
    month = ds['month'].values
    day = ds['day'].values
    hour = ds['hour'].values
    minute = ds['minute'].values
    time = [datetime(year[i], month[i], day[i], hour[i], minute[i]) for i in range(len(year))]
    ds = ds.assign_coords(index=time)
    ds = ds.rename(index='time')

    # Add attributes
    attrs = {
        'year': {'long_name': 'Year', 'units': 'unitless'},
        'jday': {'long_name': 'Julian day', 'units': 'unitless'},
        'month': {'long_name': 'Month', 'units': 'unitless'},
        'day': {'long_name': 'Day of the month', 'units': 'unitless'},
        'hour': {'long_name': 'Hour', 'units': 'unitless'},
        'minute': {'long_name': 'Minutes', 'units': 'unitless'},
        'dec_time': {'long_name': 'Decimal time', 'units': 'unitless'},
        'solar_zenith_angle': {'long_name': 'Solar zenith angle', 'units': 'deg'},
        'downwelling_global': {
            'long_name': 'Downwelling global solar',
            'units': 'W m^-2',
            'standard_name': 'surface_downwelling_shortwave_flux_in_air',
        },
        'upwelling_global': {
            'long_name': 'Upwelling global solar',
            'units': 'W m^-2',
            'standard_name': 'surface_upwelling_shortwave_flux_in_air',
        },
        'direct_normal': {
            'long_name': 'Direct normal solar',
            'units': 'W m^-2',
            'standard_name': 'surface_direct_downwelling_shortwave_flux_in_air',
        },
        'downwelling_diffuse': {
            'long_name': 'Downwelling diffuse solar',
            'units': 'W m^-2',
            'standard_name': 'diffuse_downwelling_shortwave_flux_in_air',
        },
        'downwelling_ir': {
            'long_name': 'Downwelling thermal infrared',
            'units': 'W m^-2',
            'standard_name': 'net_downward_longwave_flux_in_air',
        },
        'downwelling_ir_casetemp': {
            'long_name': 'Downwelling thermal infrared case temperature',
            'units': 'K',
        },
        'downwelling_ir_dometemp': {
            'long_name': 'Downwelling thermal infrared dome temperature',
            'units': 'K',
        },
        'upwelling_ir': {
            'long_name': 'Upwelling thermal infrared',
            'units': 'W m^-2',
            'standard_name': 'net_upward_longwave_flux_in_air',
        },
        'upwelling_ir_casetemp': {
            'long_name': 'Upwelling thermal infrared case temperature',
            'units': 'K',
        },
        'upwelling_ir_dometemp': {
            'long_name': 'Upwelling thermal infrared dome temperature',
            'units': 'K',
        },
        'global_uvb': {'long_name': 'Global UVB', 'units': 'milliWatts m^-2'},
        'par': {
            'long_name': 'Photosynthetically active radiation',
            'units': 'W m^-2',
            'standard_name': 'surface_downwelling_photosynthetic_radiative_flux_in_air',
        },
        'net_radiation': {
            'long_name': 'Net solar (downwelling_global-upwelling_global)',
            'units': 'W m^-2',
            'standard_name': 'surface_net_downward_shortwave_flux',
        },
        'net_ir': {
            'long_name': 'Net infrared (downwelling_ir-upwelling_ir)',
            'units': 'W m^-2',
            'standard_name': 'surface_net_downward_longwave_flux',
        },
        'total_net': {
            'long_name': 'Total net radiation (net_radiation + net_ir)',
            'units': 'W m^-2',
        },
        'temperature': {
            'long_name': '10-meter air temperature',
            'units': 'degC',
            'standard_name': 'air_temperature',
        },
        'relative_humidity': {
            'long_name': 'Relative humidity',
            'units': '%',
            'standard_name': 'relative_humidity',
        },
        'wind_speed': {'long_name': 'Wind speed', 'units': 'ms^-1', 'standard_name': 'wind_speed'},
        'wind_direction': {
            'long_name': 'Wind direction, clockwise from North',
            'units': 'deg',
            'standard_name': 'wind_from_direction',
        },
        'pressure': {
            'long_name': 'Station pressure',
            'units': 'mb',
            'standard_name': 'air_pressure',
        },
    }

    for v in ds:
        if v in attrs:
            ds[v].attrs = attrs[v]

    # Add attributes to all QC variables
    qc_vars = [
        'downwelling_global',
        'upwelling_global',
        'direct_normal',
        'downwelling_diffuse',
        'downwelling_ir',
        'downwelling_ir_casetemp',
        'downwelling_ir_dometemp',
        'upwelling_ir',
        'upwelling_ir_casetemp',
        'upwelling_ir_dometemp',
        'global_uvb',
        'par',
        'net_radiation',
        'net_ir',
        'total_net',
        'temperature',
        'relative_humidity',
        'wind_speed',
        'wind_direction',
        'pressure',
    ]

    for v in qc_vars:
        atts = {
            'long_name': 'Quality check results on variable: ' + v,
            'units': '1',
            'description': ''.join(
                [
                    'A QC flag of zero indicates that the corresponding data point is good,',
                    ' having passed all QC checks. A value greater than 0 indicates that',
                    ' the data failed one level of QC. For example, a QC value of 1 means',
                    ' that the recorded value is beyond a physically possible range, or it has',
                    ' been affected adversely in some manner to produce a knowingly bad value.',
                    ' A value of 2 indicates that the data value failed the second level QC check,',
                    ' indicating that the data value may be physically possible but should be used',
                    ' with scrutiny, and so on.',
                ]
            ),
        }
        ds['qc_' + v].attrs = atts

    ds.attrs['datastream'] = 'SURFRAD Site: ' + filename[0].split('/')[-1][0:3]

    return ds
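
# Example usage (sketch; the URL shape mirrors the NOAA SURFRAD archive layout
# but this exact address is illustrative, not guaranteed to exist):
#
#     files = ['https://gml.noaa.gov/aftp/data/radiation/surfrad/bon/2021/bon21001.dat']
#     ds = read_surfrad(files)
#     print(ds.attrs['datastream'])  # 'SURFRAD Site: bon'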