Source code for act.io.sodar
"""
This module contains I/O operations for loading Sodar files.
"""
import datetime as dt
import re
import fsspec
import numpy as np
import pandas as pd
import xarray as xr
from act.io.noaapsl import filter_list
def read_mfas_sodar(filepath):
"""
Returns `xarray.Dataset` with stored data and metadata from a user-defined
Flat Array MFAS Sodar file. More information can be found here:
https://www.scintec.com/products/flat-array-sodar-mfas/
Parameters
----------
filepath : str
Name of file to read.
    Returns
    -------
ds : xarray.Dataset
Standard Xarray dataset with the data.
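    Examples
    --------
    A minimal usage sketch; the file path below is hypothetical and should
    point at a Scintec MFAS flat array Sodar file.

    .. code-block:: python

        from act.io.sodar import read_mfas_sodar

        # Read the file and inspect the resulting (time, height) dataset.
        ds = read_mfas_sodar('sodar_data.mnd')
        print(ds)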
"""
file = fsspec.open(filepath).open()
lines = file.readlines()
    lines = [x.decode().rstrip() for x in lines]
# Retrieve number of height values from line 3.
_, _, len_height = filter_list(lines[3].split()).astype(int)
# Retrieve metadata
file_dict, variable_dict = _metadata_retrieval(lines)
# Retrieve datetimes and time indices from when datetime rows appear
skip_time_ind = []
datetimes = []
fmt = '%Y-%m-%d %H:%M:%S'
for i, line in enumerate(lines):
match = re.search(r'\d{4}-\d{2}-\d{2}\ \d{2}:\d{2}:\d{2}', line)
if match is None:
continue
else:
date_object = dt.datetime.strptime(match.group(0), fmt)
datetimes.append(date_object)
skip_time_ind.append(i)
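    # The first matched timestamp comes from the file header rather than a
    # data block, so drop it before aligning times with the data rows.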
datetimes = np.delete(datetimes, 0)
# Create datetime column with matching datetimes to heights
data_times = pd.DataFrame(datetimes, columns=['Dates'])
repeat_times = data_times.loc[data_times.index.repeat(len_height)]
    # Build the full set of row indices to skip so that only actual data rows
    # are parsed: the header/metadata block plus every per-block timestamp line.
skip_meta_ind = np.arange(0, skip_time_ind[1] + 1, 1)
skip_full_ind = np.append(skip_meta_ind, skip_time_ind)
skip_full_ind = np.unique(skip_full_ind)
    # The column-header row appears one line after the first data timestamp;
    # retrieve the column names from it.
columns = np.delete(filter_list(lines[skip_time_ind[1] + 1].split(' ')), 0).tolist()
# Tmp column allows for the # column to be pushed over and dropped.
tmp_columns = columns + ['tmp']
# Parse data to a dataframe skipping rows that aren't data.
    # tmp_columns is used to remove the '#' column that causes
# columns to move over by one.
df = pd.read_table(
filepath, sep=r'\s+', skiprows=skip_full_ind, names=tmp_columns, usecols=columns
)
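    # Each data block repeats the column-header row; drop those rows, which
    # show up with the literal string 'dir' under the 'W' column.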
df = df[~df['W'].isin(['dir'])].reset_index(drop=True)
# Set index to datetime column.
df = df.set_index(repeat_times['Dates'])
# Convert dataframe to xarray dataset.
ds = df.to_xarray()
# Convert height to float.
ds['z'] = ds.z.astype(float)
# Convert all variables from string to float.
ds = ds.astype(float)
# Convert variables that should be int back to int.
ds['error'] = ds.error.astype(int)
ds['PGz'] = ds.PGz.astype(int)
# Get unique time and height values.
time_dim = np.unique(ds.Dates.values)
height_dim = np.unique(ds.z.values)
# Use unique time and height values to reindex data to be two dimensional.
ind = pd.MultiIndex.from_product((time_dim, height_dim), names=('time', 'height'))
    # xarray 2023.9 introduced Coordinates.from_pandas_multiindex; fall back
    # to the older assignment syntax for earlier versions.
try:
mindex_coords = xr.Coordinates.from_pandas_multiindex(ind, 'Dates')
ds = ds.assign_coords(mindex_coords).unstack("Dates")
except AttributeError:
ds = ds.assign(Dates=ind).unstack("Dates")
# Add file metadata.
for key in file_dict.keys():
ds.attrs[key] = file_dict[key]
# Add metadata to the attributes of each variable.
for key in variable_dict.keys():
ds[key].attrs = variable_dict[key]
    # Change fill values to NaN for floats and 0 for ints.
    # A single replace call can't be used as the fill value changes between variables.
for var in ds.data_vars:
if var == 'error':
continue
elif var == 'PGz':
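            # PGz uses an integer fill value (99); map it to 0 so the
            # variable can stay an integer.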
data_with_fill = ds[var].values
data_with_fill[data_with_fill == 99] = 0
ds[var].values = data_with_fill
else:
data_with_fill = ds[var].values
fill_value = ds[var].attrs['_FillValue']
data_with_fill[data_with_fill == fill_value] = np.nan
ds[var].values = data_with_fill
    # Drop z as it's already a coordinate and give the coordinate the same attributes.
ds.height.attrs = ds['z'].attrs
ds = ds.drop_vars('z')
return ds
def _metadata_retrieval(lines):
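    """
    Parse the header lines of an MFAS Sodar file into a file-level metadata
    dictionary and a per-variable metadata dictionary keyed by variable symbol.
    """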
# File format from line 0.
_format = lines[0]
# Sodar type from line 2.
instrument_type = lines[2]
# Create np.array of lines to use np.argwhere
line_array = np.array(lines)
# Retrieve indices of file information and the end of the metadata block.
file_info_ind = np.argwhere(line_array == '# file information')[0][0]
file_type_ind = np.argwhere(line_array == '# file type')[0][0]
# Index the section of file information.
file_def = line_array[file_info_ind + 2 : file_type_ind - 1]
# Create a dictionary of file information to be plugged in later to the xarray
# dataset attributes.
file_dict = {}
for line in file_def:
key, value = filter_list(line.split(':'))
file_dict[key.strip()] = value.strip()
file_dict['format'] = _format
file_dict['instrument_type'] = instrument_type
    # Change values from strings to float where needed.
file_dict['antenna azimuth angle [deg]'] = float(file_dict['antenna azimuth angle [deg]'])
file_dict['height above ground [m]'] = float(file_dict['height above ground [m]'])
file_dict['height above sea level [m]'] = float(file_dict['height above sea level [m]'])
# Retrieve indices of variable information.
variable_info_ind = np.argwhere(line_array == '# variable definitions')[0][0]
data_ind = np.argwhere(line_array == '# beginning of data block')[0][0]
# Index the section of variable information.
variable_def = line_array[variable_info_ind + 2 : data_ind - 1]
# Create a dictionary of variable information to be plugged in later to the xarray
# variable attributes. Skipping error code as it does not have metadata similar to
# the rest of the variables.
variable_dict = {}
    for line in variable_def:
if 'error code' in line:
continue
else:
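            # Each definition line is '#'-separated:
            # name # symbol # units # type # error mask # fill value.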
temp_var_dict = {}
key, symbol, units, _type, error_mask, fill_value = filter_list(line.split('#'))
temp_var_dict['variable_name'] = key.strip()
temp_var_dict['symbol'] = symbol.strip()
temp_var_dict['units'] = units.strip()
temp_var_dict['type'] = _type.strip()
temp_var_dict['error_mask'] = error_mask.strip()
if key.strip() == 'PGz':
temp_var_dict['_FillValue'] = int(fill_value)
else:
temp_var_dict['_FillValue'] = float(fill_value)
variable_dict[symbol.strip()] = temp_var_dict
return file_dict, variable_dict