Source code for act.io.text
"""
This module contains I/O operations for loading CSV files.
"""
import pathlib
import pandas as pd
from act.io.arm import check_arm_standards
def read_csv(
    filename, sep=',', engine='python', column_names=None, skipfooter=0, ignore_index=True, **kwargs
):
"""
Returns an `xarray.Dataset` with stored data and metadata from user-defined
query of CSV files.
Parameters
----------
filenames : str or list
Name of file(s) to read.
sep : str
The separator between columns in the csv file.
column_names : list or None
The list of column names in the csv file.
verbose : bool
If true, will print if a file is not found.
ignore_index : bool
Keyword for pandas concat function. If True, do not use the index
values along the concatenation axis. The resulting axis will be labeled
0, …, n - 1. This is useful if you are concatenating datasets where the
concatenation axis does not have meaningful indexing information. Note
the index values on the other axes are still respected in the join.
Additional keyword arguments will be passed into pandas.read_csv.
Returns
-------
ds : xarray.Dataset
ACT Xarray dataset. Will be None if the file is not found.
Examples
--------
This example will load the example sounding data used for unit testing:
.. code-block:: python
import act
ds = act.io.csv.read(act.tests.sample_files.EXAMPLE_CSV_WILDCARD)
"""
    # Convert filename to a list of strings if it is a single string or pathlib path
    if isinstance(filename, (pathlib.PurePath, str)):
        filename = [str(filename)]

    if isinstance(filename, list) and isinstance(filename[0], pathlib.PurePath):
        filename = [str(ii) for ii in filename]

    # Read data using pandas read_csv one file at a time and append to a
    # list. Then concatenate the list into one pandas DataFrame.
    li = []
    for fl in filename:
        df = pd.read_csv(
            fl, sep=sep, names=column_names, skipfooter=skipfooter, engine=engine, **kwargs
        )
        li.append(df)

    if len(li) == 1:
        df = li[0]
    else:
        df = pd.concat(li, axis=0, ignore_index=ignore_index)
    # Set the time coordinate if there is a date_time column
    if 'date_time' in df:
        df['date_time'] = df['date_time'].astype('datetime64[ns]')
        df['time'] = df['date_time']
        df = df.set_index('time')

    # Convert to an xarray Dataset
    ds = df.to_xarray()
    # Set additional attributes. Since we cannot assume a standard naming
    # convention, set _file_dates and _file_times from the first time in the file.
    x_coord = ds.coords.to_index().values[0]
    if isinstance(x_coord, str):
        x_coord_dt = pd.to_datetime(x_coord)
        ds.attrs['_file_dates'] = x_coord_dt.strftime('%Y%m%d')
        ds.attrs['_file_times'] = x_coord_dt.strftime('%H%M%S')

    # Check for a standard ARM datastream name. If the dataset does not meet
    # ARM standards, derive _datastream from the filename, assuming the file
    # name follows the ARM naming convention.
    is_arm_file_flag = check_arm_standards(ds)
    if is_arm_file_flag == 0:
        ds.attrs['_datastream'] = '.'.join(filename[0].split('/')[-1].split('.')[0:2])

    # Add additional attributes: site, standards flag, etc.
    ds.attrs['_site'] = str(ds.attrs['_datastream'])[0:3]
    ds.attrs['_arm_standards_flag'] = is_arm_file_flag

    return ds
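
Example usage (a minimal sketch, not part of the module): the file name
'met_example.csv' and the column names below are hypothetical; a column
named 'date_time' triggers the time-indexing branch shown above.

    from act.io.text import read_csv

    # Hypothetical headerless CSV; column_names supplies the missing header row
    # and any extra keywords (e.g. skiprows) are passed through to pandas.read_csv.
    ds = read_csv(
        'met_example.csv',
        column_names=['date_time', 'temperature', 'relative_humidity'],
    )
    print(ds)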