Source code for act.io.text

"""
This module contains I/O operations for loading CSV files.

"""

import pathlib

import pandas as pd

from act.io.arm import check_arm_standards


def read_csv(
    filename, sep=',', engine='python', column_names=None, skipfooter=0, ignore_index=True, **kwargs
):
    """
    Returns an `xarray.Dataset` with stored data and metadata from a
    user-defined query of CSV files.

    Parameters
    ----------
    filename : str or list
        Name of file(s) to read.
    sep : str
        The separator between columns in the csv file.
    engine : str
        Parser engine passed to pandas.read_csv.
    column_names : list or None
        The list of column names in the csv file.
    skipfooter : int
        Number of lines at the bottom of the file to skip.
    ignore_index : bool
        Keyword for pandas concat function. If True, do not use the index
        values along the concatenation axis. The resulting axis will be
        labeled 0, ..., n - 1. This is useful if you are concatenating
        datasets where the concatenation axis does not have meaningful
        indexing information. Note the index values on the other axes are
        still respected in the join.

    Additional keyword arguments will be passed into pandas.read_csv.

    Returns
    -------
    ds : xarray.Dataset
        ACT Xarray dataset.

    Examples
    --------
    This example will load the example sounding data used for unit testing:

    .. code-block:: python

        import act

        ds = act.io.text.read_csv(act.tests.sample_files.EXAMPLE_CSV_WILDCARD)

    """
    # Convert to a list of strings if filename is a pathlib object or a
    # single string.
    if isinstance(filename, (pathlib.PurePath, str)):
        filename = [str(filename)]

    if isinstance(filename, list) and isinstance(filename[0], pathlib.PurePath):
        filename = [str(ii) for ii in filename]

    # Read data using pandas read_csv one file at a time and append to a
    # list. Then concatenate the list into one pandas dataframe.
    li = []
    for fl in filename:
        df = pd.read_csv(
            fl, sep=sep, names=column_names, skipfooter=skipfooter, engine=engine, **kwargs
        )
        li.append(df)

    if len(li) == 1:
        df = li[0]
    else:
        df = pd.concat(li, axis=0, ignore_index=ignore_index)

    # Set the time coordinate if there is a date_time variable.
    if 'date_time' in df:
        df['date_time'] = df['date_time'].astype('datetime64[ns]')
        df['time'] = df['date_time']
        df = df.set_index('time')

    # Convert to an xarray Dataset.
    ds = df.to_xarray()

    # Since we cannot assume a standard naming convention, set the
    # _file_dates and _file_times attributes from the first time in the file.
    x_coord = ds.coords.to_index().values[0]
    if isinstance(x_coord, str):
        x_coord_dt = pd.to_datetime(x_coord)
        ds.attrs['_file_dates'] = x_coord_dt.strftime('%Y%m%d')
        ds.attrs['_file_times'] = x_coord_dt.strftime('%H%M%S')

    # Check for a standard ARM datastream name; if none is found, build the
    # datastream name from the filename.
    is_arm_file_flag = check_arm_standards(ds)
    if is_arm_file_flag == 0:
        ds.attrs['_datastream'] = '.'.join(filename[0].split('/')[-1].split('.')[0:2])

    # Add additional attributes: site, standards flag, etc.
    ds.attrs['_site'] = str(ds.attrs['_datastream'])[0:3]
    ds.attrs['_arm_standards_flag'] = is_arm_file_flag

    return ds
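
For reference, a minimal usage sketch is shown below; the file name and column names are hypothetical and only illustrate how the keyword arguments above are passed through to pandas.read_csv:

import act

# Hypothetical comma-separated text file with no header row; column names
# are supplied explicitly and a two-line footer is skipped.
ds = act.io.text.read_csv(
    'met_data_20230101.txt',
    sep=',',
    column_names=['date_time', 'temperature', 'pressure'],
    skipfooter=2,
)

# The returned xarray.Dataset carries the attributes set by read_csv.
print(ds.attrs['_arm_standards_flag'])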