# Source code for act.discovery.improve

"""
Script for downloading data from the IMPROVE network

"""

import numpy as np
import pandas as pd
import xarray as xr


def get_improve_data(site_id=None, parameter_id=None, start_date=None, end_date=None):
    """
    Retrieve IMPROVE data for the given site and variable ids and store it in an
    xarray dataset. Documentation on the IMPROVE data can be found at
    https://vista.cira.colostate.edu/Improve/data-user-guide/

    Also adds in metadata from the site summary page to the global attributes
    https://views.cira.colostate.edu/adms/Pub/SiteSummary.aspx?dsidse=10001&siidse=244

    Parameters
    ----------
    site_id : str
        Site id number which can be retrieved from the IMPROVE page for each site such as
        https://views.cira.colostate.edu/adms/Pub/SiteSummary.aspx?dsidse=10001&siidse=244
        Required.
    parameter_id : list
        List of parameter id values to retrieve from the API. Entries may be
        strings or integers; a single scalar id is also accepted. If None, a
        built-in default list of parameter ids is requested.
    start_date : str
        Start date formatted as M/D/YEAR such as 1/31/2022. Required.
    end_date : str
        End date formatted as M/D/YEAR such as 1/31/2022. Required.

    Returns
    -------
    ds : xarray.Dataset
        Dataset sorted by time with one data variable per returned parameter,
        a matching ``poc_<name>`` variable, ``lat``/``lon`` variables, and the
        site summary information stored in the global attributes.

    Raises
    ------
    ValueError
        If ``site_id``, ``start_date`` or ``end_date`` is not provided, if the
        query returns no rows, or if a parameter is reported with more than
        one unit or more than one site.

    Example
    -------
    act.discovery.get_improve_data(
        site_id='244', start_date='1/1/2022', end_date='1/31/2022'
    )

    """

    # Build URL
    base_url = 'https://views.cira.colostate.edu/fed/svc/DataSvc.aspx?action=getqueryresults&cmdfileid=ServiceSqlCommandFile&cmdid=BasicDataQuery1_Codes'

    if site_id is None:
        raise ValueError('Please provide a site_id')
    base_url += '&dsidse=10001&siidse=' + str(site_id)

    if parameter_id is None:
        base_url += '&paidse=101,136,907,900,102,104,105,115,116,117,114,3778,142,143,144,145,3016,146,3699,141,3779,3217,108,109,112,113,301,304,303,3716,3717,3718,3719,3720,3721,3722,3730,3731,3732,3733,3734,3735,3736,3694,121,3723,3724,3725,3726,3727,3728,3729,3737,3738,3739,3740,3741,3742,3743,118,148,128,130,132,941,127,903,910,3744,3745,3746,3747,3748,3749,3750,3751,3752,3753,3754,3755,3756,3757,131,138,139,133,3704,3705,3706,3707,3708,3709,3710,3711,3712,3713,3714,3715,147,124,150,3695,3014,153,154,134,911,158,156,151,202,159,160,162,163'
    else:
        # A bare scalar (e.g. '101' or 101) is treated as a one-element list so
        # that a string id is not split into its individual characters.
        if np.isscalar(parameter_id):
            parameter_id = [parameter_id]
        # str() each entry so integer ids can be joined as well as string ids.
        base_url += '&paidse=' + ','.join(str(pid) for pid in parameter_id)

    if start_date is None:
        raise ValueError('Please provide a start date')
    base_url += '&sd=' + start_date

    if end_date is None:
        raise ValueError('Please provide an end date')
    base_url += '&ed=' + end_date

    # Read data and get variables
    df = pd.read_html(base_url)[0]
    if df.empty:
        # Fail with a clear message rather than an IndexError further down.
        raise ValueError(
            'No IMPROVE data returned for site_id '
            + str(site_id)
            + ' between '
            + start_date
            + ' and '
            + end_date
        )
    variables = np.unique(df.Param)

    # Print out proper acknowledgement
    print("Please use the following acknowledgment when using IMPROVE data:\n")

    print(
        "IMPROVE is a collaborative association of state, tribal, and federal agencies, and international partners. US Environmental Protection Agency is the primary funding source, with contracting and research support from the National Park Service. The Air Quality Group at the University of California, Davis is the central analytical laboratory, with ion analysis provided by Research Triangle Institute, and carbon analysis provided by Desert Research Institute."
    )

    # Create mapping of IMPROVE parameter codes to output names and metadata
    mapping = {
        'ALf': {'name': 'aluminum_fine', 'long_name': 'Aluminum (Fine)', 'epa_code': '88104'},
        'ASf': {'name': 'arsenic_fine', 'long_name': 'Arsenic (Fine)', 'epa_code': '88103'},
        'BRf': {'name': 'bromine_fine', 'long_name': 'Bromine (Fine)', 'epa_code': '88109'},
        'CAf': {'name': 'calcium_fine', 'long_name': 'Calcium (Fine)', 'epa_code': '88111'},
        'CLf': {'name': 'chlorine_fine', 'long_name': 'Chlorine (Fine)', 'epa_code': '88115'},
        'CRf': {'name': 'chromium_fine', 'long_name': 'Chromium (Fine)', 'epa_code': '88112'},
        'CUf': {'name': 'copper_fine', 'long_name': 'Copper (Fine)', 'epa_code': '88114'},
        'FEf': {'name': 'iron_fine', 'long_name': 'Iron (Fine)', 'epa_code': '88126'},
        'PBf': {'name': 'lead_fine', 'long_name': 'Lead (Fine)', 'epa_code': '88128'},
        'MGf': {'name': 'magnesium_fine', 'long_name': 'Magnesium (Fine)', 'epa_code': '88140'},
        'MNf': {'name': 'manganese_fine', 'long_name': 'Manganese (Fine)', 'epa_code': '88132'},
        'NIf': {'name': 'nickel_fine', 'long_name': 'Nickel (Fine)', 'epa_code': '88136'},
        'Pf': {'name': 'phosphorus_fine', 'long_name': 'Phosphorus (Fine)', 'epa_code': '88152'},
        'Kf': {'name': 'potassium_fine', 'long_name': 'Potassium (Fine)', 'epa_code': '88180'},
        'RBf': {'name': 'rubidium_fine', 'long_name': 'Rubidium (Fine)', 'epa_code': '88176'},
        'SEf': {'name': 'selenium_fine', 'long_name': 'Selenium (Fine)', 'epa_code': '88154'},
        'SIf': {'name': 'silicon_fine', 'long_name': 'Silicon (Fine)', 'epa_code': '88165'},
        'NAf': {'name': 'sodium_fine', 'long_name': 'Sodium (Fine)', 'epa_code': '88184'},
        'SRf': {'name': 'strontium_fine', 'long_name': 'Strontium (Fine)', 'epa_code': '88168'},
        'Sf': {'name': 'sulfur_fine', 'long_name': 'Sulfur (Fine)', 'epa_code': '88169'},
        'TIf': {'name': 'titanium_fine', 'long_name': 'Titanium (Fine)', 'epa_code': '88161'},
        'Vf': {'name': 'vanadium_fine', 'long_name': 'Vanadium (Fine)', 'epa_code': '88164'},
        'ZNf': {'name': 'zinc_fine', 'long_name': 'Zinc (Fine)', 'epa_code': '88167'},
        'ZRf': {'name': 'zirconium_fine', 'long_name': 'Zirconium (Fine)', 'epa_code': '88185'},
        'CHLf': {'name': 'chloride_fine', 'long_name': 'Chloride (Fine)', 'epa_code': '88203'},
        'NO3f': {'name': 'nitrate_fine', 'long_name': 'Nitrate (Fine)', 'epa_code': '88306'},
        'N2f': {'name': 'nitrite_fine', 'long_name': 'Nitrite (Fine)', 'epa_code': '88338'},
        'SO4f': {'name': 'sulfate_fine', 'long_name': 'Sulfate (Fine)', 'epa_code': '88403'},
        'OC1f': {
            'name': 'carbon_organic_fraction_1_fine',
            'long_name': 'Carbon, Organic Fraction 1 (Fine)',
            'comments': 'TOR, pure helium (>99.999%) atmosphere, temperature (T) = 140 °C',
            'epa_code': '88324',
        },
        'OC2f': {
            'name': 'carbon_organic_fraction_2_fine',
            'long_name': 'Carbon, Organic Fraction 2 (Fine)',
            'comments': 'TOR, pure helium (>99.999%) atmosphere, temperature (T) = 280 °C',
            'epa_code': '88325',
        },
        'OC3f': {
            'name': 'carbon_organic_fraction_3_fine',
            'long_name': 'Carbon, Organic Fraction 3 (Fine)',
            'comments': 'TOR, pure helium (>99.999%) atmosphere, temperature (T) = 480 °C',
            'epa_code': '88326',
        },
        'OC4f': {
            'name': 'carbon_organic_fraction_4_fine',
            'long_name': 'Carbon, Organic Fraction 4 (Fine)',
            'comments': 'TOR, pure helium (>99.999%) atmosphere, temperature (T) = 580 °C',
            'epa_code': '88327',
        },
        'OPf': {
            'name': 'carbon_organic_reflectance_fine',
            'long_name': 'Carbon, Organic Pyrolized (Fine) by Reflectance',
            'comments': 'TOR, carbon that is measured after the introduction of helium/oxygen atmosphere at °550 C but beforereflectance returns to initial value',
            'epa_code': '88328',
        },
        'OPTf': {
            'name': 'carbon_organic_transmittance_fine',
            'long_name': 'Carbon, Organic Pyrolized (Fine) by Transmittance',
            'comments': 'TOR, carbon that is measured after the introduction of helium/oxygen atmosphere at °550 C but beforetransmittance returns to initial value',
            'epa_code': '88336',
        },
        'OCf': {
            'name': 'carbon_organic_total_fine',
            'long_name': 'Carbon, Organic Total (Fine)',
            'comments': 'Organic carbon from TOR carbon fractions (OC1f+OC2f+OC3f+OC4f+OPf)',
            'epa_code': '88320',
        },
        'EC1f': {
            'name': 'carbon_elemental_fraction_1_fine',
            'long_name': 'Carbon, Elemental Fraction 1 (Fine)',
            'comments': 'TOR, 98% helium, 2% oxygen atmosphere, temperature (T) = 580° C.',
            'epa_code': '88329',
        },
        'EC2f': {
            'name': 'carbon_elemental_fraction_2_fine',
            'long_name': 'Carbon, Elemental Fraction 2 (Fine)',
            'comments': 'TOR, 98% helium, 2% oxygen atmosphere, temperature (T) = 740° C.',
            'epa_code': '88380',
        },
        'EC3f': {
            'name': 'carbon_elemental_fraction_3_fine',
            'long_name': 'Carbon, Elemental Fraction 3 (Fine)',
            'comments': 'TOR, 98% helium, 2% oxygen atmosphere, temperature (T) = 840° C.',
            'epa_code': '88331',
        },
        'ECf': {
            'name': 'carbon_elemental_total_fine',
            'long_name': 'Carbon, Elemental Total (Fine)',
            'comments': 'Elemental carbon from TOR carbon fractions (E1+E2+E3-OP)',
            'epa_code': '88321',
        },
        'fAbs': {
            'name': 'filter_absorption_coeff',
            'long_name': 'Filter Absorption Coefficient',
            'comments': 'A calibrated absorption coefficient measured from a Teflon filter using a hybrid integrating plate and sphere (HIPS) method',
            'epa_code': '63102',
        },
        # NOTE(review): FlowRate carries the same epa_code as fAbs above;
        # confirm against the IMPROVE parameter list.
        'FlowRate': {
            'name': 'flow_rate',
            'long_name': 'Flow Rate',
            'comments': 'The rate of air flow through an air sampling instrument',
            'epa_code': '63102',
        },
        'MF': {
            'name': 'mass_pm2_5',
            'long_name': 'Mass, PM2.5 (Fine)',
            'comments': 'Gravimetric mass measurement for particles with aerodynamic diameters less than 2.5 um',
            'epa_code': '88101',
        },
        'MT': {
            'name': 'mass_pm10',
            'long_name': 'Mass, PM10 (Total)',
            'comments': 'Gravimetric mass measurement for particles with aerodynamic diameters less than 10 um',
            'epa_code': '85101',
        },
        'SampDur': {
            'name': 'sample_duration',
            'long_name': 'Sampling Duration',
            'comments': 'The duration of a given sampling period in minutes',
        },
        'ammNO3f': {
            'name': 'ammonium_nitrate_fine',
            'long_name': 'Ammonium Nitrate (Fine)',
            'comments': '1.29 x  NO3f',
        },
        'ammSO4f': {
            'name': 'ammonium_sulfate_fine',
            'long_name': 'Ammonium Sulfate (Fine)',
            'comments': '1.375 x SO4f',
        },
        'OMCf': {
            'name': 'carbon_organic_mass_fine',
            'long_name': 'Carbon, Organic Mass (fine)(1.8*OC)',
            'comments': '1.8 X OCf',
        },
        'TCf': {
            'name': 'carbon_total_fine',
            'long_name': 'Carbon, Total (fine)',
            'comments': 'From TOR carbon fractions (OCf+ECf)',
        },
        'CM_calculated': {
            'name': 'CM_calculated',
            'long_name': 'Mass, PM10-PM2.5 (Coarse)',
            'comments': 'MT-MF',
        },
        'SeaSaltf': {
            'name': 'sea_salt_fine',
            'long_name': 'Sea Salt (Fine)',
            'comments': '1.8XCHLf',
        },
        'SOILf': {
            'name': 'soil_fine',
            'long_name': 'Soil (Fine)',
            'comments': '2.2 × ALf + 2.49 × SIf + 1.63 × CAf + 2.42 × FEf + 1.94 × TIf',
        },
        'RCFM': {
            'name': 'mass_pm2_5_reconstructed',
            'long_name': 'Mass, PM2.5 Reconstructed (Fine)',
            'comments': 'Sum of ammSO4f, ammNO3f, OMCf, ECf, soilf, and seasaltf.',
        },
        # The output name below is kept as-is (including its spelling) because
        # it is the variable name existing users of the dataset see.
        'RCTM': {
            'name': 'mass_pmi10_reconstructed',
            'long_name': 'Mass, PM10 Reconstructed (Total)',
            'comments': 'Sum of ammSO4f, ammNO3f, OMCf, ECf, soilf, seasaltf, and CM_calculated.',
        },
    }

    # Per-wavelength laser parameters: one mapping entry is generated for every
    # (parameter, wavelength) combination below.
    laser_vars = {
        'RefF': {
            'units': 'ratio',
            'comments': 'Final laser reflectance at ',
            'name': 'final_laser_reflectance_',
        },
        'TransF': {
            'units': 'ratio',
            'comments': 'Final laser transmittance at ',
            'name': 'final_laser_transmittance_',
        },
        'RefI': {
            'units': 'ratio',
            'comments': 'Initial laser reflectance at ',
            'name': 'initial_laser_reflectance_',
        },
        'TransI': {
            'units': 'ratio',
            'comments': 'Initial laser transmittance at ',
            'name': 'initial_laser_transmittance_',
        },
        'RefM': {
            'units': 'ratio',
            'comments': 'Minimum laser reflectance at ',
            'name': 'min_laser_reflectance_',
        },
        'TransM': {
            'units': 'ratio',
            'comments': ' Minimum laser transmittance at ',
            'name': 'min_laser_transmittance_',
        },
        'OP_TR': {
            'units': 'ug m-3',
            'comments': 'Organic Pyrolyzed Carbon by Reflectance at ',
            'name': 'organic_pyrolyzed_carbon_reflectance_',
        },
        'OP_TT': {
            'units': 'ug m-3',
            'comments': 'Organic Pyrolyzed Carbon by Transmittance at ',
            'name': 'organic_pyrolyzed_carbon_transmittance_',
        },
    }
    laser_wl = ['405', '445', '532', '635', '780', '808', '980']
    for code, laser_meta in laser_vars.items():
        for wl in laser_wl:
            if 'OP' not in code:
                # e.g. 'RefF' + '405' -> 'RefF_405'
                key = '_'.join([code, wl])
            else:
                # The wavelength replaces the underscore: 'OP_TR' -> 'OP405TR'
                key = wl.join(code.split('_'))
            mapping[key] = {
                'units': laser_meta['units'],
                'name': laser_meta['name'] + wl,
                'long_name': ' '.join([laser_meta['comments'], wl]),
            }

    # Run through each variable in the dataframe and add it to a dataset
    # along with the appropriate metadata
    site = np.unique(df.Site)[0]
    attrs = {'url': base_url, 'datastream': site + ' IMPROVE'}
    ds = None

    for v in variables:
        # Find data for just the variable in question
        poc_attrs = {'units': '1', 'long_name': 'Parameter Occurrence Code for ' + v}
        df2 = df[df.Param == v]

        # Get metadata
        unit = np.unique(df2.UnitAbbr)
        if len(unit) > 1:
            # NOTE(review): the message says "using first one" but this raises;
            # behavior is left unchanged here.
            raise ValueError('Multiple types of units detected, using first one')

        sites = np.unique(df2.Site)
        if len(sites) > 1:
            raise ValueError('Multiple sites detected, please use only one')

        # Get time, POC, and data
        time = pd.to_datetime(df2.FactDate)
        poc = df2.POC
        data = df2.FactValue

        # Parameter codes that are not in the mapping are kept under their
        # raw code instead of aborting the whole download with a KeyError.
        meta = mapping.get(v, {'name': str(v), 'long_name': str(v)})
        name = meta['name']

        # Set up attributes
        var_attrs = {'units': unit[0], 'long_name': meta['long_name'], '_FillValue': -999.0}
        if 'comments' in meta:
            var_attrs['comments'] = meta['comments']
        if 'epa_code' in meta:
            var_attrs['epa_code'] = meta['epa_code']

        # If the first variable, create the dataset and then add variables to it
        if ds is None:
            ds = xr.Dataset(
                data_vars={name: (['time'], data, var_attrs)},
                coords={'time': time},
                attrs=attrs,
            )
        else:
            ds[name] = xr.DataArray(
                data=data, dims=['time'], coords={'time': time}, attrs=var_attrs
            )
        ds['poc_' + name] = xr.DataArray(
            data=poc, dims=['time'], coords={'time': time}, attrs=poc_attrs
        )

    # Add in metadata from site summary page
    url = 'https://views.cira.colostate.edu/adms/Pub/SiteSummary.aspx?dsidse=10001&siidse=' + str(
        site_id
    )
    tables = pd.read_html(url)

    # The first table is read as (label, value) rows: column 0 holds the label
    # and column 1 the value.
    summary = tables[0]
    for i in summary.index:
        label = summary[0][i]
        value = summary[1][i]
        # Add lat/lon as variables
        if label == 'Latitude':
            attrs = {
                'long_name': 'North latitude',
                'units': 'degree_N',
                'valid_min': -90.0,
                'valid_max': 90.0,
                'standard_name': 'latitude',
            }
            ds['lat'] = xr.DataArray(
                data=float(value),
                dims=['time'],
                coords={'time': ds['time'].values},
                attrs=attrs,
            )
        elif label == 'Longitude':
            attrs = {
                'long_name': 'East longitude',
                'units': 'degree_E',
                'valid_min': -180.0,
                'valid_max': 180.0,
                'standard_name': 'longitude',
            }
            ds['lon'] = xr.DataArray(
                data=float(value),
                dims=['time'],
                coords={'time': ds['time'].values},
                attrs=attrs,
            )
        else:
            ds.attrs[label] = value

    # Add in problem information from the site summary page. The last table is
    # used only if it has the expected event columns; empty cells (NaN) become
    # empty strings so the join cannot fail on non-string values.
    events = tables[-1]
    event_cols = ('EventDate', 'EventType', 'Notes')
    problem_lines = []
    if all(col in events.columns for col in event_cols):
        for i in events.index:
            fields = [
                '' if pd.isna(events[col][i]) else str(events[col][i]) for col in event_cols
            ]
            problem_lines.append('_'.join(fields + ['\n']))
    ds.attrs['site_problems'] = ''.join(problem_lines)

    ds = ds.sortby('time')

    return ds