Source code for act.qc.arm

"""
Functions specifically for working with QC/DQRs from
the Atmospheric Radiation Measurement Program (ARM).

"""

import datetime as dt
import numpy as np
import requests
import json
import textwrap
from dateutil import parser

from act.config import DEFAULT_DATASTREAM_NAME


[docs]def add_dqr_to_qc(
    ds,
    variable=None,
    assessment='incorrect,suspect',
    exclude=None,
    include=None,
    normalize_assessment=True,
    cleanup_qc=True,
    dqr_link=False,
    skip_location_vars=False,
    create_missing_qc_variables=True,
):
    """
    Function to query the ARM DQR web service for reports and
    add as a new quality control test to ancillary quality control
    variable. If no anicllary quality control variable exist a new
    one will be created and lined to the data variable through
    ancillary_variables attribure.

    See online documentation from ARM Data
    Quality Office on the use of the DQR web service.

    https://code.arm.gov/docs/dqrws-examples/wikis/home

    Information about the DQR web-service avaible at
    https://adc.arm.gov/dqrws/

    Parameters
    ----------
    ds : xarray.Dataset
        Xarray dataset
    variable : string, or list of str, or None
        Variables to check DQR web service. If set to None will
        attempt to update all variables.
    assessment : string
        assessment type to get DQRs. Current options include
        'missing', 'suspect', 'incorrect' or any combination separated
        by a comma.
    exclude : list of strings
        DQR IDs to exclude from adding into QC
    include : list of strings
        List of DQR IDs to include in flagging of data. Any other DQR IDs
        will be ignored.
    normalize_assessment : boolean
        The DQR assessment term is different than the embedded QC
        term. Embedded QC uses "Bad" and "Indeterminate" while
        DQRs use "Incorrect" and "Suspect". Setting this will ensure
        the same terms are used for both.
    cleanup_qc : boolean
        Call clean.cleanup() method to convert to standardized ancillary
        quality control variables. Has a little bit of overhead so
        if the Dataset has already been cleaned up, no need to run.
    dqr_link : boolean
        Prints out a link for each DQR to read the full DQR.  Defaults to False
    skip_location_vars : boolean
        Does not apply DQRs to location variables.  This can be useful in the event
        the submitter has erroneously selected all variables.
    create_missing_qc_variables : boolean
        If a quality control variable for the data variable does not exist,
        create the quality control varible and apply DQR.

    Returns
    -------
    ds : xarray.Dataset
        Xarray dataset containing new or updated quality control variables

    Examples
    --------
        .. code-block:: python

            from act.qc.arm import add_dqr_to_qc
            ds = add_dqr_to_qc(ds, variable=['temp_mean', 'atmos_pressure'])


    """

    # DQR Webservice goes off datastreams, pull from the dataset
    if 'datastream' in ds.attrs:
        datastream = ds.attrs['datastream']
    elif '_datastream' in ds.attrs:
        datastream = ds.attrs['_datastream']
    else:
        raise ValueError('Dataset does not have datastream attribute')

    if datastream == DEFAULT_DATASTREAM_NAME:
        raise ValueError(
            "'datastream' name required for DQR service set to default value "
            f"{datastream}. Unable to perform DQR service query."
        )

    # Clean up QC to conform to CF conventions
    if cleanup_qc:
        ds.clean.cleanup()

    # Get time from Dataset
    time = ds['time'].values

    # If the time is not a datetime64 because the read routine was not asked to
    # convert CF variables, convert the time variable for this routine only.
    if not np.issubdtype(time.dtype, np.datetime64):
        units_strings = [
            'seconds since ',
            'minutes since ',
            'hours since ',
            'days since ',
            'milliseconds since ',
            'months since ',
            'years since ',
        ]
        td64_strings = ['s', 'm', 'h', 'D', 'ms', 'M', 'Y']
        units = ds['time'].attrs['units']
        for ii, _ in enumerate(units_strings):
            if units.startswith(units_strings[ii]):
                units = units.replace(units_strings[ii], '')
                td64_string = td64_strings[ii]
                break

        start_time = parser.parse(units)
        start_time = np.datetime64(start_time, td64_string)
        time = start_time + ds['time'].values.astype('timedelta64[s]')

    start_date = time[0].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')
    end_date = time[-1].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')

    # Clean up assessment to ensure it is a string with no spaces.
    if isinstance(assessment, (list, tuple)):
        assessment = ','.join(assessment)

    # Not strictly needed but should make things more better.
    assessment = assessment.replace(' ', '')
    assessment = assessment.lower()

    # Create URL
    url = 'https://dqr-web-service.svcs.arm.gov/dqr_full'
    url += f"/{datastream}"
    url += f"/{start_date}/{end_date}"
    url += f"/{assessment}"

    # Call web service
    req = requests.get(url)

    # Check status values and raise error if not successful
    status = req.status_code
    if status == 400:
        raise ValueError('Check parameters')
    if status == 500:
        raise ValueError('DQR Webservice Temporarily Down')

    # Convert from string to dictionary
    docs = json.loads(req.text)

    # If no DQRs found will not have a key with datastream.
    # The status will also be 404.
    try:
        docs = docs[datastream]
    except KeyError:
        return ds

    dqr_results = {}
    for quality_category in docs:
        for dqr_number in docs[quality_category]:
            if exclude is not None and dqr_number in exclude:
                continue

            if include is not None and dqr_number not in include:
                continue

            index = np.array([], dtype=np.int32)
            for time_range in docs[quality_category][dqr_number]['dates']:
                starttime = np.datetime64(time_range['start_date'])
                endtime = np.datetime64(time_range['end_date'])
                ind = np.where((time >= starttime) & (time <= endtime))
                if ind[0].size > 0:
                    index = np.append(index, ind[0])

            if index.size > 0:
                dqr_results[dqr_number] = {
                    'index': index,
                    'test_assessment': quality_category.lower().capitalize(),
                    'test_meaning': f"{dqr_number} : {docs[quality_category][dqr_number]['description']}",
                    'variables': docs[quality_category][dqr_number]['variables'],
                }

            if dqr_link:
                print(
                    f"{dqr_number} - {quality_category.lower().capitalize()}: "
                    f"https://adc.arm.gov/ArchiveServices/DQRService?dqrid={dqr_number}"
                )

    # Check to ensure variable is list
    if variable and not isinstance(variable, (list, tuple)):
        variable = [variable]

    loc_vars = ['lat', 'lon', 'alt', 'latitude', 'longitude', 'altitude']
    for key, value in dqr_results.items():
        for var_name in value['variables']:
            # Do not process on location variables
            if skip_location_vars and var_name in loc_vars:
                continue

            # Do not process time variables
            if var_name in ['time', 'time_offset', 'time_bounds']:
                continue

            # Only process provided variable names
            if variable is not None and var_name not in variable:
                continue

            # Do not process quality control variables as this will create a new
            # quality control variable for the quality control varible.
            try:
                if ds[var_name].attrs['standard_name'] == 'quality_flag':
                    continue
            except KeyError:
                pass

            if (
                create_missing_qc_variables is False
                and ds.qcfilter.check_for_ancillary_qc(var_name, add_if_missing=False) is None
            ):
                continue

            try:
                ds.qcfilter.add_test(
                    var_name,
                    index=np.unique(value['index']),
                    test_meaning=value['test_meaning'],
                    test_assessment=value['test_assessment'],
                )

            except KeyError:  # Variable name not in Dataset
                continue

            except IndexError:
                print(f"Skipping '{var_name}' DQR application because of IndexError")
                continue

            if normalize_assessment:
                ds.clean.normalize_assessment(variables=var_name)

    return ds


[docs]def print_dqr(
    datastream,
    start_date,
    end_date,
    variable=None,
    assessment='incorrect,suspect',
    pretty_print=True,
):
    """
    Function to query the ARM DQR web service for reports and
    print the information out in a "pretty" format or in a format
    conducive for pasting into a spreadsheet.

    Information about the DQR web-service avaible at
    https://adc.arm.gov/dqrws/

    Parameters
    ----------
    datastream : string
        Datastream name to query the API for.  Example: sgpmetE13.b1
    start_date : string
        Start date for querying DQR start dates in YYYYMMDD format.  Example: 20240101
    end_date : string
        End date for querying DQR start dates in YYYYMMDD format.  Example: 20240101
    variable : string
        Variable to query DQRs for.  Will only print out DQRs with similar matches.
    assessment : string
        assessment type to get DQRs. Current options include
        'missing', 'suspect', 'incorrect' or any combination separated
        by a comma.
    pretty_print : boolean
        Set to print out in a "pretty" format that is easy to read as default.
        Set to False will print DQR information out in a format that can be pasted
        into a spreadsheet and converted to columns with a comma delimiter

    Returns
    -------
    dqr_results : dict
        Dictionary of DQR information

    """

    # Create URL
    url = 'https://dqr-web-service.svcs.arm.gov/dqr_full'
    url += f"/{datastream}"
    url += f"/{start_date}/{end_date}"
    url += f"/{assessment}"

    # Call web service
    req = requests.get(url)

    # Check status values and raise error if not successful
    status = req.status_code
    if status == 400:
        raise ValueError('Check parameters')
    if status == 404:
        raise ValueError('Check parameters - No DQRs found')
    if status == 500:
        raise ValueError('DQR Webservice Temporarily Down')

    # Convert from string to dictionary
    docs = json.loads(req.text)

    # If no DQRs found will not have a key with datastream.
    # The status will also be 404.
    try:
        docs = docs[datastream]
    except KeyError:
        return

    # Run through results and print out DQR information
    dqr_results = {}
    for quality_category in docs:
        for dqr_number in docs[quality_category]:
            subject = docs[quality_category][dqr_number]['subject']
            description = docs[quality_category][dqr_number]['description']
            suggestions = docs[quality_category][dqr_number]['suggestions']
            variables = docs[quality_category][dqr_number]['variables']
            if variable is not None:
                if variable not in variables:
                    continue
            dates = docs[quality_category][dqr_number]['dates']
            url = 'https://adc.arm.gov/ArchiveServices/DQRService?dqrid=' + dqr_number
            if pretty_print:
                print('\n')
                print(
                    '\t'.join(
                        ['\033[1m' + dqr_number, datastream, quality_category, subject + '\033[0m']
                    )
                )
                print(url)
                print('\n')
                print('\033[1m' + 'Description:' + '\033[0m')
                [print(t) for t in textwrap.wrap(description, width=100)]
                if suggestions is not None:
                    print('\n')
                    print('\033[1m' + 'Suggestions:' + '\033[0m')
                    [print(t) for t in textwrap.wrap(suggestions, width=100)]

                print('\n')
                print('\033[1m' + 'Variables:' + '\033[0m')
                [print('    ', v) for v in variables]
                print('\n')
                print('\033[1m' + 'Dates:' + '\033[0m')
                print('\t', 'Start Date:', '\t\t', 'End Date')
                [print('\t', d['start_date'], '\t', d['end_date']) for d in dates]
                print('-' * 100)
            else:
                if suggestions is None:
                    suggestions = 'None'
                text = ','.join(
                    [dqr_number, datastream, quality_category, subject, description, suggestions]
                )
                text += ',' + ';'.join(variables)
                text += ','
                [print(text + ','.join([d['start_date'], d['end_date']])) for d in dates]

            dqr_results[dqr_number] = {
                'test_assessment': quality_category,
                'subject': subject,
                'description': description,
                'suggestions': suggestions,
                'variables': docs[quality_category][dqr_number]['variables'],
                'dates': dates,
            }

    # Return base information if needed
    return dqr_results