"""
Functions specifically for working with QC/DQRs from
the Atmospheric Radiation Measurement Program (ARM).
"""
import datetime as dt
import numpy as np
import requests
import json
from dateutil import parser
from act.config import DEFAULT_DATASTREAM_NAME
def _time_offsets_to_timedelta(offsets, unit):
    """
    Convert numeric time offsets into a numpy timedelta64 array.

    Parameters
    ----------
    offsets : numpy.ndarray
        Numeric offsets counted from a reference time.
    unit : str
        numpy datetime unit code the offsets are expressed in
        ('s', 'm', 'h', 'D', 'ms', 'M' or 'Y').

    Returns
    -------
    numpy.ndarray
        Integer offsets (and all month/year offsets) are cast directly to
        ``timedelta64[unit]``. Floating point offsets in fixed-length units
        are scaled to seconds first and truncated to whole seconds, so
        fractional minutes/hours/days are not lost.

    """
    offsets = np.asarray(offsets)
    if unit in ('M', 'Y') or not np.issubdtype(offsets.dtype, np.floating):
        return offsets.astype(f'timedelta64[{unit}]')

    # Dividing two timedelta64 scalars yields a plain float ratio.
    seconds_per_unit = np.timedelta64(1, unit) / np.timedelta64(1, 's')
    return (offsets.astype(np.float64) * seconds_per_unit).astype('timedelta64[s]')


def add_dqr_to_qc(
    ds,
    variable=None,
    assessment='incorrect,suspect',
    exclude=None,
    include=None,
    normalize_assessment=True,
    cleanup_qc=True,
    dqr_link=False,
    skip_location_vars=False,
    create_missing_qc_variables=True,
):
    """
    Function to query the ARM DQR web service for reports and
    add as a new quality control test to ancillary quality control
    variable. If no ancillary quality control variable exists a new
    one will be created and linked to the data variable through
    the ancillary_variables attribute.

    See online documentation from ARM Data
    Quality Office on the use of the DQR web service.

    https://code.arm.gov/docs/dqrws-examples/wikis/home

    Information about the DQR web-service available at
    https://adc.arm.gov/dqrws/

    Parameters
    ----------
    ds : xarray.Dataset
        Xarray dataset
    variable : string, or list of str, or None
        Variables to check DQR web service. If set to None will
        attempt to update all variables.
    assessment : string
        assessment type to get DQRs. Current options include
        'missing', 'suspect', 'incorrect' or any combination separated
        by a comma.
    exclude : list of strings
        DQR IDs to exclude from adding into QC
    include : list of strings
        List of DQR IDs to include in flagging of data. Any other DQR IDs
        will be ignored.
    normalize_assessment : boolean
        The DQR assessment term is different than the embedded QC
        term. Embedded QC uses "Bad" and "Indeterminate" while
        DQRs use "Incorrect" and "Suspect". Setting this will ensure
        the same terms are used for both.
    cleanup_qc : boolean
        Call clean.cleanup() method to convert to standardized ancillary
        quality control variables. Has a little bit of overhead so
        if the Dataset has already been cleaned up, no need to run.
    dqr_link : boolean
        Prints out a link for each DQR to read the full DQR. Defaults to False
    skip_location_vars : boolean
        Does not apply DQRs to location variables. This can be useful in the event
        the submitter has erroneously selected all variables.
    create_missing_qc_variables : boolean
        If a quality control variable for the data variable does not exist,
        create the quality control variable and apply DQR.

    Returns
    -------
    ds : xarray.Dataset
        Xarray dataset containing new or updated quality control variables

    Raises
    ------
    ValueError
        If the Dataset has no usable datastream attribute, if the units of a
        non-datetime time variable cannot be interpreted, or if the web
        service answers with status 400 or 500.

    Examples
    --------
    .. code-block:: python

        from act.qc.arm import add_dqr_to_qc
        ds = add_dqr_to_qc(ds, variable=['temp_mean', 'atmos_pressure'])

    """
    # DQR Webservice goes off datastreams, pull from the dataset
    if 'datastream' in ds.attrs:
        datastream = ds.attrs['datastream']
    elif '_datastream' in ds.attrs:
        datastream = ds.attrs['_datastream']
    else:
        raise ValueError('Dataset does not have datastream attribute')

    if datastream == DEFAULT_DATASTREAM_NAME:
        raise ValueError(
            "'datastream' name required for DQR service set to default value "
            f"{datastream}. Unable to perform DQR service query."
        )

    # Clean up QC to conform to CF conventions
    if cleanup_qc:
        ds.clean.cleanup()

    # Get time from Dataset
    time = ds['time'].values

    # Without any time samples there is nothing a DQR could flag.
    if time.size == 0:
        return ds

    # If the time is not a datetime64 because the read routine was not asked to
    # convert CF variables, convert the time variable for this routine only.
    if not np.issubdtype(time.dtype, np.datetime64):
        unit_prefixes = {
            'seconds since ': 's',
            'minutes since ': 'm',
            'hours since ': 'h',
            'days since ': 'D',
            'milliseconds since ': 'ms',
            'months since ': 'M',
            'years since ': 'Y',
        }
        units = ds['time'].attrs['units']
        td64_string = None
        for prefix, unit_code in unit_prefixes.items():
            if units.startswith(prefix):
                units = units.replace(prefix, '')
                td64_string = unit_code
                break

        if td64_string is None:
            raise ValueError(f"Unable to interpret time units '{units}' for DQR service query.")

        reference = parser.parse(units)
        # numpy only combines month/year offsets with a reference of the same
        # calendar unit. All other units keep the reference at one second
        # resolution so its time of day is not truncated.
        reference_unit = td64_string if td64_string in ('M', 'Y') else 's'
        time = np.datetime64(reference, reference_unit) + _time_offsets_to_timedelta(
            time, td64_string
        )

    start_date = time[0].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')
    end_date = time[-1].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')

    # Clean up assessment to ensure it is a string with no spaces.
    if isinstance(assessment, (list, tuple)):
        assessment = ','.join(assessment)

    # Not strictly needed but keeps the URL tidy.
    assessment = assessment.replace(' ', '').lower()

    # Create URL
    url = (
        'https://dqr-web-service.svcs.arm.gov/dqr_full'
        f"/{datastream}"
        f"/{start_date}/{end_date}"
        f"/{assessment}"
    )

    # Call web service
    # NOTE(review): no timeout is passed, so a stalled connection blocks here.
    req = requests.get(url)

    # Check status values and raise error if not successful
    status = req.status_code
    if status == 400:
        raise ValueError('Check parameters')
    if status == 500:
        raise ValueError('DQR Webservice Temporarily Down')

    # Convert from string to dictionary
    docs = json.loads(req.text)

    # If no DQRs found will not have a key with datastream.
    # The status will also be 404.
    try:
        docs = docs[datastream]
    except KeyError:
        return ds

    dqr_results = {}
    for quality_category, reports in docs.items():
        test_assessment = quality_category.lower().capitalize()
        for dqr_number, report in reports.items():
            if exclude is not None and dqr_number in exclude:
                continue

            if include is not None and dqr_number not in include:
                continue

            # Collect the sample indexes covered by any of the DQR time ranges.
            matches = []
            for time_range in report['dates']:
                starttime = np.datetime64(time_range['start_date'])
                endtime = np.datetime64(time_range['end_date'])
                ind = np.where((time >= starttime) & (time <= endtime))[0]
                if ind.size > 0:
                    matches.append(ind)

            if not matches:
                continue

            dqr_results[dqr_number] = {
                # Overlapping ranges may select a sample twice; de-duplicate
                # once here rather than for every variable below.
                'index': np.unique(np.concatenate(matches)),
                'test_assessment': test_assessment,
                'test_meaning': f"{dqr_number} : {report['description']}",
                'variables': report['variables'],
            }

            if dqr_link:
                print(
                    f"{dqr_number} - {test_assessment}: "
                    f"https://adc.arm.gov/ArchiveServices/DQRService?dqrid={dqr_number}"
                )

    # Check to ensure variable is list
    if variable and not isinstance(variable, (list, tuple)):
        variable = [variable]

    loc_vars = ['lat', 'lon', 'alt', 'latitude', 'longitude', 'altitude']
    time_vars = ['time', 'time_offset', 'time_bounds']
    for value in dqr_results.values():
        for var_name in value['variables']:
            # Do not process on location variables
            if skip_location_vars and var_name in loc_vars:
                continue

            # Do not process time variables
            if var_name in time_vars:
                continue

            # Only process provided variable names
            if variable is not None and var_name not in variable:
                continue

            # Do not process quality control variables as this will create a new
            # quality control variable for the quality control variable.
            try:
                if ds[var_name].attrs['standard_name'] == 'quality_flag':
                    continue
            except KeyError:
                pass

            if (
                create_missing_qc_variables is False
                and ds.qcfilter.check_for_ancillary_qc(var_name, add_if_missing=False) is None
            ):
                continue

            try:
                ds.qcfilter.add_test(
                    var_name,
                    index=value['index'],
                    test_meaning=value['test_meaning'],
                    test_assessment=value['test_assessment'],
                )
            except KeyError:  # Variable name not in Dataset
                continue
            except IndexError:
                print(f"Skipping '{var_name}' DQR application because of IndexError")
                continue

            if normalize_assessment:
                ds.clean.normalize_assessment(variables=var_name)

    return ds