Source code for act.qc.arm

"""
Functions specifically for working with QC/DQRs from
the Atmospheric Radiation Measurement Program (ARM).

"""

import datetime as dt
import numpy as np
import requests
import json
from dateutil import parser

from act.config import DEFAULT_DATASTREAM_NAME


def add_dqr_to_qc(
    ds,
    variable=None,
    assessment='incorrect,suspect',
    exclude=None,
    include=None,
    normalize_assessment=True,
    cleanup_qc=True,
    dqr_link=False,
    skip_location_vars=False,
    create_missing_qc_variables=True,
):
    """
    Function to query the ARM DQR web service for reports and add as a new
    quality control test to ancillary quality control variable. If no
    ancillary quality control variable exists a new one will be created and
    linked to the data variable through the ancillary_variables attribute.

    See online documentation from ARM Data Quality Office on the use of the
    DQR web service.

    https://code.arm.gov/docs/dqrws-examples/wikis/home

    Information about the DQR web-service available at
    https://adc.arm.gov/dqrws/

    Parameters
    ----------
    ds : xarray.Dataset
        Xarray dataset
    variable : string, or list of str, or None
        Variables to check DQR web service. If set to None will
        attempt to update all variables.
    assessment : string
        assessment type to get DQRs. Current options include
        'missing', 'suspect', 'incorrect' or any combination separated
        by a comma. A list or tuple of strings is also accepted and is
        joined with commas.
    exclude : list of strings
        DQR IDs to exclude from adding into QC
    include : list of strings
        List of DQR IDs to include in flagging of data. Any other DQR IDs
        will be ignored.
    normalize_assessment : boolean
        The DQR assessment term is different than the embedded QC
        term. Embedded QC uses "Bad" and "Indeterminate" while
        DQRs use "Incorrect" and "Suspect". Setting this will ensure
        the same terms are used for both.
    cleanup_qc : boolean
        Call clean.cleanup() method to convert to standardized ancillary
        quality control variables. Has a little bit of overhead so
        if the Dataset has already been cleaned up, no need to run.
    dqr_link : boolean
        Prints out a link for each DQR to read the full DQR. Defaults to False
    skip_location_vars : boolean
        Does not apply DQRs to location variables. This can be useful
        in the event the submitter has erroneously selected all variables.
    create_missing_qc_variables : boolean
        If a quality control variable for the data variable does not exist,
        create the quality control variable and apply DQR.

    Returns
    -------
    ds : xarray.Dataset
        Xarray dataset containing new or updated quality control variables

    Raises
    ------
    ValueError
        If the dataset has no 'datastream' or '_datastream' attribute, if the
        datastream is set to the default placeholder name, if a non-datetime
        time variable has a 'units' attribute with an unrecognized prefix, or
        if the web service answers with HTTP status 400 or 500.

    Examples
    --------
    .. code-block:: python

        from act.qc.arm import add_dqr_to_qc
        ds = add_dqr_to_qc(ds, variable=['temp_mean', 'atmos_pressure'])

    """

    # DQR Webservice goes off datastreams, pull from the dataset
    if 'datastream' in ds.attrs:
        datastream = ds.attrs['datastream']
    elif '_datastream' in ds.attrs:
        datastream = ds.attrs['_datastream']
    else:
        raise ValueError('Dataset does not have datastream attribute')

    if datastream == DEFAULT_DATASTREAM_NAME:
        raise ValueError(
            "'datastream' name required for DQR service set to default value "
            f"{datastream}. Unable to perform DQR service query."
        )

    # Clean up QC to conform to CF conventions
    if cleanup_qc:
        ds.clean.cleanup()

    # Get time from Dataset
    time = ds['time'].values

    # If the time is not a datetime64 because the read routine was not asked to
    # convert CF variables, convert the time variable for this routine only.
    if not np.issubdtype(time.dtype, np.datetime64):
        # CF "<unit> since <epoch>" prefix -> numpy datetime64/timedelta64 unit code.
        unit_codes = {
            'seconds since ': 's',
            'minutes since ': 'm',
            'hours since ': 'h',
            'days since ': 'D',
            'milliseconds since ': 'ms',
            'months since ': 'M',
            'years since ': 'Y',
        }
        units = ds['time'].attrs['units']
        td64_string = None
        for prefix, code in unit_codes.items():
            if units.startswith(prefix):
                units = units.replace(prefix, '')
                td64_string = code
                break

        # Without a recognized prefix there is no way to interpret the offsets;
        # fail with a clear message instead of an unbound-variable error.
        if td64_string is None:
            raise ValueError(
                f"Unable to interpret time units '{units}'. Expected units to start "
                f"with one of {list(unit_codes)}."
            )

        start_time = parser.parse(units)
        start_time = np.datetime64(start_time, td64_string)
        # The offsets must be cast with the unit named in the units attribute;
        # casting everything to seconds misplaces any non-second time axis.
        time = start_time + ds['time'].values.astype(f'timedelta64[{td64_string}]')

    start_date = time[0].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')
    end_date = time[-1].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')

    # Clean up assessment to ensure it is a string with no spaces.
    if isinstance(assessment, (list, tuple)):
        assessment = ','.join(assessment)

    # Not strictly needed, but keeps the URL tidy and consistently cased.
    assessment = assessment.replace(' ', '')
    assessment = assessment.lower()

    # Create URL
    url = 'https://dqr-web-service.svcs.arm.gov/dqr_full'
    url += f"/{datastream}"
    url += f"/{start_date}/{end_date}"
    url += f"/{assessment}"

    # Call web service
    req = requests.get(url)

    # Check status values and raise error if not successful
    status = req.status_code
    if status == 400:
        raise ValueError('Check parameters')
    if status == 500:
        raise ValueError('DQR Webservice Temporarily Down')

    # Convert from string to dictionary
    docs = json.loads(req.text)

    # If no DQRs found will not have a key with datastream.
    # The status will also be 404.
    try:
        docs = docs[datastream]
    except KeyError:
        return ds

    # Collect, per DQR, the time indexes it covers plus the text used for the
    # new QC test.
    dqr_results = {}
    for quality_category, reports in docs.items():
        category_label = quality_category.lower().capitalize()
        for dqr_number, report in reports.items():
            if exclude is not None and dqr_number in exclude:
                continue

            if include is not None and dqr_number not in include:
                continue

            matches = []
            for time_range in report['dates']:
                starttime = np.datetime64(time_range['start_date'])
                endtime = np.datetime64(time_range['end_date'])
                ind = np.where((time >= starttime) & (time <= endtime))
                if ind[0].size > 0:
                    matches.append(ind[0])

            # DQRs that do not overlap the Dataset times are ignored.
            if not matches:
                continue

            dqr_results[dqr_number] = {
                'index': np.concatenate(matches),
                'test_assessment': category_label,
                'test_meaning': f"{dqr_number} : {report['description']}",
                'variables': report['variables'],
            }

            if dqr_link:
                print(
                    f"{dqr_number} - {category_label}: "
                    f"https://adc.arm.gov/ArchiveServices/DQRService?dqrid={dqr_number}"
                )

    # Check to ensure variable is list
    if variable and not isinstance(variable, (list, tuple)):
        variable = [variable]

    loc_vars = ['lat', 'lon', 'alt', 'latitude', 'longitude', 'altitude']
    time_vars = ['time', 'time_offset', 'time_bounds']
    for value in dqr_results.values():
        # Overlapping date ranges can repeat an index, so de-duplicate once per DQR.
        dqr_index = np.unique(value['index'])
        for var_name in value['variables']:
            # Do not process on location variables
            if skip_location_vars and var_name in loc_vars:
                continue

            # Do not process time variables
            if var_name in time_vars:
                continue

            # Only process provided variable names
            if variable is not None and var_name not in variable:
                continue

            # A DQR may list variables that are not in this Dataset. Skip them
            # up front so none of the lookups below is attempted on a missing name.
            if var_name not in ds:
                continue

            # Do not process quality control variables as this will create a new
            # quality control variable for the quality control variable.
            if ds[var_name].attrs.get('standard_name') == 'quality_flag':
                continue

            if (
                create_missing_qc_variables is False
                and ds.qcfilter.check_for_ancillary_qc(var_name, add_if_missing=False) is None
            ):
                continue

            try:
                ds.qcfilter.add_test(
                    var_name,
                    index=dqr_index,
                    test_meaning=value['test_meaning'],
                    test_assessment=value['test_assessment'],
                )
            except KeyError:  # Variable name not in Dataset
                continue
            except IndexError:
                print(f"Skipping '{var_name}' DQR application because of IndexError")
                continue

            if normalize_assessment:
                ds.clean.normalize_assessment(variables=var_name)

    return ds