# Source code for act.discovery.arm

"""Script for downloading data from ARM's Live Data Webservice."""


import json
import os
import textwrap
from datetime import timedelta

try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen

import requests

from act.utils import date_parser

def download_arm_data(username, token, datastream, startdate, enddate, time=None, output=None):
    """
    Download ARM data files through the ARM Live Data Webservice.

    Parameters
    ----------
    username : str
        The username to use for logging into the ADC archive.
    token : str
        The access token for accessing the ADC archive.
    datastream : str
        The name of the datastream to acquire.
    startdate : str
        The start date of the data to acquire. Formats accepted are
        YYYY-MM-DD, DD.MM.YYYY, DD/MM/YYYY, YYYYMMDD, YYYY/MM/DD or
        any of the previous formats with THH:MM:SS added onto the end
        (ex. 2020-09-15T12:00:00). If empty, no start is sent with the query.
    enddate : str
        The end date of the data to acquire. Formats accepted are
        YYYY-MM-DD, DD.MM.YYYY, DD/MM/YYYY, YYYYMMDD or YYYY/MM/DD, or
        any of the previous formats with THH:MM:SS added onto the end
        (ex. 2020-09-15T13:00:00). If empty, no end is sent with the query.
        When it parses to the same instant as *startdate*, 23:59:59 is
        added to it.
    time : str or None
        The specific time. Format is HHMMSS. Only files whose name contains
        this string are downloaded. Set to None to download all files
        in the given date interval.
    output : str
        The output directory for the data. Set to None to make a folder in the
        current working directory with the same name as *datastream* to place
        the files in.

    Returns
    -------
    files : list
        Returns list of files retrieved

    Raises
    ------
    ConnectionRefusedError
        If the query response is an HTML document, which is treated as an
        error with the username or token.

    Notes
    -----
    This programmatic interface allows users to query and automate
    machine-to-machine downloads of ARM data. This tool uses a REST URL and
    specific parameters (saveData, query), user ID and access token, a
    datastream name, a start date, and an end date, and data files matching
    the criteria will be returned to the user and downloaded.

    By using this web service, users can setup cron jobs and automatically
    download data from /data/archive into their workspace. This will also
    eliminate the manual step of following a link in an email to download data.
    All other data files, which are not on the spinning disk (on HPSS), will
    have to go through the regular ordering process. More information about
    this REST API and tools can be found on ARM Live. An access token is
    obtained by logging in or registering with ARM.

    The citation for the datastream is only looked up and printed when both
    *startdate* and *enddate* were provided.

    Author: Michael Giansiracusa

    Examples
    --------
    This code will download the netCDF files from the sgpmetE13.b1 datastream
    and place them in a directory named sgpmetE13.b1. The data from 14 Jan to
    20 Jan 2017 will be downloaded. Replace *userName* and *XXXXXXXXXXXXXXXX*
    with your username and token for ARM Data Discovery. See the Notes for
    information on how to obtain a username and token.

    .. code-block:: python

        act.discovery.download_arm_data(
            "userName", "XXXXXXXXXXXXXXXX", "sgpmetE13.b1", "2017-01-14", "2017-01-20"
        )

    """
    # Parsed dates stay None when the matching argument is empty, so the
    # later steps can tell whether a date was actually supplied.
    start_datetime = None
    end_datetime = None

    # default start and end query fragments are empty
    start, end = '', ''

    # start and end strings for query_url are constructed
    # if the arguments were provided
    if startdate:
        start_datetime = date_parser(startdate, return_datetime=True)
        start = start_datetime.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
        start = f'&start={start}'
    if enddate:
        end_datetime = date_parser(enddate, return_datetime=True)
        # If the start and end date are the same, add (almost) a day to the
        # end date so the query covers that whole day.
        if start_datetime is not None and start_datetime == end_datetime:
            end_datetime += timedelta(hours=23, minutes=59, seconds=59)
        end = end_datetime.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
        end = f'&end={end}'

    credentials = ':'.join([username, token])

    # build the url to query the web service using the arguments provided
    # NOTE(review): the base URL literal is empty in this copy of the source,
    # presumably stripped during extraction -- confirm the query endpoint.
    query_url = ('' + 'user={0}&ds={1}{2}{3}&wt=json').format(
        credentials, datastream, start, end
    )

    # get url response, read the body of the message,
    # and decode from bytes type to utf-8 string
    with urlopen(query_url) as response:
        response_body = response.read().decode('utf-8')

    # if the response is an html doc, then there was an error with the user
    if response_body[1:14] == '!DOCTYPE html':
        raise ConnectionRefusedError('Error with user. Check username or token.')

    # parse into json object
    response_body_json = json.loads(response_body)

    # construct output directory
    if output:
        # output files to directory specified
        output_dir = output
    else:
        # if no folder given, add datastream folder
        # to current working dir to prevent file mix-up
        output_dir = os.path.join(os.getcwd(), datastream)

    if response_body_json is None:
        print('ARM Data Live Webservice does not appear to be functioning')
        return []

    # An error response may not carry a 'files' entry at all; treat that the
    # same as an empty listing instead of failing with a KeyError.
    remote_files = response_body_json.get('files') or []
    file_names = []

    if response_body_json.get('status') == 'success' and remote_files:
        for fname in remote_files:
            if time is not None and time not in fname:
                continue

            # construct link to web service saveData function
            # NOTE(review): base URL literal is empty here as well -- confirm.
            save_data_url = ('' + 'saveData?user={0}&file={1}').format(credentials, fname)
            output_file = os.path.join(output_dir, fname)

            # make directory if it doesn't exist
            os.makedirs(output_dir, exist_ok=True)

            # Fetch first and only then create the file, so a file that is
            # not available does not leave an empty file behind.
            with urlopen(save_data_url) as response:
                data = response.read()
            if b'This data file is not available' in data:
                print(fname + ' is not available for download')
                continue

            print(f'[DOWNLOADING] {fname}')
            with open(output_file, 'wb') as open_bytes_file:
                open_bytes_file.write(data)
            file_names.append(output_file)

        # Get ARM DOI and print it out. The citation lookup needs both dates.
        if start_datetime is not None and end_datetime is not None:
            doi = get_arm_doi(
                datastream,
                start_datetime.strftime('%Y-%m-%d'),
                end_datetime.strftime('%Y-%m-%d'),
            )
            print('\nIf you use these data to prepare a publication, please cite:\n')
            print(textwrap.fill(doi, width=80))
            print('')
    else:
        print(
            'No files returned or url status error.\n'
            'Check datastream name, start, and end date.'
        )

    return file_names
def get_arm_doi(datastream, startdate, enddate):
    """
    Return a citation with DOI, if available, for a datastream and date range.

    Parameters
    ----------
    datastream : str
        The name of the datastream to get a DOI for. This must be ARM standard names
    startdate : str
        Start date for the citation in the format YYYY-MM-DD
    enddate : str
        End date for the citation in the format YYYY-MM-DD

    Returns
    -------
    doi : str
        Returns the citation as a string. If the request itself fails, or the
        response is empty or carries no citation, a short explanatory message
        is returned instead of raising.

    """
    # Get the DOI information
    # NOTE(review): the base URL literal is empty in this copy of the source,
    # presumably stripped during extraction -- confirm the citation endpoint.
    doi_url = '' + datastream + '&citationType=apa'
    doi_url += '&startDate=' + startdate
    doi_url += '&endDate=' + enddate

    try:
        response = requests.get(url=doi_url)
    except (ValueError, requests.exceptions.RequestException) as err:
        # Format the exception instead of concatenating it: str + exception
        # raises TypeError and would hide the real failure.
        return f'Webservice potentially down or arguments are not valid: {err}'

    if not response.text:
        return 'Please check your arguments. No DOI Found'

    try:
        return response.json()['citation']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or the JSON carried no 'citation' entry.
        return 'Please check your arguments. No DOI Found'