Source code for validate.data_loader

"""
data_loader
===============

This module contains functions that will load data from
netCDF files needed to produce plots. It uses various cdo
commands to manipulate the netCDF files if they need to be
processed before the data is extracted.

.. moduleauthor:: David Fallis
"""

import os
from netCDF4 import Dataset, num2date, date2num
import numpy as np
import datetime
from validate.functions import external
from . import constants
import cdo
# Rebind the name ``cdo`` from the imported module to a single Cdo instance;
# every ``cdo.<operator>(...)`` call in this module goes through this object.
cdo = cdo.Cdo()

# NOTE(review): this name is not referenced anywhere else in the visible part
# of this module -- confirm whether other modules read or set it.
preprocessed_data_root = ''

def silent_remove(name):
    """Delete the file ``name``, ignoring any ``OSError``.

    A missing file (or any other OS-level failure to delete it) is
    silently ignored, so the call is safe whether or not the file exists.
    """
    try:
        os.remove(name)
    except OSError:
        # Best effort only: nothing to report when the removal fails.
        return None
    return None
def _check_averaged(ifile):
    """Return True if the netCDF file holds exactly one timestep.

    A file whose ``time`` variable cannot be read at all is also reported
    as time averaged (True).

    Parameters
    ----------
    ifile : string
            path of the netCDF file to inspect
    """
    nc = Dataset(ifile, 'r')
    try:
        try:
            time = nc.variables['time'][:].squeeze()
        except Exception:
            # No usable time axis: treat the file as already averaged.
            return True
        return time.size == 1
    finally:
        # The values were copied into memory above, so the handle can be
        # released on every path.
        nc.close()
def year_mon_day(datestring):
    """Split a ``yyyy-mm-dd`` string into the integers (year, mon, day).

    The month and the day are optional; each one defaults to 1 when the
    string does not contain it.
    """
    fields = datestring.split('-')[:3]
    # Pad the missing month/day fields with '01'.
    fields.extend(['01'] * (3 - len(fields)))
    year, mon, day = (int(field) for field in fields)
    return year, mon, day
def _append_log(text):
    """Append ``text`` to the run log ``logs/log.txt``."""
    with open('logs/log.txt', 'a') as outfile:
        outfile.write(text)


def _check_dates_outside(ifile, start_date, end_date):
    """Check whether the data in ``ifile`` lies outside the desired dates.

    Returns True if the time axis of the file is completely outside of the
    period ``start_date`` .. ``end_date``. Returns False if the periods
    overlap at all; when the file only covers part of the desired period a
    warning is appended to the log.

    Parameters
    ----------
    ifile : string
            path of the netCDF file
    start_date : string
                 first desired date, in the form accepted by year_mon_day
    end_date : string
               last desired date, in the form accepted by year_mon_day
    """
    nc = Dataset(ifile, 'r')
    try:
        nc_time = nc.variables['time']
        cal = getattr(nc_time, 'calendar', 'standard')
        units = nc_time.units
        # Read the axis once and keep only what is needed after closing.
        times = nc_time[:]
        compstart = times[0]
        compend = times[-1]
    finally:
        nc.close()

    # convert the desired dates to datetime objects, then to the numeric
    # representation used by the time axis of the file
    start_dt = datetime.datetime(*year_mon_day(start_date))
    end_dt = datetime.datetime(*year_mon_day(end_date))
    start = date2num(start_dt, units, calendar=cal)
    end = date2num(end_dt, units, calendar=cal)

    if compstart > end or compend < start:
        # there is no overlap
        return True
    if compstart > start or compend < end:
        # partial overlap: usable, but worth recording
        compstart_dt = num2date(compstart, units, calendar=cal)
        compend_dt = num2date(compend, units, calendar=cal)
        _append_log(
            'WARNING: Comparison data does not cover entire time period... Used subset of {}\n'.format(ifile)
            + '\t Datafile dates:\n \t\tstart: \t{} \tend: \t{}\n'.format(compstart_dt, compend_dt)
            + '\t Desired dates:\n \t\tstart: \t{} \tend: \t{}\n\n'.format(start_dt, end_dt))
    return False


def _check_dates(ifile, dates):
    """Log warnings about how the time axis of ``ifile`` matches ``dates``.

    Returns True when the file is time averaged and False otherwise.
    Problems are written to the log and are not raised to the caller: a
    file that lies completely outside the desired period, or one whose
    dates cannot be checked at all, both end in the
    'could not be checked' warning and a False return value.

    Parameters
    ----------
    ifile : string
            path of the netCDF file
    dates : dictionary
            keyword arguments for _check_dates_outside
            (start_date and end_date)
    """
    try:
        if _check_averaged(ifile):
            _append_log('WARNING: Comparison data is time averaged in {}\n'.format(ifile))
            return True
        if _check_dates_outside(ifile, **dates):
            _append_log('WARNING: Comparison data is not from time period in {}\n'.format(ifile))
            # Routed on purpose to the handler below, as in the original
            # implementation, so the second warning is logged as well.
            raise Exception
        return False
    except Exception:
        _append_log('WARNING: Comparison data time period could not be checked in {}\n'.format(ifile))
        return False


def _ncvar(ds, var):
    """Return the variable ``var`` from the open dataset ``ds``.

    The lookup tries ``var``, then ``var.upper()``. If both fail and the
    dataset holds exactly one variable other than lon, lat, time and
    time_bnds, that variable is used and a warning is logged and printed.

    Raises
    ------
    KeyError
        if no variable can be chosen
    """
    try:
        return ds.variables[var]
    except Exception:
        pass
    try:
        # try upper case key
        return ds.variables[var.upper()]
    except Exception:
        pass

    # look at non-dimensional variables,
    # if only one is available, use that
    # else raise an error
    non_dim_vars = [v for v in ds.variables.keys()
                    if v not in ['lon', 'lat', 'time', 'time_bnds']]
    if len(non_dim_vars) == 1:
        nd_var = non_dim_vars[0]
        warning_str = ("\nWARNING: {} was not found in {}".format(var, ds.filepath()) +
                       ", validate defaulted to using only non-dimensional var in file:" +
                       " \n\t {}\n".format(nd_var))
        _append_log(warning_str)
        print(warning_str)
        return ds.variables[nd_var]

    error_str = ("\nError: {} was not found in {}".format(var, ds.filepath()) +
                 " and more than non-dimensional var is present in the file" +
                 " ({}).... validate doesn't know what variable".format(" ".join(list(ds.variables.keys()))) +
                 " to use....aborting...\n")
    _append_log(error_str)
    print(error_str)
    raise KeyError("{} not found in {}".format(var, ds.filepath()))


def _units(ncvar, scale, shift):
    """Build the units label for data computed as ``(raw + shift) * scale``.

    Starts from ``ncvar.units`` (empty string when the attribute is
    missing) and appends a textual description of the shift and scale.
    """
    units = getattr(ncvar, 'units', '')
    if shift < 0:
        units = '(' + units + ' - ' + str(abs(shift)) + ')'
    if shift > 0:
        units = '(' + units + ' + ' + str(shift) + ')'
    if scale != 1:
        units = units + ' * ' + str(scale)
    return units


def _depth(ds, ncvar):
    """Return the rounded values of the vertical axis of ``ncvar``.

    The vertical axis is the first dimension of ``ncvar`` whose coordinate
    variable in ``ds`` has ``axis == 'Z'``; ``[0]`` is used when there is
    no such dimension.
    """
    for dimension in ncvar.dimensions:
        try:
            if ds.variables[dimension].axis == 'Z':
                depth = ds.variables[dimension][:]
                break
        except Exception:
            # keep looping if the dimension has no coordinate variable
            # or no 'axis' attribute
            continue
    else:
        depth = [0]
    return np.round(depth)


def _lon_lat(ds):
    """Return the squeezed (lon, lat) coordinates of ``ds``.

    Longitude is read from 'lon', falling back to 'x'; latitude from
    'lat', falling back to 'y'. A coordinate that cannot be read under
    either name is returned as None.
    """
    def first_available(names):
        for key in names:
            try:
                return ds.variables[key][:].squeeze()
            except Exception:
                continue
        return None

    return first_available(('lon', 'x')), first_available(('lat', 'y'))


def _time(ds, time_averaged):
    """Return the time axis of ``ds`` as a numpy array of datetimes.

    Returns None when ``time_averaged`` is true or when the dataset has no
    'time' variable. The calendar defaults to 'standard' when the time
    variable does not declare one.
    """
    if time_averaged:
        return None
    try:
        nc_time = ds.variables['time']
    except Exception:
        return None
    cal = getattr(nc_time, 'calendar', 'standard')
    stamps = num2date(nc_time[:], nc_time.units, cal)
    # Rebuild plain datetime objects from the first six time-tuple fields.
    return np.array([datetime.datetime(*item.timetuple()[:6]) for item in stamps])
def get_external_function(name):
    """Look up a function of the external module by its name.

    Known names are 'sample', 'field_integral' and 'anomaly'; any other
    name raises KeyError.
    """
    registry = {
        'sample': external.sample,
        'field_integral': external.field_integral,
        'anomaly': external.anomaly,
    }
    return registry[name]
def dataload(ifile, var, dates, realm='atmos', scale=1, shift=0,
             remapf='remapdis', remapgrid='r360x180', seasons=None,
             datatype='full', depthneeded=None, section=False,
             fieldmean=False, gridweights=False, cdostring=None,
             yearmean=False, external_function=None,
             external_function_args=None):
    """
    Manipulates a file using a series of cdo commands which produce
    intermediate files, and returns data about the final file
    produced based on the specified parameters.

    The steps are applied in this order: variable selection, masking,
    setting constants to missing, custom cdo string, remapping, season
    selection, date selection, external function, annual mean, the
    ``datatype`` operation, vertical interpolation, zonal mean and
    field mean.

    Parameters
    ----------
    ifile : string
            the name of the original input file
    var : string
          variable name
    dates : dictionary of the date range as strings of the form 'yyyy-mm'
            start_date and end_date keys should be specified
    realm : string
            realm category (used for masking data)
            default : 'atmos'
    scale : float
            scales the data by this value
            default : 1
    shift : float
            shifts the data by this value (applied before the scaling)
            default : 0
    remapf : string
             name of the cdo remapping
             default : 'remapdis'
    remapgrid : string
                grid to remap the data to
                default : 'r360x180'
    seasons : list of strings
              seasons to be selected out of ['DJF', 'MAM', 'JJA', 'SON']
              None will select all of the seasons
              default : None
    datatype : string
               cdo operation to perform along the time axis
               options are 'climatology', 'trends', 'detrend'
               anything else will not perform any cdo operation
               default : 'full'
    depthneeded : list of floats
                  list of the depths to interpolate the data to in z-axis
                  default : None
    section : boolean
              set to True to take a zonal mean of the data
              default : False
    fieldmean : boolean
                set to True to take a fieldmean of the data
                default : False
    gridweights : boolean
                  set to True to calculate the area weights of each grid cell
                  default : False
    cdostring : string
                custom cdo string to be applied to the input file
                default : None
    yearmean : boolean
               take an annual mean of the data before manipulating
               default : False
    external_function : string
                        name of external function to call
                        default : None
    external_function_args : dictionary
                             keyword arguments to pass to the external
                             function; None means no keyword arguments
                             default : None

    Returns
    -------
    numpy array of final data
    numpy array of longitudinal coordinates
    numpy array of latitudinal coordinates
    numpy array of depths
    string of the units
    numpy array of the time axis
    numpy array of the area weights of the grid cells
    (None unless gridweights is set)
    """
    # None instead of a shared mutable {} default.
    external_kwargs = {} if external_function_args is None else external_function_args

    time_averaged_bool = _check_dates(ifile, dates)

    # Each step returns the name of the file the next step should read.
    sel_var_file = sel_var(ifile, var)
    masked_file = mask(sel_var_file, realm)
    c_file = setc(masked_file, realm)
    if cdostring is not None:
        c_file = cdos(c_file, cdostring)
    remapped_file = remap(c_file, remapf, remapgrid)
    seasonal_file = season(remapped_file, seasons)
    ofile = sel_date(seasonal_file, dates['start_date'], dates['end_date'],
                     time_averaged_bool)

    if external_function is not None:
        ofile = get_external_function(external_function)(ofile, **external_kwargs)

    if yearmean:
        ofile = year_mean(ofile)

    if datatype == 'climatology':
        ofile = time_mean(ofile, time_averaged_bool)
    elif datatype == 'trends':
        ofile = trend(ofile)
    elif datatype == 'detrend':
        ofile = detrend(ofile)

    if depthneeded:
        ofile = intlevel(ofile, depthneeded)
    if section:
        ofile = zonal_mean(ofile)
    if fieldmean:
        ofile = field_mean(ofile)

    dataset = Dataset(ofile, 'r')
    try:
        ncvar = _ncvar(dataset, var)
        rawdata = ncvar[:].squeeze()
        data = (rawdata + shift) * scale
        units = _units(ncvar, scale, shift)
        depth = _depth(dataset, ncvar)
        lon, lat = _lon_lat(dataset)
        time = _time(dataset, time_averaged_bool)
    finally:
        # Everything returned has been read into memory by now.
        dataset.close()

    weights = None
    if gridweights:
        gfile = grid_weights(ofile)
        gdataset = Dataset(gfile, 'r')
        try:
            weights = _ncvar(gdataset, 'cell_weights')[:].squeeze()
        finally:
            gdataset.close()

    return data, lon, lat, depth, units, time, weights
def split(name):
    """Strip the directory part of ``name`` and return the bare file name."""
    return os.path.basename(name)
def sel_date(name, start_date, end_date, time_average=False):
    """Select the period ``start_date`` .. ``end_date`` with ``cdo seldate``.

    Returns ``name`` untouched when ``time_average`` is true. Otherwise
    returns the path of the date-selected file, reusing an existing
    result when already_calculated finds one.
    """
    if time_average:
        return name

    out = 'netcdf/seldate_' + '_'.join([start_date, end_date, split(name)])
    cached = already_calculated(out)
    if cached is None:
        cdo.seldate(start_date + ',' + end_date, input=name, output=out)
        return out
    return cached
def sel_var(name, variable):
    """Extract ``variable`` from the file ``name`` with ``cdo selvar``.

    Returns the path of the resulting file, reusing an existing result
    when already_calculated finds one.
    """
    # NOTE(review): the cached file name depends only on the input file
    # name, not on ``variable`` -- confirm that no input file is ever
    # loaded for two different variables.
    out = 'netcdf/sel_' + split(name)
    cached = already_calculated(out)
    if cached is None:
        cdo.selvar(variable, input=name, output=out)
        return out
    return cached
def mask(name, realm):
    """Mask the file ``name`` according to ``realm`` using ``cdo ifthen``.

    For the 'ocean' realm the mask file 'mask/ocean' is applied, for the
    'land' realm 'mask/land'. Any other realm leaves the data unmasked.

    Returns the path of the masked file (an existing result is reused when
    already_calculated finds one). ``name`` is returned unchanged when the
    realm needs no masking or when the cdo call fails; a failure is
    recorded in the log and any partial output is removed.
    """
    out = 'netcdf/masked_' + split(name)
    cached = already_calculated(out)
    if cached is not None:
        return cached

    if realm == 'ocean':
        maskfile, warning = 'mask/ocean', 'WARNING: Land data was not masked\n'
    elif realm == 'land':
        maskfile, warning = 'mask/land', 'WARNING: Ocean data was not masked\n'
    else:
        return name

    try:
        cdo.ifthen(input=maskfile + ' ' + name, output=out)
    except Exception:
        # Best effort: carry on with the unmasked data, but do not swallow
        # KeyboardInterrupt/SystemExit like a bare except would.
        with open('logs/log.txt', 'a') as outfile:
            outfile.write(warning)
        silent_remove(out)
        return name
    return out
def time_mean(name, time_average=False):
    """Average the file ``name`` over time with ``cdo timmean``.

    Returns ``name`` untouched when ``time_average`` is true. Otherwise
    returns the path of the averaged file, reusing an existing result
    when already_calculated finds one.
    """
    if time_average:
        return name

    out = 'netcdf/climate_' + split(name)
    cached = already_calculated(out)
    if cached is None:
        cdo.timmean(input=name, output=out)
        return out
    return cached
def trend(name):
    """Compute the trend of the file ``name`` with ``cdo trend``.

    cdo writes two files, the intercept and the slope; the path of the
    slope file is returned. An existing slope file is reused when
    already_calculated finds one.
    """
    basename = split(name)
    slope_file = 'netcdf/slope_' + basename
    intercept_file = 'netcdf/intercept_' + basename

    cached = already_calculated(slope_file)
    if cached is None:
        cdo.trend(input=name, output=intercept_file + ' ' + slope_file)
        return slope_file
    return cached
def detrend(name):
    """Detrend the file ``name`` with ``cdo detrend``.

    Returns the path of the detrended file, reusing an existing result
    when already_calculated finds one.
    """
    out = 'netcdf/detrend_' + split(name)
    cached = already_calculated(out)
    if cached is None:
        cdo.detrend(input=name, output=out)
        return out
    return cached
def setc(name, realm='ocean'):
    """Set the constant 0 to missing in ``name`` with ``cdo setctomiss``.

    Files of the 'atmos' realm are returned untouched. Otherwise the path
    of the processed file is returned, reusing an existing result when
    already_calculated finds one.
    """
    if realm == 'atmos':
        return name

    out = 'netcdf/setc_' + split(name)
    cached = already_calculated(out)
    if cached is None:
        cdo.setctomiss(0, input=name, output=out)
        return out
    return cached
def get_remap_function(remap):
    """Return the cdo remapping function with the given name.

    Parameters
    ----------
    remap : string
            one of 'remapbil', 'remapbic', 'remapdis', 'remapnn',
            'remapcon', 'remapcon2' or 'remaplaf'. The misspelt
            'remapplaf' is still accepted for backward compatibility.

    Raises
    ------
    KeyError
        if ``remap`` is not one of the names above
    """
    operators = {
        'remapbil': 'remapbil',
        'remapbic': 'remapbic',
        'remapdis': 'remapdis',
        'remapnn': 'remapnn',
        'remapcon': 'remapcon',
        'remapcon2': 'remapcon2',
        'remaplaf': 'remaplaf',
        # historical misspelling of the key, kept so old configurations work
        'remapplaf': 'remaplaf',
    }
    # Resolve only the requested operator on the Cdo instance.
    return getattr(cdo, operators[remap])
def remap(name, remapname, remapgrid):
    """Remap the file ``name`` onto ``remapgrid`` with the cdo operator
    named ``remapname``.

    Returns the path of the remapped file, reusing an existing result
    when already_calculated finds one. If the cdo call fails, any partial
    output is removed and ``name`` is returned unchanged.

    Raises
    ------
    KeyError
        if ``remapname`` is not known to get_remap_function
    """
    out = ('netcdf/' + remapname + '-' + os.path.basename(remapgrid)
           + '_' + split(name))
    cached = already_calculated(out)
    if cached is not None:
        return cached

    remap_operator = get_remap_function(remapname)
    try:
        remap_operator(remapgrid, input=name, output=out)
    except Exception:
        # Best effort: fall back to the file as it is, and make sure a
        # half-written output is not mistaken for a cached result later.
        silent_remove(out)
        return name
    return out
def field_mean(name):
    """Take the field mean of the file ``name`` with ``cdo fldmean``.

    Returns the path of the resulting file, reusing an existing result
    when already_calculated finds one.
    """
    out = 'netcdf/fldmean_' + split(name)
    cached = already_calculated(out)
    if cached is None:
        cdo.fldmean(input=name, output=out)
        return out
    return cached
def zonal_mean(name):
    """Take the zonal mean of the file ``name`` with ``cdo zonmean``.

    Returns the path of the resulting file, reusing an existing result
    when already_calculated finds one.
    """
    out = 'netcdf/zonmean_' + split(name)
    cached = already_calculated(out)
    if cached is None:
        cdo.zonmean(input=name, output=out)
        return out
    return cached
def depthstring(depthlist):
    """Format the depths as a comma separated string.

    Every depth is written with two decimal places,
    e.g. [1, 2.5] becomes '1.00,2.50'.
    """
    return ','.join("%.2f" % number for number in depthlist)
def intlevel(name, depthlist):
    """Interpolate the file ``name`` to the depths in ``depthlist`` with
    ``cdo intlevelx``.

    Returns ``name`` unchanged when no depths are requested (None, [],
    [""] or [None]) or when the cdo call fails; in the latter case any
    partial output is removed. Otherwise returns the path of the
    interpolated file, reusing an existing result when already_calculated
    finds one.
    """
    if depthlist is None or depthlist == [] or depthlist == [""] or depthlist == [None]:
        return name

    depth = depthstring(depthlist)
    if not depth:
        return name

    # The depths are part of the output file name; keep that name short.
    depthname = depth.replace(' ', '')
    if len(depthname) > 100:
        depthname = depthname[:99]
    out = 'netcdf/level-' + str(depthname) + '_' + split(name)

    cached = already_calculated(out)
    if cached is not None:
        return cached

    try:
        cdo.intlevelx(str(depth), input=name, output=out)
    except Exception:
        # Best effort: continue with the data on its original levels, and
        # make sure a half-written output is not mistaken for a cached
        # result later (same clean-up as remap and mask).
        silent_remove(out)
        return name
    return out
def season(name, seasonlist):
    """Select the seasons in ``seasonlist`` from ``name`` with ``cdo selseas``.

    Returns ``name`` untouched when ``seasonlist`` is None or is exactly
    ['DJF', 'MAM', 'JJA', 'SON']. Otherwise returns the path of the
    resulting file, reusing an existing result when already_calculated
    finds one.
    """
    all_seasons = ['DJF', 'MAM', 'JJA', 'SON']
    if seasonlist is None or seasonlist == all_seasons:
        return name

    out = 'netcdf/selseason-' + ''.join(seasonlist) + '_' + split(name)
    cached = already_calculated(out)
    if cached is None:
        cdo.selseas(','.join(seasonlist), input=name, output=out)
        return out
    return cached
def cdos(name, string):
    """Apply the custom cdo operator chain ``string`` to the file ``name``.

    Returns ``name`` untouched when ``string`` is empty or None. Otherwise
    the shell command ``cdo <string> <name> <out>`` is run, unless the
    output file already exists, and the output path is returned.
    """
    if not string:
        return name

    out = 'netcdf/cdo_' + split(name)
    # NOTE(review): the cached file name does not depend on ``string`` and
    # the exit status of the command is not checked -- confirm that this
    # is acceptable for the callers.
    if not os.path.isfile(out):
        command = ' '.join(['cdo', string, name, out])
        os.system(command)
    return out
def grid_weights(name):
    """Compute the grid cell weights of ``name`` with ``cdo gridweights``.

    Returns the path of the resulting file, reusing an existing result
    when already_calculated finds one.
    """
    out = 'netcdf/gridweights_' + split(name)
    cached = already_calculated(out)
    if cached is None:
        cdo.gridweights(input=name, output=out)
        return out
    return cached
def year_mean(name):
    """Take the annual mean of the file ``name`` with ``cdo yearmean``.

    Returns the path of the resulting file, reusing an existing result
    when already_calculated finds one.
    """
    out = 'netcdf/yearmean_' + split(name)
    cached = already_calculated(out)
    if cached is None:
        cdo.yearmean(input=name, output=out)
        return out
    return cached
def already_calculated(name):
    """Return the path of an existing copy of the file ``name``, if any.

    The file is looked for at ``name`` itself first, then under the same
    file name inside ``constants.processed_cmip5_root``.

    Returns
    -------
    string path of the file that was found, or None when there is no such
    file or no processed root is configured
    """
    if os.path.isfile(name):
        return name

    # A missing or None root simply means there is no precomputed directory.
    processed_root = getattr(constants, 'processed_cmip5_root', None)
    if processed_root is None:
        return None

    precalc = processed_root + '/' + split(name)
    if os.path.isfile(precalc):
        return precalc
    return None
if __name__ == "__main__": pass  # running this module as a script does nothing