Source code for validate.data_loader

"""
data_loader
===============

This module contains functions that will load data from
netCDF files needed to produce plots. It uses various cdo
commands to manipulate the netCDF files if they need to be
processed before the data is extracted.

.. moduleauthor:: David Fallis
"""

import os
from netCDF4 import Dataset, num2date, date2num
import numpy as np
import datetime
from validate.functions import external
from . import constants
import cdo
# Rebind the name ``cdo`` from the imported module to a ``Cdo`` instance;
# every ``cdo.<operator>(...)`` call in this module goes through this object.
cdo = cdo.Cdo()

# Not referenced by any function visible in this module.
preprocessed_data_root = ''

def silent_remove(name):
    """ Deletes the file `name`; an OSError from the removal (for example
        because the file does not exist) is ignored. Returns nothing.
    """
    try:
        os.remove(name)
    except OSError:
        return

def _check_averaged(ifile):
    """ Returns True if there is only one timestep in the netcdf file

    Parameters
    ----------
    ifile : string
            path of the netCDF file to inspect

    Returns
    -------
    boolean
        True when the 'time' variable holds a single value, or when the
        'time' variable could not be read at all; False otherwise.
    """
    nc = Dataset(ifile, 'r')
    try:
        time = nc.variables['time'][:].squeeze()
    except Exception:
        # best effort: a file without a readable time axis is treated
        # as already being time averaged
        return True
    finally:
        # release the file handle on every path out of the function
        nc.close()
    return time.size == 1


def year_mon_day(datestring):
    """ Separates a string of the form yyyy-mm-dd into three integers
        and returns the tuple year,mon,day

    The month and the day are optional: 'yyyy' and 'yyyy-mm' are accepted
    and every missing field defaults to 1.

    Parameters
    ----------
    datestring : string
                 date with its fields separated by '-'

    Returns
    -------
    tuple of three integers (year, mon, day)

    Raises
    ------
    ValueError
        if one of the fields that is present is not an integer
    """
    fields = datestring.split('-')
    # pad with the default month/day so three fields are always available
    fields += ['01'] * (3 - len(fields))
    year, mon, day = fields[:3]
    return int(year), int(mon), int(day)


def _check_dates_outside(ifile, start_date, end_date):
    """ Checks if the comparison data is outside of the dates for the plot
        Returns True if the dates of the data are completely outside of the
        desired dates.
        Returns False if the dates overlap at all, but writes a warning to
        logs/log.txt if the file only covers a subset of the desired dates.

    Parameters
    ----------
    ifile : string
            path of the netCDF file to inspect
    start_date : string
                 first desired date, in a form accepted by year_mon_day
    end_date : string
               last desired date, in a form accepted by year_mon_day
    """
    nc = Dataset(ifile, 'r')
    try:
        nc_time = nc.variables['time']
        # files without a calendar attribute are read as 'standard'
        cal = getattr(nc_time, 'calendar', 'standard')
        units = nc_time.units

        # convert dates to datetime object
        start_dt = datetime.datetime(*year_mon_day(start_date))
        end_dt = datetime.datetime(*year_mon_day(end_date))
        # convert datetime objects to numbers on the time axis of the file
        start = date2num(start_dt, units, calendar=cal)
        end = date2num(end_dt, units, calendar=cal)

        # get start and end dates of file (the axis is read only once)
        times = nc_time[:]
        compstart = times[0]
        compend = times[-1]
    finally:
        # everything needed has been copied out of the file by now
        nc.close()

    # make comparison
    if compstart > end or compend < start:
        # there is no overlap
        return True
    if compstart > start or compend < end:
        compstart_dt = num2date(compstart, units, calendar=cal)
        compend_dt = num2date(compend, units, calendar=cal)
        with open('logs/log.txt', 'a') as outfile:
            outfile.write('WARNING: Comparison data does not cover entire time period... Used subset of {}\n'.format(ifile))
            outfile.write('\t Datafile dates:\n \t\tstart: \t{} \tend: \t{}\n'.format(compstart_dt, compend_dt))
            outfile.write('\t Desired dates:\n \t\tstart: \t{} \tend: \t{}\n\n'.format(start_dt, end_dt))
    return False


def _check_dates(ifile, dates):
    """ Writes warnings to logs/log.txt if the desired dates are not
        within the date bounds of the file and reports whether the file
        is time averaged.

    A failed check is never propagated to the caller: the exception raised
    for data outside of the desired period, like any other error raised
    while checking, is handled inside this function by logging that the
    time period could not be checked.

    Parameters
    ----------
    ifile : string
            path of the netCDF file to check
    dates : dictionary
            passed as keyword arguments to _check_dates_outside
            (start_date and end_date keys)

    Returns
    -------
    boolean
        True if _check_averaged reports the file as time averaged,
        False in every other case.
    """
    try:
        if _check_averaged(ifile):
            with open('logs/log.txt', 'a') as outfile:
                outfile.write('WARNING: Comparison data is time averaged in {}\n'.format(ifile))
            return True
        elif _check_dates_outside(ifile, **dates):
            with open('logs/log.txt', 'a') as outfile:
                outfile.write('WARNING: Comparison data is not from time period in {}\n'.format(ifile))
            # handled by the except clause below, which adds its own log line
            raise Exception('comparison data is outside of the desired dates')
        return False
    except Exception:
        with open('logs/log.txt', 'a') as outfile:
            outfile.write('WARNING: Comparison data time period could not be checked in {}\n'.format(ifile))
        return False


def _ncvar(ds, var):
    try:
        ncvar = ds.variables[var]
    except:
        try:
            # try upper case key
            varu = var.upper()
            ncvar = ds.variables[varu]
        except:
            # look at non-dimensional variables,
            #   if only one is available, use that
            #   else raise an error
            non_dim_vars = [ v for v in list(ds.variables.keys()) if v not in ['lon','lat','time','time_bnds'] ]
            if len(non_dim_vars) == 1:
                nd_var = non_dim_vars[0]
                warning_str = ("\nWARNING: {} was not found in {}".format(var,ds.filepath()) + 
                                ", validate defaulted to using only non-dimensional var in file:" +
                                " \n\t {}\n".format(nd_var))
                with open('logs/log.txt','a') as outfile:
                    outfile.write(warning_str)
                print(warning_str)
                ncvar = ds.variables[nd_var]
            else:
                error_str = ("\nError: {} was not found in {}".format(var,ds.filepath()) +
                             " and more than non-dimensional var is present in the file" +
                             " ({}).... validate doesn't know what variable".format(" ".join(list(ds.variables.keys()))) +
                             " to use....aborting...\n")
                with open('logs/log.txt','a') as outfile:
                    outfile.write(error_str)
                print(error_str)
                raise KeyError("{} not found in {}".format(var,ds.filepath()))
    return ncvar

def _units(ncvar, scale, shift):
    try:
        units = ncvar.units
    except:
        units = ''

    if shift < 0:
        units = '(' + units + ' - ' + str(abs(shift)) + ')'
    if shift > 0:
        units = '(' + units + ' + ' + str(shift) + ')'   
    if scale != 1:
        units = units + ' * ' + str(scale)
       
    return units

def _depth(ds, ncvar):
    for dimension in ncvar.dimensions:
        try:
            if ds.variables[dimension].axis == 'Z':
                depth = ds.variables[dimension][:]
                break
        except:
            # keep looping if the dimension doen't have an 'axis' attribute
            pass
    else:
        depth = [0]
    return np.round(depth)

def _lon_lat(ds):
    try:
        lon = ds.variables['lon'][:].squeeze()
    except:
        try:
            lon = ds.variables['x'][:].squeeze()
        except:
            lon = None
    try:
        lat = ds.variables['lat'][:].squeeze()
    except:
        try:
            lat = ds.variables['y'][:].squeeze()
        except:
            lat = None
    return lon, lat


def _time(ds, time_averaged):
    if time_averaged:
        return None
    try:
        nc_time = ds.variables['time']
    except:
        return None
    try:
        cal = nc_time.calendar
    except:
        cal = 'standard'
    x = num2date(nc_time[:], nc_time.units, cal)
    x = [datetime.datetime(*item.timetuple()[:6]) for item in x]
    x = np.array(x)
    return x

def get_external_function(name):
    """ Looks up a function of the external module by its name.

    The recognised names are 'sample', 'field_integral' and 'anomaly';
    any other name raises a KeyError.
    """
    available = {
        'sample': external.sample,
        'field_integral': external.field_integral,
        'anomaly': external.anomaly,
    }
    return available[name]
    
def dataload(ifile, var, dates, realm='atmos', scale=1, shift=0,
             remapf='remapdis', remapgrid='r360x180', seasons=None,
             datatype='full', depthneeded=None, section=False, fieldmean=False, gridweights=False,
             cdostring=None, yearmean=False, external_function=None, external_function_args=None):

    """ Manipulates a file using a series of cdo commands which produce intermediate files,
        and returns data about the final file produced based on the specified parameters.

    Parameters
    ----------
    ifile : string
            the name of the original input file
    var : string
          variable name
    dates : dictionary of the date range as strings of the form 'yyyy-mm'
            start_date and end_date keys should be specified
    realm : string
            realm category (used for masking data)
            default : 'atmos'
    scale : float
            scales the data by this value
            default : 1
    shift : float
            shifts the data by this value (applied before the scaling)
            default : 0
    remapf : string
             name of the cdo remapping
             default : 'remapdis'
    remapgrid : string
                grid to remap the data to
                default : 'r360x180'
    seasons : list of strings
              seasons to be selected out of ['DJF', 'MAM', 'JJA', 'SON']
              None will select all of the seasons
              default : None
    datatype : string
               cdo operation to perform along the time axis
               options are 'climatology', 'trends', 'detrend'
               anything else will not perform any cdo operation
               default : 'full'
    depthneeded : list of floats
                  list of the depths to interpolate the data to in z-axis
                  default : None
    section : boolean
              set to True to take a zonal mean of the data
              default : False
    fieldmean : boolean
                set to True to take a fieldmean of the data
                default : False
    gridweights : boolean
                  set to True to calculate the area weights of each grid cell
                  default : False
    cdostring : string
                custom cdo string to be applied to the input file
                default : None
    yearmean : boolean
               take an annual mean of the data before the datatype operation
               default : False
    external_function : string
                        name of external function to call
                        default : None
    external_function_args : dictionary
                             keyword arguments to pass to the external function
                             default : None (no keyword arguments)

    Returns
    -------
    numpy array of final data
    numpy array of longitudinal coordinates
    numpy array of latitudinal coordinates
    numpy array of depths
    string of the units
    numpy array of the time axis
    numpy array of the area weights of the grid cells (None unless gridweights is set)
    """
    if external_function_args is None:
        external_function_args = {}

    time_averaged_bool = _check_dates(ifile, dates)

    sel_var_file = sel_var(ifile, var)
    masked_file = mask(sel_var_file, realm)
    c_file = setc(masked_file, realm)

    if cdostring is not None:
        c_file = cdos(c_file, cdostring)

    remapped_file = remap(c_file, remapf, remapgrid)
    seasonal_file = season(remapped_file, seasons)
    ofile = sel_date(seasonal_file, dates['start_date'], dates['end_date'], time_averaged_bool)

    if external_function is not None:
        ofile = get_external_function(external_function)(ofile, **external_function_args)

    if yearmean:
        ofile = year_mean(ofile)

    if datatype == 'climatology':
        ofile = time_mean(ofile, time_averaged_bool)
    elif datatype == 'trends':
        ofile = trend(ofile)
    elif datatype == 'detrend':
        ofile = detrend(ofile)

    if depthneeded:
        ofile = intlevel(ofile, depthneeded)

    if section:
        ofile = zonal_mean(ofile)

    if fieldmean:
        ofile = field_mean(ofile)

    dataset = Dataset(ofile, 'r')
    try:
        ncvar = _ncvar(dataset, var)
        rawdata = ncvar[:].squeeze()
        data = (rawdata + shift) * scale
        units = _units(ncvar, scale, shift)
        depth = _depth(dataset, ncvar)
        lon, lat = _lon_lat(dataset)
        time = _time(dataset, time_averaged_bool)
    finally:
        # all values have been read into memory, release the file handle
        dataset.close()

    weights = None
    if gridweights:
        gfile = grid_weights(ofile)
        gdataset = Dataset(gfile, 'r')
        try:
            gncvar = _ncvar(gdataset, 'cell_weights')
            weights = gncvar[:].squeeze()
        finally:
            gdataset.close()
    return data, lon, lat, depth, units, time, weights


def split(name):
    """ Strips the directory part from `name` and returns what is left
        (the text after the last path separator).
    """
    return os.path.basename(name)

def sel_date(name, start_date, end_date, time_average=False):
    """ Runs cdo.seldate on `name` for the range 'start_date,end_date'.

    Returns `name` untouched when `time_average` is set. Otherwise returns
    the path of the output file 'netcdf/seldate_<start>_<end>_<file>', or
    the path reported by already_calculated when that output is available
    already.
    """
    if time_average:
        return name
    out = '_'.join(['netcdf/seldate', start_date, end_date, split(name)])
    cached = already_calculated(out)
    if cached is not None:
        return cached
    cdo.seldate(','.join([start_date, end_date]), input=name, output=out)
    return out
    
def sel_var(name, variable):
    """ Runs cdo.selvar for `variable` on `name`.

    Returns the path of the output file 'netcdf/sel_<file>', or the path
    reported by already_calculated when that output is available already.
    """
    # NOTE(review): the output name does not contain `variable`, so two
    # variables taken from the same input share one cached file -- confirm
    # that callers never do this.
    out = 'netcdf/sel_' + split(name)
    cached = already_calculated(out)
    if cached is not None:
        return cached
    cdo.selvar(variable, input=name, output=out)
    return out

def mask(name, realm):
    """ Masks `name` with cdo.ifthen using the mask file of the realm.

    Only the realms 'ocean' and 'land' are masked (with 'mask/ocean' and
    'mask/land'); for any other realm `name` is returned without looking
    at the cache, so that a masked file left behind by another realm is
    never handed out for unmasked data.

    If the cdo call fails, a warning is written to logs/log.txt, a partial
    output is removed and `name` is returned unmasked.

    Parameters
    ----------
    name : string
           path of the input file
    realm : string
            realm category

    Returns
    -------
    string, path of the masked file or `name`
    """
    # wording of the warning for each realm, as it has always been logged
    not_masked = {'ocean': 'Land', 'land': 'Ocean'}
    if realm not in not_masked:
        return name

    # NOTE(review): the output name does not contain the realm, so 'ocean'
    # and 'land' runs on the same input still share one cached file.
    out = 'netcdf/masked_' + split(name)
    already_exists = already_calculated(out)
    if already_exists is not None:
        return already_exists

    try:
        cdo.ifthen(input='mask/' + realm + ' ' + name, output=out)
    except Exception:
        with open('logs/log.txt', 'a') as outfile:
            outfile.write('WARNING: ' + not_masked[realm] + ' data was not masked\n')
        silent_remove(out)
        return name
    return out

def time_mean(name, time_average=False):
    """ Runs cdo.timmean on `name`.

    Returns `name` untouched when `time_average` is set. Otherwise returns
    the path of the output file 'netcdf/climate_<file>', or the path
    reported by already_calculated when that output is available already.
    """
    if time_average:
        return name
    out = 'netcdf/climate_' + split(name)
    cached = already_calculated(out)
    if cached is not None:
        return cached
    cdo.timmean(input=name, output=out)
    return out

def trend(name):
    """ Runs cdo.trend on `name`, which writes two files: the intercept to
        'netcdf/intercept_<file>' and the slope to 'netcdf/slope_<file>'.

    Returns the path of the slope file, or the path reported by
    already_calculated when the slope file is available already.
    """
    filename = split(name)
    out = 'netcdf/slope_' + filename
    cached = already_calculated(out)
    if cached is not None:
        return cached
    # cdo.trend expects both output names in one string, intercept first
    outintercept = 'netcdf/intercept_' + filename
    cdo.trend(input=name, output=outintercept + ' ' + out)
    return out

def detrend(name):
    """ Runs cdo.detrend on `name`.

    Returns the path of the output file 'netcdf/detrend_<file>', or the
    path reported by already_calculated when that output is available
    already.
    """
    out = 'netcdf/detrend_' + split(name)
    cached = already_calculated(out)
    if cached is not None:
        return cached
    cdo.detrend(input=name, output=out)
    return out

def setc(name, realm='ocean'):
    """ Runs cdo.setctomiss with the constant 0 on `name`.

    Nothing is done for the realm 'atmos': `name` is returned as is.
    Otherwise returns the path of the output file 'netcdf/setc_<file>', or
    the path reported by already_calculated when that output is available
    already.
    """
    if realm == 'atmos':
        return name
    out = 'netcdf/setc_' + split(name)
    cached = already_calculated(out)
    if cached is not None:
        return cached
    cdo.setctomiss(0, input=name, output=out)
    return out

def get_remap_function(remap):
    """ Returns a cdo function from string of the same name.

    Parameters
    ----------
    remap : string
            one of 'remapbil', 'remapbic', 'remapdis', 'remapnn',
            'remapcon', 'remapcon2' or 'remaplaf'. The misspelling
            'remapplaf' that used to be the only accepted name for
            cdo.remaplaf keeps working.

    Raises
    ------
    KeyError
        if `remap` is not one of the names above
    """
    # the historic misspelling is kept as an alias for existing configurations
    aliases = {'remapplaf': 'remaplaf'}
    supported = ('remapbil', 'remapbic', 'remapdis', 'remapnn',
                 'remapcon', 'remapcon2', 'remaplaf')
    op_name = aliases.get(remap, remap)
    if op_name not in supported:
        raise KeyError(remap)
    # only the requested operator is looked up on the cdo object
    return getattr(cdo, op_name)

def remap(name, remapname, remapgrid):
    """ Remaps `name` to `remapgrid` with the cdo function `remapname`.

    Parameters
    ----------
    name : string
           path of the input file
    remapname : string
                name of the cdo remapping, see get_remap_function
    remapgrid : string
                grid to remap to; only its base name is used in the
                name of the output file

    Returns
    -------
    string
        path of the output file
        'netcdf/<remapname>-<grid>_<file>', the path reported by
        already_calculated when that output is available already, or
        `name` itself if the remapping failed

    Raises
    ------
    KeyError
        if `remapname` is not a supported remapping
    """
    out = 'netcdf/' + remapname + '-' + os.path.basename(remapgrid) + '_' + split(name)
    already_exists = already_calculated(out)
    if already_exists is not None:
        return already_exists

    remap_function = get_remap_function(remapname)
    try:
        remap_function(remapgrid, input=name, output=out)
    except Exception:
        # best effort: carry on with the data on its original grid and
        # make sure no partial output is picked up as a cached result
        silent_remove(out)
        return name
    return out

def field_mean(name):
    """ Runs cdo.fldmean on `name`.

    Returns the path of the output file 'netcdf/fldmean_<file>', or the
    path reported by already_calculated when that output is available
    already.
    """
    out = 'netcdf/fldmean_' + split(name)
    cached = already_calculated(out)
    if cached is not None:
        return cached
    cdo.fldmean(input=name, output=out)
    return out

def zonal_mean(name):
    """ Runs cdo.zonmean on `name`.

    Returns the path of the output file 'netcdf/zonmean_<file>', or the
    path reported by already_calculated when that output is available
    already.
    """
    out = 'netcdf/zonmean_' + split(name)
    cached = already_calculated(out)
    if cached is not None:
        return cached
    cdo.zonmean(input=name, output=out)
    return out
    
def depthstring(depthlist):
    """ Returns the depths as one comma separated string, each depth
        written with two decimal places (an empty list gives '').
    """
    return ','.join("%.2f" % number for number in depthlist)
    
       
def intlevel(name, depthlist):
    """ Interpolates `name` to the depths of `depthlist` with cdo.intlevelx.

    Parameters
    ----------
    name : string
           path of the input file
    depthlist : list of floats
                depths to interpolate to; None, [], [""] and [None]
                all mean that no interpolation is wanted

    Returns
    -------
    string
        path of the output file 'netcdf/level-<depths>_<file>', the
        path reported by already_calculated when that output is available
        already, or `name` itself if no interpolation is wanted or the
        interpolation failed
    """
    if depthlist is None or depthlist in ([], [""], [None]):
        return name
    depth = depthstring(depthlist)
    if not depth:
        return name

    depthname = depth.replace(' ', '')
    # keep the file name at a manageable length for long depth lists
    if len(depthname) > 100:
        depthname = depthname[:99]
    out = 'netcdf/level-' + depthname + '_' + split(name)

    already_exists = already_calculated(out)
    if already_exists is not None:
        return already_exists
    try:
        cdo.intlevelx(depth, input=name, output=out)
    except Exception:
        # best effort: carry on with the original levels and make sure no
        # partial output is picked up as a cached result later on
        try:
            os.remove(out)
        except OSError:
            pass
        return name
    return out
   
def season(name, seasonlist):
    """ Selects the seasons of `seasonlist` from `name` with cdo.selseas.

    Parameters
    ----------
    name : string
           path of the input file
    seasonlist : list of strings
                 seasons out of ['DJF', 'MAM', 'JJA', 'SON']; None, or a
                 list naming each of the four seasons once in any order,
                 selects everything and leaves the file untouched

    Returns
    -------
    string
        `name` if nothing has to be selected, otherwise the path of the
        output file 'netcdf/selseason-<seasons>_<file>' or the path
        reported by already_calculated when that output is available
        already
    """
    all_seasons = ['DJF', 'MAM', 'JJA', 'SON']
    if seasonlist is None:
        return name
    # the four seasons in any order still amount to the whole year
    if len(seasonlist) == len(all_seasons) and sorted(seasonlist) == sorted(all_seasons):
        return name

    seasonstring = ','.join(seasonlist)
    outputstring = ''.join(seasonlist)
    out = 'netcdf/selseason-' + outputstring + '_' + split(name)
    already_exists = already_calculated(out)
    if already_exists is not None:
        return already_exists
    cdo.selseas(seasonstring, input=name, output=out)
    return out

def cdos(name, string):
    """ Applies a custom cdo command line to `name`.

    The command run through the shell is 'cdo <string> <name> <out>' with
    out = 'netcdf/cdo_<file>'. It is skipped if `out` exists already.

    If the command exits with a non-zero status, a warning is written to
    logs/log.txt, a partial output is removed so it cannot be mistaken
    for a finished result later, and `name` is returned unprocessed (the
    same fallback mask and remap use).

    Parameters
    ----------
    name : string
           path of the input file
    string : string
             cdo operators and options, inserted into the command as is;
             an empty string or None leaves the file untouched

    Returns
    -------
    string, path of the output file or `name`
    """
    if not string:
        return name
    import shlex

    # NOTE(review): the output name does not contain `string`, so different
    # commands applied to the same input share one cached file.
    out = 'netcdf/cdo_' + split(name)
    if os.path.isfile(out):
        return out

    # only the file names are quoted; `string` is meant to hold several
    # shell words and is passed through untouched
    command = 'cdo ' + string + ' ' + shlex.quote(name) + ' ' + shlex.quote(out)
    status = os.system(command)
    if status != 0:
        with open('logs/log.txt', 'a') as outfile:
            outfile.write('WARNING: cdo command failed and was not applied: {}\n'.format(command))
        silent_remove(out)
        return name
    return out

def grid_weights(name):
    """ Runs cdo.gridweights on `name`.

    Returns the path of the output file 'netcdf/gridweights_<file>', or
    the path reported by already_calculated when that output is available
    already.
    """
    out = 'netcdf/gridweights_' + split(name)
    cached = already_calculated(out)
    if cached is not None:
        return cached
    cdo.gridweights(input=name, output=out)
    return out

def year_mean(name):
    """ Runs cdo.yearmean on `name`.

    Returns the path of the output file 'netcdf/yearmean_<file>', or the
    path reported by already_calculated when that output is available
    already.
    """
    out = 'netcdf/yearmean_' + split(name)
    cached = already_calculated(out)
    if cached is not None:
        return cached
    cdo.yearmean(input=name, output=out)
    return out


def already_calculated(name):
    """ Looks for an existing copy of the intermediate file `name`.

    The file is searched for at `name` itself first, then under its base
    name in the directory constants.processed_cmip5_root, if that constant
    is defined and not empty.

    Parameters
    ----------
    name : string
           path of the intermediate file

    Returns
    -------
    string or None
        the path at which the file was found, None if it was not found
    """
    if os.path.isfile(name):
        return name

    # a missing, None or empty root means there is nowhere else to look
    processed_root = getattr(constants, 'processed_cmip5_root', None)
    if not processed_root:
        return None

    precalc = os.path.join(processed_root, os.path.basename(name))
    if os.path.isfile(precalc):
        return precalc
    return None
    
# Running this module as a script does nothing; it is meant to be imported.
if __name__ == "__main__":
    pass