Source code for scripts.analysis.amwg_table

import numpy as np
import xarray as xr
import sys
from pathlib import Path
import warnings  # use to warn user about missing files.

#Import "special" modules:
try:
    import scipy.stats as stats # for easy linear regression and testing
except ImportError:
    raise ImportError(
        "Scipy module does not exist in python path, but is needed for amwg_table. Please install module, e.g. 'pip install scipy'."
    )
#End except

try:
    import pandas as pd
except ImportError:
    raise ImportError(
        "Pandas module does not exist in python path, but is needed for amwg_table. Please install module, e.g. 'pip install pandas'."
    )
#End except

#Import ADF-specific modules:
import plotting_functions as pf

def amwg_table(adf):

    """
    Main function goes through series of steps:
    - load the variable data
    - Determine whether there are spatial dims; if yes, do global average (TODO: regional option)
    - Apply annual average (TODO: add seasonal here)
    - calculates the statistics
      + mean
      + sample size
      + standard deviation
      + standard error of the mean
      + 5/95% confidence interval of the mean
      + linear trend
      + p-value of linear trend
    - puts statistics into a CSV file
    - generates simple HTML that can display the data

    Description of needed inputs from ADF:

    case_names        -> Name(s) of CAM case provided by "cam_case_name"
    input_ts_locs     -> Location(s) of CAM time series files provided by "cam_ts_loc"
    output_loc        -> Location to write AMWG table files to, provided by "cam_diag_plot_loc"
    var_list          -> List of CAM output variables provided by "diag_var_list"
    var_defaults      -> Dict that has keys that are variable names and values that are
                         plotting preferences/defaults.

    and if doing a CAM baseline comparison:

    baseline_name     -> Name of CAM baseline case provided by "cam_case_name"
    input_ts_baseline -> Location of CAM baseline time series files provided by "cam_ts_loc"
    """

    #Import necessary modules:
    from lib.adf_base import AdfError

    #Additional information:
    #----------------------
    # GOAL: replace the "Tables" set in AMWG
    #       Set Description
    #       1 Tables of ANN, DJF, JJA, global and regional means and RMSE.
    #
    # STRATEGY:
    # I think the right solution is to generate one CSV (or other?) file that
    # contains all of the data.
    # So we need:
    # - a function that would produce the data, and
    # - then call a function that adds the data to a file
    # - another function(module?) that uses the file to produce a "web page"

    # IMPLEMENTATION:
    # - assume that we will have time series of global averages already ...
    #   that should be done ahead of time
    # - given a variable or file for a variable (equivalent), we will calculate
    #   the all-time, DJF, JJA, MAM, SON
    #   + mean
    #   + standard error of the mean
    #   -- 95% confidence interval for the mean, estimated by:
    #   ---- CI95 = mean + (SE * 1.96)
    #   ---- CI05 = mean - (SE * 1.96)
    #   + standard deviation
    # AMWG also includes the RMSE b/c it is comparing two things, but I will
    # put that off for now.

    # DETAIL: we use python's type hinting as much as possible

    # in future, provide option to do multiple domains
    # They use 4 pre-defined domains:
    # NOTE(review): 'domains' is currently unused — placeholder for the future
    # regional-averaging option mentioned above.
    domains = {"global": (0, 360, -90, 90),
               "tropics": (0, 360, -20, 20),
               "southern": (0, 360, -90, -20),
               "northern": (0, 360, 20, 90)}

    # and then in time it is DJF JJA ANN

    #Notify user that script has started:
    msg = "\n Calculating AMWG variable tables..."
    print(f"{msg}\n {'-' * (len(msg)-3)}")

    # within each domain and season
    # the result is just a table of
    # VARIABLE-NAME, RUN VALUE, OBS VALUE, RUN-OBS, RMSE
    #----------------------

    #Extract needed quantities from ADF object:
    #-----------------------------------------
    var_list = adf.diag_var_list
    var_defaults = adf.variable_defaults

    #Check if ocean or land fraction exist
    #in the variable list:
    for var in ["OCNFRAC", "LANDFRAC"]:
        if var in var_list:
            #If so, then move them to the front of variable list so
            #that they can be used to mask or vertically interpolate
            #other model variables if need be:
            var_idx = var_list.index(var)
            var_list.pop(var_idx)
            var_list.insert(0,var)
        #End if
    #End if

    #Special ADF variable which contains the output paths for
    #all generated plots and tables for each case:
    output_locs = adf.plot_location

    #CAM simulation variables (these quantities are always lists):
    case_names = adf.get_cam_info("cam_case_name", required=True)
    input_ts_locs = adf.get_cam_info("cam_ts_loc", required=True)

    #Grab case years
    syear_cases = adf.climo_yrs["syears"]
    eyear_cases = adf.climo_yrs["eyears"]

    #Check if a baseline simulation is also being used:
    if not adf.get_basic_info("compare_obs"):
        #Extract CAM baseline variables:
        baseline_name = adf.get_baseline_info("cam_case_name", required=True)
        input_ts_baseline = adf.get_baseline_info("cam_ts_loc", required=True)

        case_names.append(baseline_name)
        input_ts_locs.append(input_ts_baseline)

        #Grab baseline years (which may be empty strings if using Obs):
        syear_baseline = adf.climo_yrs["syear_baseline"]
        eyear_baseline = adf.climo_yrs["eyear_baseline"]

        syear_cases.append(syear_baseline)
        eyear_cases.append(eyear_baseline)

        #Save the baseline to the first case's plots directory:
        output_locs.append(output_locs[0])
    else:
        print("\t WARNING: AMWG table doesn't currently work with obs, so obs table won't be created.")
    #End if
    #-----------------------------------------

    #Loop over CAM cases:

    #Initialize list of case name csv files for case comparison check later
    csv_list = []
    for case_idx, case_name in enumerate(case_names):

        #Convert output location string to a Path object:
        output_location = Path(output_locs[case_idx])

        #Generate input file path:
        input_location = Path(input_ts_locs[case_idx])

        #Check that time series input directory actually exists:
        if not input_location.is_dir():
            errmsg = f"Time series directory '{input_location}' not found. Script is exiting."
            raise AdfError(errmsg)

        #Write to debug log if enabled:
        adf.debug_log(f"DEBUG: location of files is {str(input_location)}")

        #Notify user that script has started:
        print(f"\n Creating table for '{case_name}'...")

        #With a single year of data there is no meaningful variance/trend,
        #so only the mean will be written to the table:
        if eyear_cases[case_idx]-syear_cases[case_idx] == 0:
            calc_stats = False
            print("\t INFO: Looks like there is only one year of data, will skip statistics and just add means to table")
        else:
            calc_stats = True

        #Create output file name:
        output_csv_file = output_location / f"amwg_table_{case_name}.csv"

        #Given that this is a final, user-facing analysis, go ahead and re-do it every time:
        if Path(output_csv_file).is_file():
            Path.unlink(output_csv_file)
        #End if

        #Create/reset new variable that potentially stores the re-gridded
        #ocean fraction xarray data-array:
        ocn_frc_da = None

        #Loop over CAM output variables:
        for var in var_list:

            #Notify users of variable being added to table:
            print(f"\t - Variable '{var}' being added to table")

            #Create list of time series files present for variable:
            ts_filenames = f'{case_name}.*.{var}.*nc'
            ts_files = sorted(input_location.glob(ts_filenames))

            # If no files exist, try to move to next variable. --> Means we can not proceed with this variable, and it'll be problematic later.
            if not ts_files:
                errmsg = f"\t WARNING: Time series files for variable '{var}' not found. Script will continue to next variable."
                warnings.warn(errmsg)
                continue
            #End if

            #TEMPORARY: For now, make sure only one file exists:
            if len(ts_files) != 1:
                errmsg = "\t WARNING: Currently the AMWG table script can only handle one time series file per variable."
                errmsg += f" Multiple files were found for the variable '{var}', so it will be skipped."
                print(errmsg)
                continue
            #End if

            #Load model variable data from file:
            ds = pf.load_dataset(ts_files)
            data = ds[var]

            #Extract units string, if available:
            if hasattr(data, 'units'):
                unit_str = data.units
            else:
                unit_str = '--'

            #Check if variable has a vertical coordinate:
            if 'lev' in data.coords or 'ilev' in data.coords:
                print(f"\t WARNING: Variable '{var}' has a vertical dimension, "+\
                    "which is currently not supported for the AMWG Table. Skipping...")
                #Skip this variable and move to the next variable in var_list:
                continue
            #End if

            #Extract defaults for variable:
            var_default_dict = var_defaults.get(var, {})

            #Check if variable should be masked:
            if 'mask' in var_default_dict:
                if var_default_dict['mask'].lower() == 'ocean':
                    #Check if the ocean fraction has already been regridded
                    #and saved:
                    if ocn_frc_da is not None:
                        ofrac = ocn_frc_da
                        # set the bounds of regridded ocnfrac to 0 to 1
                        ofrac = xr.where(ofrac>1,1,ofrac)
                        ofrac = xr.where(ofrac<0,0,ofrac)

                        # apply ocean fraction mask to variable
                        data = pf.mask_land_or_ocean(data, ofrac, use_nan=True)
                        #data = var_tmp
                    else:
                        print(f"\t WARNING: OCNFRAC not found, unable to apply mask to '{var}'")
                    #End if
                else:
                    #Currently only an ocean mask is supported, so print warning here:
                    wmsg = "\t WARNING: Currently the only variable mask option is 'ocean',"
                    wmsg += f"not '{var_default_dict['mask'].lower()}'"
                    print(wmsg)
                #End if
            #End if

            #If the variable is ocean fraction, then save the dataset for use later:
            #(this relies on OCNFRAC having been moved to the front of var_list above)
            if var == 'OCNFRAC':
                ocn_frc_da = data
            #End if

            # we should check if we need to do area averaging:
            if len(data.dims) > 1:
                # flags that we have spatial dimensions
                # Note: that could be 'lev' which should trigger different behavior
                # Note: we should be able to handle (lat, lon) or (ncol,) cases, at least
                data = pf.spatial_average(data)  # changes data "in place"

            # In order to get correct statistics, average to annual or seasonal data
            data = pf.annual_mean(data, whole_years=True, time_name='time')

            # Set values for columns
            # NOTE(review): 'cols' is re-used after this loop for the RESTOM
            # re-ordering; if var_list is ever empty that later use would raise
            # NameError — confirm var_list is guaranteed non-empty upstream.
            cols = ['variable', 'unit', 'mean', 'sample size', 'standard dev.',
                    'standard error', '95% CI', 'trend', 'trend p-value']

            if not calc_stats:
                #Single-year case: only the mean is meaningful, pad the rest:
                row_values = [var, unit_str] + [data.data.mean()] + ["-","-","-","-","-","-"]
            else:
                stats_list = _get_row_vals(data)
                row_values = [var, unit_str] + stats_list

            # Format entries:
            dfentries = {c:[row_values[i]] for i,c in enumerate(cols)}

            # Add entries to Pandas structure and create a dataframe:
            df = pd.DataFrame(dfentries)

            # Check if the output CSV file exists,
            # if so, then append to it:
            if output_csv_file.is_file():
                df.to_csv(output_csv_file, mode='a', header=False, index=False)
            else:
                df.to_csv(output_csv_file, header=cols, index=False)

        #End of var_list loop
        #--------------------

        # Move RESTOM to top of table (if applicable)
        #--------------------------------------------
        try:
            table_df = pd.read_csv(output_csv_file)
            if 'RESTOM' in table_df['variable'].values:
                #Prepend the RESTOM row, then drop its duplicate lower down:
                table_df = pd.concat([table_df[table_df['variable'] == 'RESTOM'], table_df]).reset_index(drop = True)
                table_df = table_df.drop_duplicates()
                table_df.to_csv(output_csv_file, header=cols, index=False)

            # last step is to add table dataframe to website (if enabled):
            adf.add_website_data(table_df, case_name, case_name, plot_type="Tables")
        except FileNotFoundError:
            print(f"\n\tAMWG table for '{case_name}' not created.\n")
        #End try/except

        #Keep track of case csv files for comparison table check later
        csv_list.extend(sorted(output_location.glob(f"amwg_table_{case_name}.csv")))

    #End of model case loop
    #----------------------

    #Start case comparison tables
    #----------------------------
    #Check if observations are being compared to, if so skip table comparison...
    if not adf.get_basic_info("compare_obs"):

        #Check if all tables were created to compare against, if not, skip table comparison...
        if len(csv_list) != len(case_names):
            print("\tNot enough cases to compare, skipping comparison table...")
        else:
            #Create comparison table for both cases
            print("\n Making comparison table...")
            _df_comp_table(adf, output_location, case_names)
            print(" ... Comparison table has been generated successfully")
        #End if
    else:
        print(" No comparison table will be generated due to running against obs.")
    #End if

    #Notify user that script has ended:
    print(" ...AMWG variable table(s) have been generated successfully.")
################## # Helper functions ##################
[docs] def _get_row_vals(data): # Now that data is (time,), we can do our simple stats: data_mean = data.data.mean() #Conditional Formatting depending on type of float if np.abs(data_mean) < 1: formatter = ".3g" else: formatter = ".3f" data_sample = len(data) data_std = data.std() data_sem = data_std / data_sample data_ci = data_sem * 1.96 # https://en.wikipedia.org/wiki/Standard_error data_trend = stats.linregress(data.year, data.values) stdev = f'{data_std.data.item() : {formatter}}' sem = f'{data_sem.data.item() : {formatter}}' ci = f'{data_ci.data.item() : {formatter}}' slope_int = f'{data_trend.intercept : {formatter}} + {data_trend.slope : {formatter}} t' pval = f'{data_trend.pvalue : {formatter}}' return [f'{data_mean:{formatter}}', data_sample, stdev, sem, ci, slope_int, pval]
#####
[docs] def _df_comp_table(adf, output_location, case_names): import pandas as pd output_csv_file_comp = output_location / "amwg_table_comp.csv" # * * * * * * * * * * * * * * * * * * * * * * * * * * * * #This will be for single-case for now (case_names[0]), #will need to change to loop as multi-case is introduced case = output_location/f"amwg_table_{case_names[0]}.csv" baseline = output_location/f"amwg_table_{case_names[-1]}.csv" #Read in test case and baseline dataframes: df_case = pd.read_csv(case) df_base = pd.read_csv(baseline) #Create a merged dataframe that contains only the variables #contained within both the test case and the baseline: df_merge = pd.merge(df_case, df_base, how='inner', on=['variable']) #Create the "comparison" dataframe: df_comp = pd.DataFrame(dtype=object) df_comp[['variable','unit','case']] = df_merge[['variable','unit_x','mean_x']] df_comp['baseline'] = df_merge[['mean_y']] diffs = df_comp['case'].values-df_comp['baseline'].values df_comp['diff'] = [f'{i:.3g}' if np.abs(i) < 1 else f'{i:.3f}' for i in diffs] #Write the comparison dataframe to a new CSV file: cols_comp = ['variable', 'unit', 'test', 'control', 'diff'] df_comp.to_csv(output_csv_file_comp, header=cols_comp, index=False) #Add comparison table dataframe to website (if enabled): adf.add_website_data(df_comp, "Case Comparison", case_names[0], plot_type="Tables")
############## #END OF SCRIPT