{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "922653bd",
   "metadata": {
    "tags": [
     "papermill-error-cell-tag"
    ]
   },
   "source": [
    "<span style=\"color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;\">An Exception was encountered at '<a href=\"#papermill-error-cell\">In [4]</a>'.</span>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b47986e",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": 0.015921,
     "end_time": "2025-10-10T23:20:25.295894",
     "exception": false,
     "start_time": "2025-10-10T23:20:25.279973",
     "status": "completed"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "# ROF global monthly, annual, seasonal flows analysis <a id='top'></a>\n",
    "\n",
    "Use the following datasets\n",
    "\n",
    "1. reach-D19 gauge link ascii\n",
    "2. D19 flow site geopackage\n",
    "3. D19 discharge netCDF\n",
    "4. history netCD including river discharge\n",
    "\n",
    "[1. Setupt](#setup)\n",
    "\n",
    "[2. Loading data](#load_data)\n",
    "\n",
    "- Read monthly history files from archive. \n",
    "- Reference data: monthly discharge estimates at 922 big river mouths from Dai et al. 2019 data (D19)\n",
    "\n",
    "[3. Large 24 river analysis](#24_large_rivers)\n",
    "\n",
    "- Plotting time seriese (annual, seasonal cycle).\n",
    "- if D19 referece data is available, scatter plots at Large 24 selected rivers against D19 referece data\n",
    "\n",
    "[4. Large 50 rivers analysis](#50_large_rivers)\n",
    "\n",
    "- Annual flow summary table at large 50 selected rivers.\n",
    "- if D19 referece data is available, Scatter plot of annual flow against D19 reference data.\n",
    "\n",
    "[5. 922 rivers analysis](#922_rivers)\n",
    "\n",
    "- run only if reference flow is available\n",
    "- error statistics (%bias, rmse, correlation) at all 922 river sites.\n",
    "- plot error statistic on the global map\n",
    "- plot boxplots including case(s) for each error metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5825f48f",
   "metadata": {
    "editable": true,
    "execution": {
     "iopub.execute_input": "2025-10-10T23:20:25.318183Z",
     "iopub.status.busy": "2025-10-10T23:20:25.317812Z",
     "iopub.status.idle": "2025-10-10T23:20:53.979527Z",
     "shell.execute_reply": "2025-10-10T23:20:53.978961Z"
    },
    "papermill": {
     "duration": 28.671041,
     "end_time": "2025-10-10T23:20:53.980917",
     "exception": false,
     "start_time": "2025-10-10T23:20:25.309876",
     "status": "completed"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "The Python version: 3.11.4\n",
      "xarray 2025.7.1\n",
      "pandas 2.3.1\n",
      "geopandas 1.1.1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR 1: PROJ: proj_create_from_database: Open of /glade/work/richling/conda-envs/cupid-analysis/share/proj failed\n"
     ]
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import os, sys\n",
    "import glob\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib as mpl\n",
    "from matplotlib.ticker import FormatStrFormatter\n",
    "from sklearn.linear_model import LinearRegression\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import xarray as xr\n",
    "import cartopy.crs as ccrs\n",
    "import cartopy.feature as cfeature\n",
    "\n",
    "import scripts.colors as colors\n",
    "import scripts.metrics as metric\n",
    "from scripts.utility import load_yaml\n",
    "from scripts.utility import no_time_variable\n",
    "\n",
    "rivers_50m = cfeature.NaturalEarthFeature(\"physical\", \"rivers_lake_centerlines\", \"50m\")\n",
    "land = cfeature.LAND\n",
    "\n",
    "print(\"\\nThe Python version: %s.%s.%s\" % sys.version_info[:3])\n",
    "print(xr.__name__, xr.__version__)\n",
    "print(pd.__name__, pd.__version__)\n",
    "print(gpd.__name__, gpd.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf1e68f2",
   "metadata": {
    "papermill": {
     "duration": 0.005153,
     "end_time": "2025-10-10T23:20:54.000144",
     "exception": false,
     "start_time": "2025-10-10T23:20:53.994991",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "-------------------------\n",
    "## 1. Setup <a id='setup'></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1dee386b",
   "metadata": {
    "editable": true,
    "execution": {
     "iopub.execute_input": "2025-10-10T23:20:54.011728Z",
     "iopub.status.busy": "2025-10-10T23:20:54.011067Z",
     "iopub.status.idle": "2025-10-10T23:20:54.017288Z",
     "shell.execute_reply": "2025-10-10T23:20:54.016885Z"
    },
    "papermill": {
     "duration": 0.012955,
     "end_time": "2025-10-10T23:20:54.018061",
     "exception": false,
     "start_time": "2025-10-10T23:20:54.005106",
     "status": "completed"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameter Defaults\n",
    "# parameters are set in CUPiD's config.yml file\n",
    "# when running interactively, manually set the parameters below\n",
    "\n",
    "# global parameters\n",
    "CESM_output_dir = \"\"  # e.g., \"/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CESM_output_for_testing\"\n",
    "case_name = None  # case name: e.g., \"b.e30_beta02.BLT1850.ne30_t232.104\"\n",
    "base_case_name = None  # base case name: e.g., \"b.e23_alpha17f.BLT1850.ne30_t232.092\"\n",
    "start_date = \"\"  # simulation starting date: e.g., \"0001-01-01\"\n",
    "end_date = \"\"  # simulation ending date: \"0100-01-01\"\n",
    "base_start_date = \"\"  # base simulation starting date: \"0001-01-01\"\n",
    "base_end_date = \"\"  # base simulation ending date: \"0100-01-01\"\n",
    "serial = True  # use dask LocalCluster\n",
    "lc_kwargs = {}\n",
    "\n",
    "# rof parameters\n",
    "hist_str = \"\"  # Used for hist file tag\n",
    "analysis_name = \"\"  # Used for Figure png names\n",
    "climo_nyears = 10  # number of years to compute the climatology\n",
    "grid_name = \"f09_f09_mosart\"  # ROF grid name used in case\n",
    "base_grid_name = (\n",
    "    grid_name  # spcify ROF grid name for base_case in config.yml if different than case\n",
    ")\n",
    "figureSave = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b0c864e1",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-10-10T23:20:54.039003Z",
     "iopub.status.busy": "2025-10-10T23:20:54.038770Z",
     "iopub.status.idle": "2025-10-10T23:20:54.043623Z",
     "shell.execute_reply": "2025-10-10T23:20:54.043117Z"
    },
    "papermill": {
     "duration": 0.01423,
     "end_time": "2025-10-10T23:20:54.044427",
     "exception": false,
     "start_time": "2025-10-10T23:20:54.030197",
     "status": "completed"
    },
    "tags": [
     "injected-parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameters\n",
    "case_name = \"b.e30_beta02.BLT1850.ne30_t232.104\"\n",
    "base_case_name = \"b.e23_alpha17f.BLT1850.ne30_t232.092\"\n",
    "CESM_output_dir = \"/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CESM_output_for_testing\"\n",
    "start_date = \"0001-01-01\"\n",
    "end_date = \"0101-01-01\"\n",
    "base_start_date = \"0001-01-01\"\n",
    "base_end_date = \"0101-01-01\"\n",
    "obs_data_dir = (\n",
    "    \"/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CUPiD_obs_data\"\n",
    ")\n",
    "ts_dir = None\n",
    "lc_kwargs = {\"threads_per_worker\": 1}\n",
    "serial = False\n",
    "hist_str = \"h0a\"\n",
    "analysis_name = \"\"\n",
    "grid_name = \"f09_f09_mosart\"\n",
    "climo_nyears = 10\n",
    "figureSave = False\n",
    "subset_kwargs = {}\n",
    "product = \"/glade/work/richling/CUPid_pr_test/CUPiD/examples/key_metrics/computed_notebooks//rof/global_discharge_gauge_compare_obs.ipynb\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd79e5c5",
   "metadata": {
    "tags": [
     "papermill-error-cell-tag"
    ]
   },
   "source": [
    "<span id=\"papermill-error-cell\" style=\"color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;\">Execution using papermill encountered an exception here and stopped:</span>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ba3b0180",
   "metadata": {
    "editable": true,
    "execution": {
     "iopub.execute_input": "2025-10-10T23:20:54.059300Z",
     "iopub.status.busy": "2025-10-10T23:20:54.059109Z",
     "iopub.status.idle": "2025-10-10T23:20:54.963863Z",
     "shell.execute_reply": "2025-10-10T23:20:54.963155Z"
    },
    "papermill": {
     "duration": 0.913316,
     "end_time": "2025-10-10T23:20:54.964748",
     "exception": true,
     "start_time": "2025-10-10T23:20:54.051432",
     "status": "failed"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "unsupported operand type(s) for +=: 'dict' and 'dict'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mTypeError\u001b[39m                                 Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 24\u001b[39m\n\u001b[32m     16\u001b[39m case_dic = {\n\u001b[32m     17\u001b[39m     case_name: {\n\u001b[32m     18\u001b[39m         \u001b[33m\"\u001b[39m\u001b[33mgrid\u001b[39m\u001b[33m\"\u001b[39m: grid_name,\n\u001b[32m   (...)\u001b[39m\u001b[32m     21\u001b[39m     }\n\u001b[32m     22\u001b[39m }\n\u001b[32m     23\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m base_case_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m24\u001b[39m     \u001b[43mcase_dic\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m     25\u001b[39m \u001b[43m        \u001b[49m\u001b[43mbase_case_name\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m     26\u001b[39m \u001b[43m            \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgrid\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrid_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     27\u001b[39m \u001b[43m            \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msim_period\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mslice\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mbase_start_date\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mbase_end_date\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     28\u001b[39m \u001b[43m            \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mclimo_nyrs\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mmin\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[32m     29\u001b[39m \u001b[43m                \u001b[49m\u001b[43mclimo_nyears\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mbase_end_date\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[32;43m4\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m-\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mbase_start_date\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[32;43m4\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m1\u001b[39;49m\n\u001b[32m     30\u001b[39m \u001b[43m            \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     31\u001b[39m \u001b[43m        \u001b[49m\u001b[43m}\u001b[49m\n\u001b[32m     32\u001b[39m \u001b[43m    \u001b[49m\u001b[43m}\u001b[49m\n",
      "\u001b[31mTypeError\u001b[39m: unsupported operand type(s) for +=: 'dict' and 'dict'"
     ]
    }
   ],
   "source": [
    "# ROF additional setup\n",
    "setup = load_yaml(\"./setup/setup.yaml\")\n",
    "\n",
    "ancillary_dir = setup[\n",
    "    \"ancillary_dir\"\n",
    "]  # ancillary directory including ROF domain, river network data etc.\n",
    "ref_flow_dir = setup[\"ref_flow_dir\"]  # including observed or reference flow data\n",
    "case_meta = setup[\"case_meta\"]  # Case metadata\n",
    "reach_gpkg = setup[\"reach_gpkg\"]  # river reach geopackage meta\n",
    "\n",
    "if analysis_name:\n",
    "    analysis_name = case_name\n",
    "if base_grid_name:\n",
    "    base_grid_name = grid_name\n",
    "\n",
    "case_dic = {\n",
    "    case_name: {\n",
    "        \"grid\": grid_name,\n",
    "        \"sim_period\": slice(f\"{start_date}\", f\"{end_date}\"),\n",
    "        \"climo_nyrs\": min(climo_nyears, int(end_date[:4]) - int(start_date[:4]) + 1),\n",
    "    }\n",
    "}\n",
    "if base_case_name is not None:\n",
    "    case_dic += {\n",
    "        base_case_name: {\n",
    "            \"grid\": grid_name,\n",
    "            \"sim_period\": slice(f\"{base_start_date}\", f\"{base_end_date}\"),\n",
    "            \"climo_nyrs\": min(\n",
    "                climo_nyears, int(base_end_date[:4]) - int(base_start_date[:4]) + 1\n",
    "            ),\n",
    "        }\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0ba882e5",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### Dasks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "74fd3d55",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# When running interactively, cupid_run should be set to False\n",
    "cupid_run = True\n",
    "\n",
    "client = None\n",
    "if cupid_run:\n",
    "    # Spin up cluster (if running in parallel)\n",
    "    if not serial:\n",
    "        from dask.distributed import Client, LocalCluster\n",
    "\n",
    "        cluster = LocalCluster(**lc_kwargs)\n",
    "        client = Client(cluster)\n",
    "else:\n",
    "    if not serial:\n",
    "        from dask.distributed import Client\n",
    "        from dask_jobqueue import PBSCluster\n",
    "\n",
    "        cluster = PBSCluster(\n",
    "            cores=1,\n",
    "            processes=1,\n",
    "            memory=\"50GB\",\n",
    "            queue=\"casper\",\n",
    "            walltime=\"01:00:00\",\n",
    "        )\n",
    "        cluster.scale(jobs=10)\n",
    "        client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "876cefb6",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "## 2. Loading data <a id='load_data'></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "647e5291",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 2.1. Monthly/annual flow netCDFs\n",
    "- month_data (xr dataset) - read from archive\n",
    "- year_data (xr dataset) - computed from monthly\n",
    "- seas_data (xr dataset) - computed from monthly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57e1a17c",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "reachID = {}\n",
    "month_data = {}\n",
    "year_data = {}\n",
    "seas_data = {}\n",
    "for case, meta in case_dic.items():\n",
    "    in_dire = os.path.join(CESM_output_dir, case, \"rof/hist\")\n",
    "    model = case_meta[meta[\"grid\"]][\"model\"]\n",
    "    domain = case_meta[meta[\"grid\"]][\"domain_nc\"]\n",
    "    var_list = case_meta[meta[\"grid\"]][\"vars_read\"]\n",
    "\n",
    "    def preprocess(ds):\n",
    "        return ds[var_list]\n",
    "\n",
    "    year_list = [\n",
    "        \"{:04d}\".format(yr)\n",
    "        for yr in np.arange(\n",
    "            int(meta[\"sim_period\"].start[0:4]), int(meta[\"sim_period\"].stop[0:4]) + 1\n",
    "        )\n",
    "    ]\n",
    "\n",
    "    nc_list = []\n",
    "    for nc_path in sorted(glob.glob(f\"{in_dire}/{case}.{model}.{hist_str}.????-*.nc\")):\n",
    "        for yr in year_list:\n",
    "            if yr in os.path.basename(nc_path):\n",
    "                nc_list.append(nc_path)\n",
    "\n",
    "    # load data\n",
    "    ds = xr.open_mfdataset(\n",
    "        nc_list,\n",
    "        data_vars=\"minimal\",\n",
    "        parallel=True,\n",
    "        preprocess=preprocess,\n",
    "    ).sel(time=meta[\"sim_period\"])\n",
    "\n",
    "    # monthly\n",
    "    month_data[case] = ds.isel(time=slice(-meta[\"climo_nyrs\"] * 12, None)).load()\n",
    "    # annual\n",
    "    year_data[case] = (\n",
    "        ds.isel(time=slice(-meta[\"climo_nyrs\"] * 12, None))\n",
    "        .resample(time=\"YS\")\n",
    "        .mean(dim=\"time\")\n",
    "        .load()\n",
    "    )\n",
    "    # seasonal (compute here instead of reading for conisistent analysis period)\n",
    "    seas_data[case] = (\n",
    "        ds.isel(time=slice(-meta[\"climo_nyrs\"] * 12, None))\n",
    "        .groupby(\"time.month\")\n",
    "        .mean(\"time\")\n",
    "        .load()\n",
    "    )\n",
    "    vars_no_time = no_time_variable(month_data[case])\n",
    "    if vars_no_time:\n",
    "        seas_data[case][vars_no_time] = seas_data[case][vars_no_time].isel(\n",
    "            month=0, drop=True\n",
    "        )\n",
    "    mon_time = month_data[case].time.values\n",
    "    if domain == \"None\":\n",
    "        reachID[case] = month_data[case][\"reachID\"].values\n",
    "    else:\n",
    "        reachID[case] = xr.open_dataset(f\"{ancillary_dir}/{domain}\")[\"reachID\"].values\n",
    "    print(f\"Finished loading {case}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "75007315",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 2.2 Large river ID and name ascii\n",
    "- big_river_50: dictionary {_site_id_:_river name_}\n",
    "- big_river_24: dictionary {_site_id_:_river name_}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe0d02f6",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"./setup/large_river_50.txt\", index_col=\"river_name\")\n",
    "big_river_50 = {key: values[\"site_id\"] for key, values in df.iterrows()}\n",
    "big_river_24 = {\n",
    "    key: values[\"site_id\"] for ix, (key, values) in enumerate(df.iterrows()) if ix < 24\n",
    "}  # The first 24 is used for plots"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8035bdf0",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 2.3. reach-D19 gauge link csv\n",
    "- gauge_reach_lnk (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69b47c1c",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "gauge_reach_lnk = {}\n",
    "for case, meta in case_dic.items():\n",
    "    gauge_reach_lnk[case] = pd.read_csv(\n",
    "        \"%s/D09/D09_925.%s.asc\" % (ref_flow_dir, case_meta[meta[\"grid\"]][\"network\"])\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b1910ec",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 2.4 D19 flow site shapefile\n",
    "- gauge_shp (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59e5e27d",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "gauge_shp = gpd.read_file(\n",
    "    os.path.join(ref_flow_dir, \"D09\", \"geospatial\", \"D09_925.gpkg\")\n",
    ")\n",
    "gauge_shp = gauge_shp[gauge_shp[\"id\"] != 9999999]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "931be674",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 2.5 D19 discharge data\n",
    "- ds_q_obs_mon (xr datasets)\n",
    "- ds_q_obs_yr (xr datasets)\n",
    "- dr_q_obs_seasonal (xr datasets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "477a1f06",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# read monthly data\n",
    "ds_q = xr.open_dataset(\n",
    "    \"%s/D09/coastal-stns-Vol-monthly.updated-May2019.mod.nc\" % (ref_flow_dir),\n",
    "    decode_times=False,\n",
    ")\n",
    "ds_q[\"time\"] = xr.cftime_range(\n",
    "    start=\"1900-01-01\", end=\"2018-12-01\", freq=\"MS\", calendar=\"standard\"\n",
    ")\n",
    "\n",
    "# monthly - if time_period is outside observation period, use the entire obs period\n",
    "obs_available = True\n",
    "if ds_q[\"time\"].sel(time=slice(str(mon_time[0]), str(mon_time[-1]))).values.size == 0:\n",
    "    obs_available = False\n",
    "    ds_q_obs_mon = ds_q[\"FLOW\"]\n",
    "else:\n",
    "    ds_q_obs_mon = ds_q[\"FLOW\"].sel(time=slice(str(mon_time[0]), str(mon_time[-1])))\n",
    "# compute annual flow from monthly\n",
    "ds_q_obs_yr = ds_q_obs_mon.resample(time=\"YE\").mean(dim=\"time\")\n",
    "# compute annual cycle at monthly scale\n",
    "dr_q_obs_seasonal = ds_q_obs_mon.groupby(\"time.month\").mean(\"time\")\n",
    "# annual mean, max, and min flow during time_period\n",
    "ds_q_obs_yr_min = ds_q_obs_yr.min(\"time\")\n",
    "ds_q_obs_yr_max = ds_q_obs_yr.max(\"time\")\n",
    "ds_q_obs_yr_mean = ds_q_obs_yr.mean(\"time\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dcc99442",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 2.6 Get indices in observation and simulation for gauge name (processing)\n",
    "- gauge_plot (dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "063e15ed",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "gauge_plot = {}\n",
    "for gname, site_id in big_river_50.items():\n",
    "    gauge_plot[gname] = {}\n",
    "    for case in case_dic.keys():\n",
    "        gauge_ix = [\n",
    "            i for i, gid in enumerate(ds_q.id.values) if gid == site_id\n",
    "        ]  # go through obs Dataset and get index matching to river (gauge) name\n",
    "        gauge_id = ds_q.id.values[gauge_ix][0]  ## guage ID\n",
    "        seg_id = (\n",
    "            gauge_reach_lnk[case]\n",
    "            .loc[gauge_reach_lnk[case][\"gauge_id\"] == gauge_id][\"route_id\"]\n",
    "            .values\n",
    "        )  # matching reach ID in river network\n",
    "        seg_ix = np.argwhere(\n",
    "            reachID[case] == seg_id\n",
    "        )  # matching reach index in river network\n",
    "        if len(seg_ix) == 0:\n",
    "            seg_ix = -999\n",
    "        else:\n",
    "            seg_ix = seg_ix[0]\n",
    "        gauge_plot[gname][case] = [gauge_ix, seg_ix, seg_id]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "242df780",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "------\n",
    "## 3. Analysis for 24 large rivers <a id='24_large_rivers'></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "51b9ddc4",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 3.1 Annual flow series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69ee976b",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "fig, axes = plt.subplots(8, 3, figsize=(7.25, 11.5))\n",
    "plt.subplots_adjust(\n",
    "    top=0.975, bottom=0.065, right=0.98, left=0.10, hspace=0.225, wspace=0.250\n",
    ")  # create some space below the plots by increasing the bottom-value\n",
    "\n",
    "for ix, river_name in enumerate(big_river_24.keys()):\n",
    "    row = ix // 3\n",
    "    col = ix % 3\n",
    "    for case, meta in case_dic.items():\n",
    "        net_idx = gauge_plot[river_name][case][1]\n",
    "        gaug_idx = gauge_plot[river_name][case][0]\n",
    "\n",
    "        q_name = case_meta[meta[\"grid\"]][\"flow_name\"]\n",
    "\n",
    "        if len(net_idx) == 1:\n",
    "            year_data[case][q_name][:, net_idx].plot(\n",
    "                ax=axes[row, col], linestyle=\"-\", lw=0.75, label=case\n",
    "            )\n",
    "        elif len(net_idx) == 2:  # means 2d grid\n",
    "            year_data[case][q_name][:, net_idx[0], net_idx[1]].plot(\n",
    "                ax=axes[row, col], linestyle=\"-\", lw=0.75, label=case\n",
    "            )\n",
    "    if obs_available:\n",
    "        ds_q_obs_yr.loc[:, gaug_idx].plot(\n",
    "            ax=axes[row, col],\n",
    "            linestyle=\"None\",\n",
    "            marker=\"o\",\n",
    "            markersize=3,\n",
    "            c=\"k\",\n",
    "            label=\"D19\",\n",
    "        )\n",
    "    else:\n",
    "        qob_mean = ds_q_obs_yr_mean.loc[gaug_idx]\n",
    "        qob_max = ds_q_obs_yr_max.loc[gaug_idx]\n",
    "        qob_min = ds_q_obs_yr_min.loc[gaug_idx]\n",
    "        axes[row, col].axhline(\n",
    "            y=qob_mean, color=\"k\", linestyle=\"--\", lw=0.5, label=\"D19 mean (1900-2018)\"\n",
    "        )\n",
    "        axes[row, col].axhspan(\n",
    "            qob_min[0],\n",
    "            qob_max[0],\n",
    "            color=\"gray\",\n",
    "            alpha=0.1,\n",
    "            label=\"D19 min-max (1900-2018)\",\n",
    "        )\n",
    "\n",
    "    axes[row, col].set_title(\"%d %s\" % (ix + 1, river_name), fontsize=8)\n",
    "\n",
    "    axes[row, col].set_xlabel(\"\")\n",
    "    if row < 7:\n",
    "        axes[row, col].set_xticklabels(\"\")\n",
    "    if col == 0:\n",
    "        axes[row, col].set_ylabel(\"Annual flow [m$^3$/s]\", fontsize=8)\n",
    "    else:\n",
    "        axes[row, col].set_ylabel(\"\")\n",
    "    axes[row, col].tick_params(\"both\", labelsize=\"xx-small\")\n",
    "\n",
    "# Legend- make space below the plot-raise bottom. there will be an label below the second last (bottom middle) ax, thanks to the bbox_to_anchor=(x, y) with a negative y-value.\n",
    "axes.flatten()[-2].legend(\n",
    "    loc=\"upper center\",\n",
    "    bbox_to_anchor=(0.125, -0.35, 0.75, 0.1),\n",
    "    ncol=5,\n",
    "    fontsize=\"x-small\",\n",
    ")\n",
    "\n",
    "if figureSave:\n",
    "    plt.savefig(f\"./NB1_Fig1_big_river_annual_{analysis_name}.png\", dpi=200)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e988cf7",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 3.2. Annual cycle at monthly step"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "741f710e",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "fig, axes = plt.subplots(8, 3, figsize=(7.25, 11.5))\n",
    "plt.subplots_adjust(\n",
    "    top=0.975, bottom=0.065, right=0.98, left=0.10, hspace=0.225, wspace=0.250\n",
    ")  # create some space below the plots by increasing the bottom-value\n",
    "\n",
    "for ix, river_name in enumerate(big_river_24.keys()):\n",
    "    row = ix // 3\n",
    "    col = ix % 3\n",
    "    for case, meta in case_dic.items():\n",
    "        net_idx = gauge_plot[river_name][case][1]\n",
    "        gaug_idx = gauge_plot[river_name][case][0]\n",
    "\n",
    "        q_name = case_meta[meta[\"grid\"]][\"flow_name\"]\n",
    "\n",
    "        if len(net_idx) == 1:  # means vector\n",
    "            seas_data[case][q_name][:, net_idx].plot(\n",
    "                ax=axes[row, col], linestyle=\"-\", lw=0.75, label=case\n",
    "            )\n",
    "        elif len(net_idx) == 2:  # means 2d grid\n",
    "            seas_data[case][q_name][:, net_idx[0], net_idx[1]].plot(\n",
    "                ax=axes[row, col], linestyle=\"-\", lw=1.0, label=case\n",
    "            )\n",
    "\n",
    "    if obs_available:\n",
    "        dr_q_obs_seasonal.loc[:, gaug_idx].plot(\n",
    "            ax=axes[row, col],\n",
    "            linestyle=\":\",\n",
    "            lw=0.5,\n",
    "            marker=\"o\",\n",
    "            markersize=2,\n",
    "            c=\"k\",\n",
    "            label=\"D19\",\n",
    "        )\n",
    "\n",
    "    axes[row, col].set_title(\"%d %s\" % (ix + 1, river_name), fontsize=8)\n",
    "    axes[row, col].set_xlabel(\"\")\n",
    "    if row < 7:\n",
    "        axes[row, col].set_xticklabels(\"\")\n",
    "    if col == 0:\n",
    "        axes[row, col].set_ylabel(\"Mon. flow [m$^3$/s]\", fontsize=8)\n",
    "    else:\n",
    "        axes[row, col].set_ylabel(\"\")\n",
    "    axes[row, col].tick_params(\"both\", labelsize=\"xx-small\")\n",
    "\n",
    "# Legend- make space below the plot-raise bottom. there will be an label below the second last (bottom middle) ax, thanks to the bbox_to_anchor=(x, y) with a negative y-value.\n",
    "axes.flatten()[-2].legend(\n",
    "    loc=\"upper center\",\n",
    "    bbox_to_anchor=(0.10, -0.30, 0.75, 0.1),\n",
    "    ncol=5,\n",
    "    fontsize=\"x-small\",\n",
    ")\n",
    "\n",
    "if figureSave:\n",
    "    plt.savefig(f\"./NB1_Fig2_big_river_season_{analysis_name}.png\", dpi=200)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4eb2d7d6",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 3.3. scatter plots of monthly flow - obs vs sim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17b09760",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "if obs_available:\n",
    "    # Monthly flow scatter plot\n",
    "    fig, axes = plt.subplots(8, 3, figsize=(7.50, 15.00))\n",
    "    plt.subplots_adjust(\n",
    "        top=0.995, bottom=0.075, right=0.995, left=0.1, wspace=0.25, hspace=0.25\n",
    "    )\n",
    "\n",
    "    for ix, river_name in enumerate(big_river_24.keys()):\n",
    "        row = ix // 3\n",
    "        col = ix % 3\n",
    "        axes[row, col].yaxis.set_major_formatter(FormatStrFormatter(\"%.0f\"))\n",
    "\n",
    "        for jx, (case, meta) in enumerate(case_dic.items()):\n",
    "\n",
    "            net_idx = gauge_plot[river_name][case][1]\n",
    "            gaug_idx = gauge_plot[river_name][case][0]\n",
    "\n",
    "            q_name = case_meta[meta[\"grid\"]][\"flow_name\"]\n",
    "\n",
    "            if len(net_idx) == 1:  # means vector\n",
    "                ds_sim = month_data[case][q_name][:, net_idx]\n",
    "            elif len(net_idx) == 2:  # means 2d grid\n",
    "                ds_sim = month_data[case][q_name][:, net_idx[0], net_idx[1]].squeeze()\n",
    "\n",
    "            ds_obs = ds_q_obs_mon.sel(time=meta[\"time_period\"]).loc[:, gaug_idx]\n",
    "\n",
    "            axes[row, col].plot(\n",
    "                ds_obs, ds_sim, \"o\", label=case, markersize=4.0, alpha=0.4\n",
    "            )\n",
    "            if jx == 0:\n",
    "                max_val = np.max(ds_obs)\n",
    "                min_val = np.min(ds_obs)\n",
    "            else:\n",
    "                max_val = np.max([np.max(ds_sim), max_val])\n",
    "                min_val = np.min([np.min(ds_sim), min_val])\n",
    "\n",
    "        axes[row, col].plot(\n",
    "            [min_val * 0.98, max_val * 1.02],\n",
    "            [min_val * 0.98, max_val * 1.02],\n",
    "            \":k\",\n",
    "            linewidth=0.5,\n",
    "        )\n",
    "\n",
    "        axes[row, col].annotate(\n",
    "            \"%d %s\" % (ix + 1, river_name),\n",
    "            xy=(0.05, 0.875),\n",
    "            xycoords=\"axes fraction\",\n",
    "            fontsize=8,\n",
    "            bbox=dict(facecolor=\"white\", edgecolor=\"None\", alpha=0.8),\n",
    "        )\n",
    "        if row == 7 and col == 1:\n",
    "            axes[row, col].set_xlabel(\n",
    "                \"Monthly reference discharge [m$^3$/s]\", fontsize=9\n",
    "            )\n",
    "        else:\n",
    "            axes[row, col].set_xlabel(\"\")\n",
    "\n",
    "        if col == 0 and row == 5:\n",
    "            axes[row, col].set_ylabel(\n",
    "                \"Monthly simulated discharge [m$^3$/s]\", fontsize=10\n",
    "            )\n",
    "        else:\n",
    "            axes[row, col].set_ylabel(\"\")\n",
    "        axes[row, col].tick_params(\"both\", labelsize=\"xx-small\")\n",
    "\n",
    "    axes.flatten()[-2].legend(\n",
    "        loc=\"upper center\",\n",
    "        bbox_to_anchor=(0.10, -0.40, 0.75, 0.1),\n",
    "        ncol=5,\n",
    "        fontsize=\"x-small\",\n",
    "    )\n",
    "\n",
    "    if figureSave:\n",
    "        plt.savefig(f\"./NB1_Fig3_big_river_month_scatter_{analysis_name}.png\", dpi=200)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fcfbc0e",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 3.4. scatter plots of annual flow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af17e284",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "if obs_available:\n",
    "    # annual flow scatter plot\n",
    "    fig, axes = plt.subplots(8, 3, figsize=(7.50, 15.00))\n",
    "    plt.subplots_adjust(\n",
    "        top=0.995, bottom=0.075, right=0.995, left=0.1, wspace=0.25, hspace=0.25\n",
    "    )\n",
    "\n",
    "    for ix, river_name in enumerate(big_river_24.keys()):\n",
    "        row = ix // 3\n",
    "        col = ix % 3\n",
    "        axes[row, col].yaxis.set_major_formatter(FormatStrFormatter(\"%.0f\"))\n",
    "\n",
    "        for jx, (case, meta) in enumerate(case_dic.items()):\n",
    "            net_idx = gauge_plot[river_name][case][1]\n",
    "            gaug_idx = gauge_plot[river_name][case][0]\n",
    "\n",
    "            q_name = case_meta[meta[\"grid\"]][\"flow_name\"]\n",
    "\n",
    "            if len(net_idx) == 1:  # means vector\n",
    "                ds_sim = year_data[case][q_name][:, net_idx]\n",
    "            elif len(net_idx) == 2:  # means 2d grid\n",
    "                ds_sim = year_data[case][q_name][:, net_idx[0], net_idx[1]]\n",
    "\n",
    "            ds_obs = ds_q_obs_yr.sel(time=meta[\"time_period\"]).loc[:, gaug_idx]\n",
    "\n",
    "            axes[row, col].plot(\n",
    "                ds_obs, ds_sim, \"o\", label=case, markersize=4.0, alpha=0.4\n",
    "            )\n",
    "            if jx == 0:\n",
    "                max_val = np.max(ds_obs)\n",
    "                min_val = np.min(ds_obs)\n",
    "            else:\n",
    "                max_val = np.max([np.max(ds_sim), max_val])\n",
    "                min_val = np.min([np.min(ds_sim), min_val])\n",
    "\n",
    "        axes[row, col].plot(\n",
    "            [min_val * 0.98, max_val * 1.02],\n",
    "            [min_val * 0.98, max_val * 1.02],\n",
    "            \":k\",\n",
    "            linewidth=0.5,\n",
    "        )\n",
    "\n",
    "        axes[row, col].annotate(\n",
    "            \"%d %s\" % (ix + 1, river_name),\n",
    "            xy=(0.05, 0.875),\n",
    "            xycoords=\"axes fraction\",\n",
    "            fontsize=8,\n",
    "            bbox=dict(facecolor=\"white\", edgecolor=\"None\", alpha=0.8),\n",
    "        )\n",
    "        if row == 7 and col == 1:\n",
    "            axes[row, col].set_xlabel(\n",
    "                \"Monthly reference discharge [m$^3$/s]\", fontsize=9\n",
    "            )\n",
    "        else:\n",
    "            axes[row, col].set_xlabel(\"\")\n",
    "\n",
    "        if col == 0 and row == 5:\n",
    "            axes[row, col].set_ylabel(\n",
    "                \"Monthly simulated discharge [m$^3$/s]\", fontsize=10\n",
    "            )\n",
    "        else:\n",
    "            axes[row, col].set_ylabel(\"\")\n",
    "        axes[row, col].tick_params(\"both\", labelsize=\"xx-small\")\n",
    "\n",
    "    axes.flatten()[-2].legend(\n",
    "        loc=\"upper center\",\n",
    "        bbox_to_anchor=(0.10, -0.40, 0.75, 0.1),\n",
    "        ncol=5,\n",
    "        fontsize=\"x-small\",\n",
    "    )\n",
    "\n",
    "    if figureSave:\n",
    "        plt.savefig(f\"./NB1_Fig4_big_river_annual_scatter_{analysis_name}.png\", dpi=200)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a2df0d4d",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "## 4. Anaysis for Large 50 rivers <a id='50_large_rivers'></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "59342d94",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 4.1 Summary tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e523ad7",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "with open(f\"large50rivers_{analysis_name}.txt\", \"w\") as f:\n",
    "    # some meta\n",
    "    f.write(\"# obs: Dai et al.(2019)\\n\")\n",
    "    f.write(\"# vol: km^3/yr\\n\")\n",
    "    f.write(\"# area: km^2\\n\")\n",
    "\n",
    "    # headers\n",
    "    f.write(\"No,          river_name,\")\n",
    "    f.write(\"{0: >15}_vol,\".format(\"obs\"))\n",
    "    for jx, (case, meta) in enumerate(case_dic.items()):\n",
    "        f.write(\"{0: >15}_vol,\".format(meta[\"grid\"]))\n",
    "    f.write(\"{0: >15}_area\".format(\"obs\"))\n",
    "    for jx, (case, meta) in enumerate(case_dic.items()):\n",
    "        f.write(\",\")\n",
    "        f.write(\"{0: >15}_area\".format(meta[\"grid\"]))\n",
    "    f.write(\"\\n\")\n",
    "\n",
    "    # data for each river\n",
    "    for ix, river_name in enumerate(big_river_50.keys()):\n",
    "\n",
    "        f.write(\"%02d,\" % (ix + 1))\n",
    "        f.write(\"{0: >20}\".format(river_name))\n",
    "\n",
    "        for jx, (case, meta) in enumerate(case_dic.items()):\n",
    "            f.write(\",\")\n",
    "            net_idx = gauge_plot[river_name][case][1]\n",
    "            gaug_idx = gauge_plot[river_name][case][0]\n",
    "\n",
    "            q_name = case_meta[meta[\"grid\"]][\"flow_name\"]\n",
    "\n",
    "            if len(net_idx) == 1:  # means vector\n",
    "                qsim = (\n",
    "                    np.nanmean(year_data[case][q_name][:, net_idx].values)\n",
    "                    * 60\n",
    "                    * 60\n",
    "                    * 24\n",
    "                    * 365\n",
    "                    / 10**9\n",
    "                )\n",
    "            elif len(net_idx) == 2:  # means 2d grid\n",
    "                qsim = (\n",
    "                    np.nanmean(\n",
    "                        year_data[case][q_name][:, net_idx[0], net_idx[1]].values\n",
    "                    )\n",
    "                    * 60\n",
    "                    * 60\n",
    "                    * 24\n",
    "                    * 365\n",
    "                    / 10**9\n",
    "                )\n",
    "\n",
    "            if jx == 0:\n",
    "                qobs = (\n",
    "                    ds_q_obs_yr_mean.loc[gaug_idx].values[0]\n",
    "                    * 60\n",
    "                    * 60\n",
    "                    * 24\n",
    "                    * 365\n",
    "                    / 10**9\n",
    "                )\n",
    "                f.write(\"{:19.1f},\".format(qobs))\n",
    "\n",
    "            f.write(\"{:19.1f}\".format(qsim))\n",
    "\n",
    "        for jx, (case, _) in enumerate(case_dic.items()):\n",
    "            f.write(\",\")\n",
    "            gaug_idx = gauge_plot[river_name][case][0]\n",
    "\n",
    "            # just get gauge_id from qs_q for now\n",
    "            gauge_id = ds_q[\"id\"].loc[gaug_idx].values[0]\n",
    "            network_area = gauge_reach_lnk[case][\n",
    "                gauge_reach_lnk[case][\"gauge_id\"] == gauge_id\n",
    "            ][\"route_area\"].values[0]\n",
    "\n",
    "            if jx == 0:\n",
    "                area = ds_q[\"area_stn\"].loc[gaug_idx].values[0]\n",
    "                f.write(\"{:20.0f},\".format(area))\n",
    "\n",
    "            f.write(\"{:20.0f}\".format(network_area))\n",
    "\n",
    "        f.write(\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "903dff3b",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 4.2. scatter plot of annual mean flow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50579821",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "if obs_available:\n",
    "\n",
    "    df_yr_vol = pd.read_csv(\n",
    "        f\"./large50rivers_{analysis_name}.txt\",\n",
    "        skiprows=3,\n",
    "        index_col=[\"No\"],\n",
    "        skipinitialspace=True,\n",
    "    )\n",
    "\n",
    "    fig, axes = plt.subplots(1, figsize=(5.50, 5.50))\n",
    "    regressor = LinearRegression()\n",
    "    bias_text = \"\"\n",
    "\n",
    "    for jx, (case, meta) in enumerate(case_dic.items()):\n",
    "        # compute linear regression\n",
    "        df_reg = df_yr_vol[[\"obs_vol\", f\"{meta['grid']}_vol\"]].dropna()\n",
    "        regressor.fit(df_reg[[\"obs_vol\"]], df_reg[f\"{meta['grid']}_vol\"])\n",
    "        y_pred = regressor.predict(df_reg[[\"obs_vol\"]])\n",
    "\n",
    "        # compute bias over 50 sites\n",
    "        diff = (df_yr_vol[f\"{meta['grid']}_vol\"] - df_yr_vol[\"obs_vol\"]).mean(\n",
    "            axis=0, skipna=True\n",
    "        )\n",
    "\n",
    "        df_yr_vol.plot(\n",
    "            ax=axes,\n",
    "            kind=\"scatter\",\n",
    "            x=\"obs_vol\",\n",
    "            y=f\"{meta['grid']}_vol\",\n",
    "            s=30,\n",
    "            alpha=0.6,\n",
    "            label=meta[\"grid\"],\n",
    "        )\n",
    "        # plt.plot(df_reg['obs_vol'], y_pred, color=color)\n",
    "        bias_text = bias_text + f\"\\n{meta['grid']}: {diff:.1f} \"\n",
    "\n",
    "    plt.text(\n",
    "        0.65, 0.30, \"bias [km3/yr]\", transform=axes.transAxes, verticalalignment=\"top\"\n",
    "    )\n",
    "    plt.text(\n",
    "        0.65, 0.30, f\"{bias_text} \", transform=axes.transAxes, verticalalignment=\"top\"\n",
    "    )\n",
    "\n",
    "    plt.axline((0, 0), slope=1, linestyle=\"--\", color=\"black\")\n",
    "    axes.set_xscale(\"log\")\n",
    "    axes.set_yscale(\"log\")\n",
    "    axes.set_xlabel(\"reference flow\")\n",
    "    axes.set_ylabel(\"Simulated flow\")\n",
    "    axes.set_title(\"River Flow at stations [km^3/yr]\")\n",
    "\n",
    "    if figureSave:\n",
    "        plt.savefig(\n",
    "            f\"./NB1_Fig5_50big_river_annual_flow_scatter_{analysis_name}.png\", dpi=200\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "25923599",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "------\n",
    "## 5. Anaysis for all 922 sites <a id='922_rivers'></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2b73745",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 5.1 Compute metris at all the sites (no plots nor tables)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d8316f5",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "if obs_available:\n",
    "    # Merge gauge_reach lnk (dataframe) into gauge shapefile\n",
    "    gauge_shp1 = {}\n",
    "    for case, df in gauge_reach_lnk.items():\n",
    "        # df = df.loc[(df['flag'] == 0)]\n",
    "        df1 = df.drop(columns=[\"riv_name\"])\n",
    "        gauge_shp1[case] = pd.merge(\n",
    "            gauge_shp, df1, how=\"inner\", left_on=\"id\", right_on=\"gauge_id\"\n",
    "        )\n",
    "\n",
    "    # compute %bias, correlation, and RMSE at each site based on monthly\n",
    "    mon_pbias = {}\n",
    "    mon_corr = {}\n",
    "    mon_rmse = {}\n",
    "\n",
    "    for case, meta in case_dic.items():\n",
    "        bias = np.full(len(gauge_shp1[case]), np.nan, dtype=float)\n",
    "        corr = np.full(len(gauge_shp1[case]), np.nan, dtype=float)\n",
    "        rmse = np.full(len(gauge_shp1[case]), np.nan, dtype=float)\n",
    "\n",
    "        q_name = case_meta[meta[\"grid\"]][\"flow_name\"]\n",
    "\n",
    "        for ix, row in gauge_shp1[case].iterrows():\n",
    "            q_obs = np.full(\n",
    "                meta[\"climo_nyrs\"] * 12, np.nan, dtype=float\n",
    "            )  # dummy q_sim that will be replaced by actual data if exist\n",
    "            q_sim = np.full(\n",
    "                meta[\"climo_nyrs\"] * 12, np.nan, dtype=float\n",
    "            )  # dummy q_sim that will be replaced by actual data if exist\n",
    "\n",
    "            route_id = row[\"route_id\"]\n",
    "            gauge_id = row[\"gauge_id\"]\n",
    "\n",
    "            gauge_ix = np.argwhere(ds_q.id.values == gauge_id)\n",
    "            if len(gauge_ix) == 1:\n",
    "                gauge_ix = gauge_ix[0]\n",
    "                q_obs = ds_q_obs_mon[:, gauge_ix].squeeze().values\n",
    "\n",
    "            route_ix = np.argwhere(reachID[case] == route_id)\n",
    "            if (\n",
    "                len(route_ix) == 1\n",
    "            ):  # meaning there is flow site in network and simulation exist at this site\n",
    "                route_ix = route_ix[0]\n",
    "                if len(route_ix) == 1:  # means vector\n",
    "                    q_sim = month_data[case][q_name][:, route_ix].squeeze().values\n",
    "                elif len(route_ix) == 2:  # means 2d grid\n",
    "                    q_sim = (\n",
    "                        month_data[case][q_name][:, route_ix[0], route_ix[1]]\n",
    "                        .squeeze()\n",
    "                        .values\n",
    "                    )\n",
    "\n",
    "                # compute %bias, correlation, RMSE\n",
    "                bias[ix] = metric.pbias(q_sim, q_obs)\n",
    "                corr[ix] = metric.corr(q_sim, q_obs)\n",
    "                rmse[ix] = metric.rmse(qsim, qobs)\n",
    "\n",
    "        mon_pbias[case] = bias\n",
    "        mon_corr[case] = corr\n",
    "        mon_rmse[case] = rmse\n",
    "\n",
    "        gauge_shp1[case][f\"bias_{meta['grid']}\"] = bias\n",
    "        gauge_shp1[case][f\"corr_{meta['grid']}\"] = corr\n",
    "        gauge_shp1[case][f\"rmse_{meta['grid']}\"] = rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c6e6d0bc",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 5.2. Spatial metric map "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65940684",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "if obs_available:\n",
    "    # some local plot setups\n",
    "    cbar_kwrgs = {\n",
    "        \"bias\": {\n",
    "            \"shrink\": 0.9,\n",
    "            \"pad\": 0.02,\n",
    "            \"orientation\": \"horizontal\",\n",
    "            \"extend\": \"both\",\n",
    "        },\n",
    "        \"corr\": {\n",
    "            \"shrink\": 0.9,\n",
    "            \"pad\": 0.02,\n",
    "            \"orientation\": \"horizontal\",\n",
    "            \"extend\": \"min\",\n",
    "        },\n",
    "        \"rmse\": {\n",
    "            \"shrink\": 0.9,\n",
    "            \"pad\": 0.02,\n",
    "            \"orientation\": \"horizontal\",\n",
    "            \"extend\": \"max\",\n",
    "        },\n",
    "    }\n",
    "\n",
    "    meta = {\n",
    "        \"bias\": {\n",
    "            \"name\": \"%bias\",\n",
    "            \"vmin\": -100,\n",
    "            \"vmax\": 100,\n",
    "            \"cm\": colors.cmap_RedGrayBlue,\n",
    "        },\n",
    "        \"corr\": {\"name\": \"correlation\", \"vmin\": 0.2, \"vmax\": 1, \"cm\": colors.cmap_corr},\n",
    "        \"rmse\": {\"name\": \"RMSE\", \"vmin\": 0, \"vmax\": 1000, \"cm\": mpl.cm.turbo},\n",
    "    }\n",
    "\n",
    "    for error_metric in [\"rmse\", \"bias\", \"corr\"]:\n",
    "        for case, meta in case_dic.items():\n",
    "            fig, ax = plt.subplots(\n",
    "                1, figsize=(7.5, 4.0), subplot_kw={\"projection\": ccrs.PlateCarree()}\n",
    "            )\n",
    "\n",
    "            ax.add_feature(\n",
    "                rivers_50m, facecolor=\"None\", edgecolor=\"b\", lw=0.5, alpha=0.3\n",
    "            )\n",
    "            ax.add_feature(land, facecolor=\"white\", edgecolor=\"grey\")\n",
    "\n",
    "            gauge_shp1[case].plot(\n",
    "                ax=ax,\n",
    "                column=f\"{error_metric}_{meta['grid']}\",\n",
    "                markersize=10,\n",
    "                cmap=meta[error_metric][\"cm\"],\n",
    "                vmin=meta[error_metric][\"vmin\"],\n",
    "                vmax=meta[error_metric][\"vmax\"],\n",
    "            )\n",
    "\n",
    "            ax.set_title(\"%s %s\" % (case, meta[error_metric][\"name\"]))\n",
    "\n",
    "            points = ax.collections[-1]\n",
    "            plt.colorbar(points, ax=ax, **cbar_kwrgs[error_metric])\n",
    "\n",
    "            plt.tight_layout()\n",
    "\n",
    "            if figureSave:\n",
    "                plt.savefig(f\"./NB1_Fig6_{error_metric}_{case}_map.png\", dpi=200)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4799a39b",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 5.4 Boxplots of Error metrics (RMSE, %bias, and correlation coefficient)\n",
    "Boxplot distribution is based on metrics sampled at 922 sites.\n",
    "\n",
    "The box extends from the Q1 to Q3 quartile values of the data, with a line at the median (Q2). \n",
    "The whiskers extend from the edges of box to show the range of the data. \n",
    "By default, they extend no more than 1.5 * IQR (IQR = Q3 - Q1) from the edges of the box, ending at the farthest data point within that interval. \n",
    "Outliers are plotted as separate dots."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7520a731",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "if obs_available:\n",
    "    boxprops = {\"linestyle\": \"-\", \"linewidth\": 1.5, \"color\": \"blue\"}\n",
    "    medianprops = {\"linestyle\": \"-\", \"linewidth\": 1.5, \"color\": \"red\"}\n",
    "\n",
    "    for error_metric in [\"rmse\", \"bias\", \"corr\"]:\n",
    "        column_stat = []\n",
    "        gauge_shp_all_case = gauge_shp.copy(deep=True)\n",
    "        for case, meta in case_dic.items():\n",
    "            gauge_shp_all_case = gauge_shp_all_case.merge(\n",
    "                gauge_shp1[case][[\"id\", f\"{error_metric}_{meta['grid']}\"]],\n",
    "                left_on=\"id\",\n",
    "                right_on=\"id\",\n",
    "            )\n",
    "            column_stat.append(f\"{error_metric}_{meta['grid']}\")\n",
    "        fig, ax = plt.subplots(1, figsize=(6.5, 4))\n",
    "        gauge_shp_all_case.boxplot(\n",
    "            ax=ax,\n",
    "            column=column_stat,\n",
    "            boxprops=boxprops,\n",
    "            medianprops=medianprops,\n",
    "            sym=\".\",\n",
    "        )\n",
    "\n",
    "        xticklabels = [label[len(error_metric) + 1 :] for label in column_stat]\n",
    "        ax.set_xticklabels(xticklabels)\n",
    "\n",
    "        if error_metric == \"rmse\":\n",
    "            ax.set_ylim([0, 1000])\n",
    "            ax.set_title(\"RMSE [m3/s]\")\n",
    "        elif error_metric == \"bias\":\n",
    "            ax.set_ylim([-150, 250])\n",
    "            ax.set_title(\"%bias [%]\")\n",
    "        elif error_metric == \"corr\":\n",
    "            ax.set_ylim([-0.2, 1])\n",
    "            ax.set_title(\"correlation\")\n",
    "\n",
    "        if figureSave:\n",
    "            plt.savefig(f\"./NB1_Fig7_{error_metric}_boxplot.png\", dpi=150)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00f7b9a0",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "if client:\n",
    "    client.shutdown()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cupid-analysis",
   "language": "python",
   "name": "cupid-analysis"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "papermill": {
   "duration": 41.679942,
   "end_time": "2025-10-10T23:20:56.099014",
   "exception": true,
   "input_path": "/glade/derecho/scratch/richling/tmp/tmpzf79t8m8.ipynb",
   "output_path": "/glade/work/richling/CUPid_pr_test/CUPiD/examples/key_metrics/computed_notebooks/rof/global_discharge_gauge_compare_obs.ipynb",
   "parameters": {
    "CESM_output_dir": "/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CESM_output_for_testing",
    "analysis_name": "",
    "base_case_name": "b.e23_alpha17f.BLT1850.ne30_t232.092",
    "base_end_date": "0101-01-01",
    "base_start_date": "0001-01-01",
    "case_name": "b.e30_beta02.BLT1850.ne30_t232.104",
    "climo_nyears": 10,
    "end_date": "0101-01-01",
    "figureSave": false,
    "grid_name": "f09_f09_mosart",
    "hist_str": "h0a",
    "lc_kwargs": {
     "threads_per_worker": 1
    },
    "obs_data_dir": "/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CUPiD_obs_data",
    "product": "/glade/work/richling/CUPid_pr_test/CUPiD/examples/key_metrics/computed_notebooks//rof/global_discharge_gauge_compare_obs.ipynb",
    "serial": false,
    "start_date": "0001-01-01",
    "subset_kwargs": {},
    "ts_dir": null
   },
   "start_time": "2025-10-10T23:20:14.419072"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}