{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "39c68d0d",
   "metadata": {
    "tags": [
     "papermill-error-cell-tag"
    ]
   },
   "source": [
    "<span style=\"color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;\">An Exception was encountered at '<a href=\"#papermill-error-cell\">In [1]</a>'.</span>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94848dc3",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": 0.00736,
     "end_time": "2025-11-25T01:42:58.115559",
     "exception": false,
     "start_time": "2025-11-25T01:42:58.108199",
     "status": "completed"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "# ROF monthly, annual, seasonal discharge at ocean outlets <a id='top'></a>\n",
    "\n",
    "Use the following datasets\n",
    "\n",
    "1. reach-D19 gauge link ascii\n",
    "2. D19 flow site geopackage\n",
    "3. D19 discharge netCDF\n",
    "4. monthly and yearly flow netCD (history file)\n",
    "\n",
    "[1. Setup](#configuration)\n",
    "\n",
    "\n",
    "[2. Loading discharge data](#load_q)\n",
    "\n",
    "- Read monthly history files from archive. \n",
    "- Reference data: monthly discharge estimates at 922 big river mouths from Dai et al. 2019 data (D19)\n",
    "\n",
    "[3. Read river, catchment, gauge information](#read_ancillary)\n",
    "\n",
    "- catchment polygon (geopackage)\n",
    "- gauge point (geopackage)\n",
    "- gauge-catchment link (csv)\n",
    "- outlet reach information (netCDF) including discharging ocean names\n",
    "\n",
    "[4. Ocean discharge line plots](#annual_cycle_q)\n",
    "\n",
    "- total seasonal flow for oceans. \n",
    "\n",
    "[5. Plot zonal mean annual dicharge to the oceans](#zonal_mean_annual_q)\n",
    "\n",
    "- total seasonal flow for oceans. "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3b1a1f77",
   "metadata": {
    "tags": [
     "papermill-error-cell-tag"
    ]
   },
   "source": [
    "<span id=\"papermill-error-cell\" style=\"color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;\">Execution using papermill encountered an exception here and stopped:</span>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9f59d5e8",
   "metadata": {
    "editable": true,
    "execution": {
     "iopub.execute_input": "2025-11-25T01:42:58.128759Z",
     "iopub.status.busy": "2025-11-25T01:42:58.128529Z",
     "iopub.status.idle": "2025-11-25T01:42:59.612831Z",
     "shell.execute_reply": "2025-11-25T01:42:59.611951Z"
    },
    "papermill": {
     "duration": 1.49215,
     "end_time": "2025-11-25T01:42:59.613978",
     "exception": true,
     "start_time": "2025-11-25T01:42:58.121828",
     "status": "failed"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'cupid_utils'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mModuleNotFoundError\u001b[39m                       Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 12\u001b[39m\n\u001b[32m      9\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mxarray\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mxr\u001b[39;00m\n\u001b[32m     10\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mcartopy\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mfeature\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mcfeature\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m12\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mcupid_utils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mcu\u001b[39;00m  \u001b[38;5;66;03m# go buffs!\u001b[39;00m\n\u001b[32m     13\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mcupid_utils\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mrof\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m load_yaml\n\u001b[32m     14\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mcupid_utils\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mrof\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m no_time_variable\n",
      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'cupid_utils'"
     ]
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import os, sys\n",
    "import glob\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import xarray as xr\n",
    "import cartopy.feature as cfeature\n",
    "\n",
    "import cupid_utils as cu  # go buffs!\n",
    "from cupid_utils.rof import load_yaml\n",
    "from cupid_utils.rof import no_time_variable\n",
    "from cupid_utils.rof import read_shps\n",
    "from cupid_utils.rof import get_index_array\n",
    "\n",
    "rivers_50m = cfeature.NaturalEarthFeature(\"physical\", \"rivers_lake_centerlines\", \"50m\")\n",
    "land = cfeature.LAND\n",
    "\n",
    "print(\"\\nThe Python version: %s.%s.%s\" % sys.version_info[:3])\n",
    "print(xr.__name__, xr.__version__)\n",
    "print(pd.__name__, pd.__version__)\n",
    "print(gpd.__name__, gpd.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cba45cb9",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "-------------------------\n",
    "## 1. Setup <a id='configuration'></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6cb98854",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "parameters",
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameter Defaults\n",
    "# parameters are set in CUPiD's config.yml file\n",
    "# when running interactively, manually set the parameters below\n",
    "\n",
    "CESM_output_dir = \"\"\n",
    "case_name = None  # case name: e.g., \"b.e30_beta02.BLT1850.ne30_t232.104\"\n",
    "base_case_name = None  # base case name: e.g., \"b.e23_alpha17f.BLT1850.ne30_t232.092\"\n",
    "base_case_output_dir = None\n",
    "start_date = \"\"\n",
    "end_date = \"\"\n",
    "base_start_date = \"\"  # base simulation starting date: \"0001-01-01\"\n",
    "base_end_date = \"\"  # base simulation ending date: \"0100-01-01\"\n",
    "serial = True  # use dask LocalCluster\n",
    "lc_kwargs = {}\n",
    "\n",
    "hist_str = \"\"  # Used for hist file tag\n",
    "analysis_name = \"\"  # Used for Figure png names\n",
    "climo_nyears = 10  # number of years to compute the climatology\n",
    "grid_name = \"f09_f09_mosart\"  # ROF grid name used in case\n",
    "base_grid_name = (\n",
    "    grid_name  # spcify ROF grid name for base_case in config.yml if different than case\n",
    ")\n",
    "figureSave = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "194c45bd",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": [
     "injected-parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameters\n",
    "case_name = \"b.e30_alpha07c_cesm.B1850C_LTso.ne30_t232_wgx3.232\"\n",
    "base_case_name = \"b.e30_alpha07c_cesm.B1850C_LTso.ne30_t232_wgx3.228\"\n",
    "case_nickname = \"BLT1850_232\"\n",
    "base_case_nickname = \"BLT1850_228\"\n",
    "CESM_output_dir = \"/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CESM_output_for_testing\"\n",
    "start_date = \"0001-01-01\"\n",
    "end_date = \"0021-01-01\"\n",
    "climo_start_year = 11\n",
    "climo_end_year = 21\n",
    "base_start_date = \"0001-01-01\"\n",
    "base_end_date = \"0045-01-01\"\n",
    "base_climo_start_year = 36\n",
    "base_climo_end_year = 45\n",
    "obs_data_dir = (\n",
    "    \"/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CUPiD_obs_data\"\n",
    ")\n",
    "ts_dir = None\n",
    "lc_kwargs = {\"threads_per_worker\": 1}\n",
    "serial = False\n",
    "hist_str = \"h0a\"\n",
    "analysis_name = \"\"\n",
    "grid_name = \"f09_f09_mosart\"\n",
    "climo_nyears = 10\n",
    "figureSave = False\n",
    "subset_kwargs = {}\n",
    "product = \"/glade/work/richling/cupid-test-2/CUPiD/examples/key_metrics/computed_notebooks//rof/global_discharge_ocean_compare_obs.ipynb\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63286ecd",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": [
     "hide-cell"
    ]
   },
   "outputs": [],
   "source": [
    "if base_case_output_dir is None:\n",
    "    base_case_output_dir = CESM_output_dir\n",
    "\n",
    "out_dir = os.path.join(os.path.dirname(product), \"rof_nb_output\")\n",
    "os.makedirs(out_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5aed763",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# ROF additional setup\n",
    "setup = load_yaml(os.path.join(cu.__path__[0], \"..\", \"setup\", \"setup.yaml\"))\n",
    "\n",
    "domain_dir = setup[\n",
    "    \"ancillary_dir\"\n",
    "]  # ancillary directory including such as ROF domain, river network data\n",
    "geospatial_dir = setup[\"geospatial_dir\"]  # including shapefiles or geopackages\n",
    "ref_flow_dir = setup[\"ref_flow_dir\"]  # including observed or reference flow data\n",
    "case_meta = setup[\"case_meta\"]  # RO grid meta\n",
    "catch_gpkg = setup[\"catch_gpkg\"]  # catchment geopackage meta\n",
    "reach_gpkg = setup[\"reach_gpkg\"]  # reach geopackage meta\n",
    "network_nc = setup[\"river_network\"]  # river network meta\n",
    "\n",
    "if not analysis_name:\n",
    "    analysis_name = case_name\n",
    "if not base_grid_name:\n",
    "    base_grid_name = grid_name\n",
    "\n",
    "case_dic = {\n",
    "    case_name: {\n",
    "        \"grid\": grid_name,\n",
    "        \"sim_period\": slice(f\"{start_date}\", f\"{end_date}\"),\n",
    "        \"climo_nyrs\": min(climo_nyears, int(end_date[:4]) - int(start_date[:4]) + 1),\n",
    "        \"output_dir\": CESM_output_dir,\n",
    "    }\n",
    "}\n",
    "if base_case_name is not None:\n",
    "    case_dic[base_case_name] = {\n",
    "        \"grid\": base_grid_name,\n",
    "        \"sim_period\": slice(f\"{base_start_date}\", f\"{base_end_date}\"),\n",
    "        \"climo_nyrs\": min(\n",
    "            climo_nyears, int(base_end_date[:4]) - int(base_start_date[:4]) + 1\n",
    "        ),\n",
    "        \"output_dir\": base_case_output_dir,\n",
    "    }\n",
    "\n",
    "oceans_list = [\n",
    "    \"arctic\",\n",
    "    \"atlantic\",\n",
    "    \"indian\",\n",
    "    \"mediterranean\",\n",
    "    \"pacific\",\n",
    "    \"south_china\",\n",
    "    \"global\",\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0e28ced6",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "-----\n",
    "### dasks (optional)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b905ff27",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# When running interactively, cupid_run should be set to False\n",
    "cupid_run = True\n",
    "\n",
    "client = None\n",
    "if cupid_run:\n",
    "    # Spin up cluster (if running in parallel)\n",
    "    if not serial:\n",
    "        from dask.distributed import Client, LocalCluster\n",
    "\n",
    "        cluster = LocalCluster(**lc_kwargs)\n",
    "        client = Client(cluster)\n",
    "else:\n",
    "    if not serial:\n",
    "        from dask.distributed import Client\n",
    "        from dask_jobqueue import PBSCluster\n",
    "\n",
    "        cluster = PBSCluster(\n",
    "            cores=1,\n",
    "            processes=1,\n",
    "            memory=\"50GB\",\n",
    "            queue=\"casper\",\n",
    "            walltime=\"01:00:00\",\n",
    "        )\n",
    "        cluster.scale(jobs=10)\n",
    "        client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50e67c0a",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "## 2. Loading discharge data <a id='load_q'></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2c06b3b9",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 2.1. Monthly/annual flow netCDFs\n",
    "- month_data (xr dataset)\n",
    "- year_data (xr dataset)\n",
    "- seas_data (xr dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc8788c1",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "reachID = {}\n",
    "month_data = {}\n",
    "year_data = {}\n",
    "seas_data = {}\n",
    "for case, meta in case_dic.items():\n",
    "    in_dire = os.path.join(meta[\"output_dir\"], case, \"rof/hist\")\n",
    "    model = case_meta[meta[\"grid\"]][\"model\"]\n",
    "    domain = case_meta[meta[\"grid\"]][\"domain_nc\"]\n",
    "    var_list = case_meta[meta[\"grid\"]][\"vars_read\"]\n",
    "\n",
    "    def preprocess(ds):\n",
    "        return ds[var_list]\n",
    "\n",
    "    year_list = [\n",
    "        \"{:04d}\".format(yr)\n",
    "        for yr in np.arange(\n",
    "            int(meta[\"sim_period\"].start[0:4]), int(meta[\"sim_period\"].stop[0:4]) + 1\n",
    "        )\n",
    "    ]\n",
    "\n",
    "    nc_list = []\n",
    "    for nc_path in sorted(glob.glob(f\"{in_dire}/{case}.{model}.{hist_str}.????-*.nc\")):\n",
    "        for yr in year_list:\n",
    "            if yr in os.path.basename(nc_path):\n",
    "                nc_list.append(nc_path)\n",
    "\n",
    "    # load data\n",
    "    ds = xr.open_mfdataset(\n",
    "        nc_list,\n",
    "        data_vars=\"minimal\",\n",
    "        parallel=True,\n",
    "        preprocess=preprocess,\n",
    "    ).sel(time=meta[\"sim_period\"])\n",
    "\n",
    "    # monthly\n",
    "    month_data[case] = ds.isel(time=slice(-meta[\"climo_nyrs\"] * 12, None))\n",
    "    # annual\n",
    "    year_data[case] = (\n",
    "        ds.isel(time=slice(-meta[\"climo_nyrs\"] * 12, None))\n",
    "        .resample(time=\"YS\")\n",
    "        .mean(dim=\"time\")\n",
    "        .load()\n",
    "    )\n",
    "    # seasonal (compute here instead of reading for conisistent analysis period)\n",
    "    seas_data[case] = (\n",
    "        ds.isel(time=slice(-meta[\"climo_nyrs\"] * 12, None))\n",
    "        .groupby(\"time.month\")\n",
    "        .mean(\"time\")\n",
    "        .load()\n",
    "    )\n",
    "    vars_no_time = no_time_variable(month_data[case])\n",
    "    if vars_no_time:\n",
    "        seas_data[case][vars_no_time] = seas_data[case][vars_no_time].isel(\n",
    "            month=0, drop=True\n",
    "        )\n",
    "    mon_time = month_data[case].time.values\n",
    "    if domain == \"None\":\n",
    "        reachID[case] = month_data[case][\"reachID\"].values\n",
    "    else:\n",
    "        reachID[case] = (\n",
    "            xr.open_dataset(f\"{domain_dir}/{domain}\")[\"reachID\"]\n",
    "            .stack(seg=(\"lat\", \"lon\"))\n",
    "            .values\n",
    "        )\n",
    "    print(f\"Finished loading {case}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8664ed39",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 2.2 D19 discharge data\n",
    "- ds_q_obs_mon (xr datasets)\n",
    "- ds_q_obs_yr (xr datasets)\n",
    "- dr_q_obs_seasonal (xr datasets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aae69e55",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# read monthly data\n",
    "ds_q = xr.open_dataset(\n",
    "    \"%s/D09/coastal-stns-Vol-monthly.updated-May2019.mod.nc\" % (ref_flow_dir),\n",
    "    decode_times=False,\n",
    ")\n",
    "ds_q[\"time\"] = xr.cftime_range(\n",
    "    start=\"1900-01-01\", end=\"2018-12-01\", freq=\"MS\", calendar=\"standard\"\n",
    ")\n",
    "\n",
    "# monthly- if time_period is outside observation period, use the entire obs period\n",
    "obs_available = True\n",
    "if ds_q[\"time\"].sel(time=slice(str(mon_time[0]), str(mon_time[-1]))).values.size == 0:\n",
    "    obs_available = False\n",
    "    ds_q_obs_mon = ds_q[[\"FLOW\", \"lat\", \"lon\"]].set_coords([\"lat\", \"lon\"])\n",
    "else:\n",
    "    ds_q_obs_mon = (\n",
    "        ds_q[[\"FLOW\", \"lat\", \"lon\"]]\n",
    "        .sel(time=slice(str(mon_time[0]), str(mon_time[-1])))\n",
    "        .set_coords([\"lat\", \"lon\"])\n",
    "    )\n",
    "# compute annual flow from monthly\n",
    "ds_q_obs_yr = ds_q_obs_mon.resample(time=\"YE\").mean(dim=\"time\")\n",
    "# compute annual cycle at monthly scale\n",
    "dr_q_obs_seasonal = ds_q_obs_mon.groupby(\"time.month\").mean(\"time\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c2f2c1b",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "## 3. Reading river, catchment, gauge infomation  <a id='read_ancillary'></a>\n",
    "\n",
    "- gauge-catchment (or grid box) link (csv)\n",
    "- gauge point (geopackage)\n",
    "- ocean polygon (geopackage)\n",
    "- catchment polygon (geopackage)\n",
    "- outlet reach information (netCDF)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8af951b5",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "### 3.1. reach-D19 gauge link csv\n",
    "- gauge_reach_lnk (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9403600d",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "gauge_reach_lnk = {}\n",
    "for case, meta in case_dic.items():\n",
    "    gauge_reach_lnk[case] = pd.read_csv(\n",
    "        \"%s/D09/D09_925.%s.asc\" % (ref_flow_dir, case_meta[meta[\"grid\"]][\"network\"])\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee3f31d7",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "### 3.2 D19 flow site geopackage\n",
    "- gauge_shp (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "044d9aad",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "gauge_shp = gpd.read_file(\n",
    "    os.path.join(ref_flow_dir, \"D09\", \"geospatial\", \"D09_925.gpkg\")\n",
    ")\n",
    "gauge_shp = gauge_shp[gauge_shp[\"id\"] != 9999999]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a3e5db9",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "### 3.3 Ocean polygon geopackage\n",
    "- ocean_shp (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "020a3da4",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "ocean_shp = gpd.read_file(os.path.join(geospatial_dir, \"oceans.gpkg\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e2a1367",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "### 3.4 Read river network information\n",
    "- gdf_cat (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13649934",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-cell"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "## read catchment geopackage\n",
    "gdf_cat = {}\n",
    "for case, meta in case_dic.items():\n",
    "    cat_gpkg = os.path.join(\n",
    "        geospatial_dir, catch_gpkg[meta[\"grid\"]][\"file_name\"]\n",
    "    )  # geopackage name\n",
    "    id_name_cat = catch_gpkg[meta[\"grid\"]][\"id_name\"]  # reach ID in geopackage\n",
    "    var_list = [id_name_cat]\n",
    "    if \"lk\" in grid_name:\n",
    "        var_list.append(\"lake\")\n",
    "    gdf_cat[case] = read_shps([cat_gpkg], var_list)\n",
    "    gdf_cat[case][\"centroid_longitude\"] = gdf_cat[case][\"geometry\"].centroid.x\n",
    "    gdf_cat[case][\"centroid_latitude\"] = gdf_cat[case][\"geometry\"].centroid.y"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "389b515f",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "### 3.5 Read river outlet information\n",
    "- Apppend into gdf_cat (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c17a160",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# read river outlet netcdf\n",
    "riv_ocean = {}\n",
    "for case, meta in case_dic.items():\n",
    "    riv_ocean_file = os.path.join(\n",
    "        domain_dir,\n",
    "        network_nc[meta[\"grid\"]][\"file_name\"].replace(\".aug.nc\", \".outlet.nc\"),\n",
    "    )  # network netcdf name\n",
    "    ds_rn_ocean = xr.open_dataset(riv_ocean_file).set_index(seg=\"seg_id\")\n",
    "    df_tmp = ds_rn_ocean.to_dataframe()\n",
    "    riv_ocean[case] = pd.merge(\n",
    "        gdf_cat[case],\n",
    "        df_tmp,\n",
    "        left_on=catch_gpkg[meta[\"grid\"]][\"id_name\"],\n",
    "        right_index=True,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0ab7c93b",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 3.6 Merge gauge, outlet catchment dataframe\n",
    "\n",
    "- gauge_shp1 (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc3677b5",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# Merge gauge_reach lnk (dataframe) into gauge shapefile\n",
    "gauge_shp1 = {}\n",
    "for case, meta in case_dic.items():\n",
    "    df = gauge_reach_lnk[case]\n",
    "\n",
    "    # df = df.loc[(df['flag'] == 0)]\n",
    "    df1 = df.drop(columns=[\"riv_name\"])\n",
    "    df2 = pd.merge(gauge_shp, df1, how=\"inner\", left_on=\"id\", right_on=\"gauge_id\")\n",
    "    gauge_shp1[case] = pd.merge(\n",
    "        df2,\n",
    "        riv_ocean[case],\n",
    "        how=\"inner\",\n",
    "        left_on=\"route_id\",\n",
    "        right_on=catch_gpkg[meta[\"grid\"]][\"id_name\"],\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1975cac7",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "------\n",
    "## 4. Plot annual cycle for global oceans <a id='annual_cycle_q'></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce7b066a",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%time\n",
    "\n",
    "nrows = 4\n",
    "ncols = 2\n",
    "fig, axes = plt.subplots(nrows, ncols, figsize=(7.25, 6.5))\n",
    "plt.subplots_adjust(\n",
    "    top=0.95, bottom=0.065, right=0.98, left=0.10, hspace=0.225, wspace=0.250\n",
    ")  # create some space below the plots by increasing the bottom-value\n",
    "\n",
    "for ix, ocean_name in enumerate(oceans_list):\n",
    "    row = ix // 2\n",
    "    col = ix % 2\n",
    "    for case, meta in case_dic.items():\n",
    "\n",
    "        q_name = case_meta[meta[\"grid\"]][\"flow_name\"]\n",
    "\n",
    "        if case_meta[meta[\"grid\"]][\"network_type\"] == \"vector\":\n",
    "            if ocean_name == \"global\":\n",
    "                id_list = gauge_shp1[case][\"route_id\"].values\n",
    "            else:\n",
    "                id_list = gauge_shp1[case][gauge_shp1[case][\"ocean\"] == ocean_name][\n",
    "                    \"route_id\"\n",
    "                ].values\n",
    "            reach_index = get_index_array(reachID[case], id_list)\n",
    "            dr_flow = seas_data[case][q_name].isel(seg=reach_index).sum(dim=\"seg\")\n",
    "            dr_flow.plot(ax=axes[row, col], linestyle=\"-\", lw=0.75, label=case)\n",
    "\n",
    "        elif case_meta[grid_name][\"network_type\"] == \"grid\":  # means 2d grid\n",
    "            if ocean_name == \"global\":\n",
    "                id_list = gauge_shp1[case][\"route_id\"].values\n",
    "            else:\n",
    "                id_list = gauge_shp1[case][gauge_shp1[case][\"ocean\"] == ocean_name][\n",
    "                    \"route_id\"\n",
    "                ].values\n",
    "\n",
    "            reach_index = get_index_array(reachID[case], id_list)\n",
    "            seas_data_vector = seas_data[case][q_name].stack(seg=(\"lat\", \"lon\"))\n",
    "            dr_flow = seas_data_vector.isel(seg=reach_index).sum(dim=\"seg\")\n",
    "            dr_flow.plot(ax=axes[row, col], linestyle=\"-\", lw=0.75, label=case)\n",
    "\n",
    "    # reference data\n",
    "    if obs_available:\n",
    "        if ocean_name == \"global\":\n",
    "            id_list = gauge_shp1[case][\"id\"].values\n",
    "        else:\n",
    "            id_list = gauge_shp1[case][gauge_shp1[case][\"ocean\"] == ocean_name][\n",
    "                \"id\"\n",
    "            ].values\n",
    "        gauge_index = get_index_array(ds_q[\"id\"].values, id_list)\n",
    "        dr_obs = dr_q_obs_seasonal.isel(station=gauge_index).sum(dim=\"station\")\n",
    "        dr_obs.plot(\n",
    "            ax=axes[row, col],\n",
    "            linestyle=\"None\",\n",
    "            marker=\"o\",\n",
    "            markersize=2,\n",
    "            c=\"k\",\n",
    "            label=\"D19\",\n",
    "        )\n",
    "\n",
    "    axes[row, col].set_title(\"%d %s\" % (ix + 1, ocean_name), fontsize=9)\n",
    "    axes[row, col].set_xlabel(\"\")\n",
    "    if row < 7:\n",
    "        axes[row, col].set_xticklabels(\"\")\n",
    "    if col == 0:\n",
    "        axes[row, col].set_ylabel(\"Mon. flow [m$^3$/s]\", fontsize=9)\n",
    "    else:\n",
    "        axes[row, col].set_ylabel(\"\")\n",
    "    axes[row, col].tick_params(\"both\", labelsize=\"x-small\")\n",
    "\n",
    "# Legend- make space below the plot-raise bottom. there will be an label below the second last (bottom middle) ax, thanks to the bbox_to_anchor=(x, y) with a negative y-value.\n",
    "axes[row, col].legend(\n",
    "    loc=\"center left\", bbox_to_anchor=(1.10, 0.40, 0.75, 0.1), ncol=1, fontsize=\"small\"\n",
    ")\n",
    "\n",
    "for jx in range(ix + 1, nrows * ncols):\n",
    "    row = jx // 2\n",
    "    col = jx % 2\n",
    "    fig.delaxes(axes[row][col])\n",
    "\n",
    "if figureSave:\n",
    "    plt.savefig(\n",
    "        f\"{out_dir}/NB2_Fig1_ocean_discharge_season_{analysis_name}.png\", dpi=200\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "78f4716c",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "------\n",
    "## 5. Plot zonal mean annual dicharge to the oceans <a id='zonal_mean_annual_q'></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2f1376f",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": [
     "hide-cell"
    ]
   },
   "outputs": [],
   "source": [
    "%time\n",
    "# compute zonal mean total discharge from all the outlets in the modeled river networks.\n",
    "oceans_list = [\"atlantic\", \"pacific\", \"indian\", \"global\"]\n",
    "\n",
    "# Define latitude bins every 1 degree\n",
    "lat_bins = np.arange(-80, 81, 1.0)\n",
    "# Compute bin centers for labeling\n",
    "lat_bin_centers = 0.5 * (lat_bins[:-1] + lat_bins[1:])\n",
    "\n",
    "for ix, ocean_name in enumerate(oceans_list):\n",
    "\n",
    "    nrows = 1\n",
    "    ncols = 1\n",
    "    fig, axes = plt.subplots(nrows, ncols, figsize=(6.5, 5.0))\n",
    "    plt.subplots_adjust(\n",
    "        top=0.95, bottom=0.065, right=0.98, left=0.10, hspace=0.225, wspace=0.250\n",
    "    )  # create some space below the plots by increasing the bottom-value\n",
    "\n",
    "    axes_right = axes.twinx()\n",
    "\n",
    "    for case, meta in case_dic.items():\n",
    "        grid_name = meta[\"grid\"]\n",
    "        q_name = case_meta[meta[\"grid\"]][\"flow_name\"]\n",
    "\n",
    "        # extract flow data and latitude at outlet locations (index)\n",
    "        if ocean_name == \"global\":\n",
    "            id_list = gauge_shp1[case][\"route_id\"].values\n",
    "        else:\n",
    "            id_list = gauge_shp1[case][gauge_shp1[case][\"ocean\"] == ocean_name][\n",
    "                \"route_id\"\n",
    "            ].values\n",
    "        reach_index = get_index_array(reachID[case], id_list)\n",
    "\n",
    "        if case_meta[grid_name][\"network_type\"] == \"vector\":\n",
    "            # 1. Extract the mapping from riv_ocean (HRU → latitude)\n",
    "            lat_map = riv_ocean[case].set_index(\"hruid\")[\"centroid_latitude\"]\n",
    "\n",
    "            # 2. Convert that Series into an xarray DataArray\n",
    "            lat_da = xr.DataArray(\n",
    "                lat_map, dims=[\"hruid\"], coords={\"hruid\": lat_map.index}, name=\"lat\"\n",
    "            )\n",
    "\n",
    "            # 3. Align `lat_da` with the reachID coordinate in seas_data\n",
    "            #    (We assume reachID corresponds to hruid values)\n",
    "            lat_for_reach = lat_da.sel(\n",
    "                hruid=xr.DataArray(seas_data[case][\"reachID\"].values, dims=[\"seg\"])\n",
    "            )\n",
    "\n",
    "            # 4. Assign it as a new variable in seas_data\n",
    "            seas_data[case][\"lat\"] = lat_for_reach\n",
    "\n",
    "            ds_flow = seas_data[case][[q_name, \"lat\"]].isel(seg=reach_index)\n",
    "\n",
    "            # Group by latitude bins and take the mean across segments in each bin\n",
    "            zonal_mean = (\n",
    "                ds_flow[q_name]\n",
    "                .mean(dim=\"month\")\n",
    "                .groupby_bins(ds_flow[\"lat\"], bins=lat_bins, labels=lat_bin_centers)\n",
    "                .sum(dim=\"seg\", skipna=True)\n",
    "            ) / 1000000\n",
    "\n",
    "        elif case_meta[grid_name][\"network_type\"] == \"grid\":  # means 2d grid\n",
    "\n",
    "            seas_data_vector = seas_data[case][q_name].stack(seg=(\"lat\", \"lon\"))\n",
    "            dr_flow = seas_data_vector.isel(seg=reach_index)\n",
    "            zonal_mean = (\n",
    "                dr_flow.mean(dim=\"month\")\n",
    "                .groupby_bins(dr_flow[\"lat\"], bins=lat_bins, labels=lat_bin_centers)\n",
    "                .sum(dim=\"seg\", skipna=True)\n",
    "            ) / 1000000\n",
    "\n",
    "        # Rename the bin dimension to 'lat' for clarity\n",
    "        zonal_mean = zonal_mean.rename({\"lat_bins\": \"lat\"})\n",
    "        zonal_mean = zonal_mean.assign_coords(lat=(\"lat\", lat_bin_centers))\n",
    "\n",
    "        # plot zonal mean discharge per 1-degree lat\n",
    "        zonal_mean.plot(ax=axes, linestyle=\"-\", lw=0.75, label=case)\n",
    "        # plot cumulative zonal mean discharge per 1-degree lat\n",
    "        zonal_mean.cumsum(dim=\"lat\").plot(\n",
    "            ax=axes_right, linestyle=\"--\", lw=1.0, label=case\n",
    "        )\n",
    "\n",
    "    # reference data\n",
    "    if obs_available:\n",
    "        if ocean_name == \"global\":\n",
    "            id_list = gauge_shp1[case][\"id\"].values\n",
    "        else:\n",
    "            id_list = gauge_shp1[case][gauge_shp1[case][\"ocean\"] == ocean_name][\n",
    "                \"id\"\n",
    "            ].values\n",
    "        gauge_index = get_index_array(ds_q[\"id\"].values, id_list)\n",
    "        dr_obs = dr_q_obs_seasonal.isel(station=gauge_index)\n",
    "        zonal_mean_obs = (\n",
    "            dr_obs[\"FLOW\"]\n",
    "            .mean(dim=\"month\")\n",
    "            .groupby_bins(dr_obs[\"lat\"], bins=lat_bins, labels=lat_bin_centers)\n",
    "            .sum(dim=\"station\", skipna=True)\n",
    "        ) / 1000000\n",
    "\n",
    "        zonal_mean_obs.plot(\n",
    "            ax=axes,\n",
    "            linestyle=\"None\",\n",
    "            marker=\"o\",\n",
    "            markersize=2,\n",
    "            c=\"k\",\n",
    "            label=\"D19\",\n",
    "            zorder=0,\n",
    "        )\n",
    "\n",
    "    axes.set_title(\"%s\" % (ocean_name), fontsize=10)\n",
    "    axes.set_xlabel(\"Latitude\")\n",
    "\n",
    "    axes.set_xticks(np.arange(-80, 81, 20))\n",
    "    axes.set_xticklabels([\"80S\", \"60S\", \"40S\", \"20S\", \"0\", \"20N\", \"40N\", \"60N\", \"80N\"])\n",
    "    axes.set_ylabel(\n",
    "        \"Total annual discharge to ocean [10$^6$ m$^3$/s per degree lat]\", fontsize=10\n",
    "    )\n",
    "    axes.tick_params(\"both\", labelsize=\"small\")\n",
    "\n",
    "    axes_right.set_ylabel(\n",
    "        \"Total annual discharge accumulated from 90S [10$^6$ m$^3$/s]\", fontsize=10\n",
    "    )\n",
    "    axes_right.tick_params(\"both\", labelsize=\"small\")\n",
    "\n",
    "    # Legend- make space below the plot-raise bottom. there will be an label below the second last (bottom middle) ax, thanks to the bbox_to_anchor=(x, y) with a negative y-value.\n",
    "    axes.legend(\n",
    "        loc=\"center left\",\n",
    "        bbox_to_anchor=(0.025, 0.875, 0.75, 0.1),\n",
    "        ncol=1,\n",
    "        fontsize=\"small\",\n",
    "    )\n",
    "\n",
    "    if figureSave:\n",
    "        plt.savefig(\n",
    "            f\"{out_dir}/NB2_Fig3_global_zonal_mean_annal_discharge_{ocean_name}_{analysis_name}_gauge_only.png\",\n",
    "            dpi=300,\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8615882b",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-cell"
    ]
   },
   "outputs": [],
   "source": [
    "if client and not cupid_run:\n",
    "    client.shutdown()\n",
    "    print(\"Dask client shutdown complete.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cupid-analysis",
   "language": "python",
   "name": "cupid-analysis"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "papermill": {
   "duration": 3.12963,
   "end_time": "2025-11-25T01:43:00.250669",
   "exception": true,
   "input_path": "/glade/derecho/scratch/richling/tmp/tmp60rdxmui.ipynb",
   "output_path": "/glade/work/richling/cupid-test-2/CUPiD/examples/key_metrics/computed_notebooks/rof/global_discharge_ocean_compare_obs.ipynb",
   "parameters": {
    "CESM_output_dir": "/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CESM_output_for_testing",
    "analysis_name": "",
    "base_case_name": "b.e30_alpha07c_cesm.B1850C_LTso.ne30_t232_wgx3.228",
    "base_case_nickname": "BLT1850_228",
    "base_climo_end_year": 45,
    "base_climo_start_year": 36,
    "base_end_date": "0045-01-01",
    "base_start_date": "0001-01-01",
    "case_name": "b.e30_alpha07c_cesm.B1850C_LTso.ne30_t232_wgx3.232",
    "case_nickname": "BLT1850_232",
    "climo_end_year": 21,
    "climo_nyears": 10,
    "climo_start_year": 11,
    "end_date": "0021-01-01",
    "figureSave": false,
    "grid_name": "f09_f09_mosart",
    "hist_str": "h0a",
    "lc_kwargs": {
     "threads_per_worker": 1
    },
    "obs_data_dir": "/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CUPiD_obs_data",
    "product": "/glade/work/richling/cupid-test-2/CUPiD/examples/key_metrics/computed_notebooks//rof/global_discharge_ocean_compare_obs.ipynb",
    "serial": false,
    "start_date": "0001-01-01",
    "subset_kwargs": {},
    "ts_dir": null
   },
   "start_time": "2025-11-25T01:42:57.121039"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}