{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b5f106cb",
   "metadata": {
    "tags": [
     "papermill-error-cell-tag"
    ]
   },
   "source": [
    "<span style=\"color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;\">An Exception was encountered at '<a href=\"#papermill-error-cell\">In [4]</a>'.</span>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f93e9d49",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": 0.004275,
     "end_time": "2025-10-10T23:20:57.276581",
     "exception": false,
     "start_time": "2025-10-10T23:20:57.272306",
     "status": "completed"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "# ROF monthly, annual, seasonal discharge at ocean outlets <a id='top'></a>\n",
    "\n",
    "Use the following datasets\n",
    "\n",
    "1. reach-D19 gauge link ascii\n",
    "2. D19 flow site geopackage\n",
    "3. D19 discharge netCDF\n",
    "4. monthly and yearly flow netCD (history file)\n",
    "\n",
    "[1. Setupt](#setup)\n",
    "\n",
    "\n",
    "[2. Loading discharge data](#load_discharge_data)\n",
    "\n",
    "- Read monthly history files from archive. \n",
    "- Reference data: monthly discharge estimates at 922 big river mouths from Dai et al. 2019 data (D19)\n",
    "\n",
    "[3. Read river, catchment, gauge information](#read_ancillary)\n",
    "\n",
    "- catchment polygon (geopackage)\n",
    "- gauge point (geopackage)\n",
    "- gauge-catchment link (csv)\n",
    "- outlet reach information (netCDF) including discharging ocean names\n",
    "\n",
    "[4. Ocean discharge line plots](#24_large_rivers)\n",
    "\n",
    "- total seasonal flow for oceans. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1ad57f57",
   "metadata": {
    "editable": true,
    "execution": {
     "iopub.execute_input": "2025-10-10T23:20:57.285332Z",
     "iopub.status.busy": "2025-10-10T23:20:57.285058Z",
     "iopub.status.idle": "2025-10-10T23:20:58.478372Z",
     "shell.execute_reply": "2025-10-10T23:20:58.477928Z"
    },
    "papermill": {
     "duration": 1.198751,
     "end_time": "2025-10-10T23:20:58.479215",
     "exception": false,
     "start_time": "2025-10-10T23:20:57.280464",
     "status": "completed"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "The Python version: 3.11.4\n",
      "xarray 2025.7.1\n",
      "pandas 2.3.1\n",
      "geopandas 1.1.1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR 1: PROJ: proj_create_from_database: Open of /glade/work/richling/conda-envs/cupid-analysis/share/proj failed\n"
     ]
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import os, sys\n",
    "import glob\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import xarray as xr\n",
    "import cartopy.feature as cfeature\n",
    "\n",
    "from scripts.utility import load_yaml\n",
    "from scripts.utility import no_time_variable\n",
    "from scripts.utility import read_shps\n",
    "from scripts.utility import get_index_array\n",
    "\n",
    "rivers_50m = cfeature.NaturalEarthFeature(\"physical\", \"rivers_lake_centerlines\", \"50m\")\n",
    "land = cfeature.LAND\n",
    "\n",
    "print(\"\\nThe Python version: %s.%s.%s\" % sys.version_info[:3])\n",
    "print(xr.__name__, xr.__version__)\n",
    "print(pd.__name__, pd.__version__)\n",
    "print(gpd.__name__, gpd.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "57997a80",
   "metadata": {
    "papermill": {
     "duration": 0.003846,
     "end_time": "2025-10-10T23:20:58.498928",
     "exception": false,
     "start_time": "2025-10-10T23:20:58.495082",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "-------------------------\n",
    "## 1. Setup <a id='setup'></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "33dfcf7d",
   "metadata": {
    "editable": true,
    "execution": {
     "iopub.execute_input": "2025-10-10T23:20:58.507812Z",
     "iopub.status.busy": "2025-10-10T23:20:58.507388Z",
     "iopub.status.idle": "2025-10-10T23:20:58.511352Z",
     "shell.execute_reply": "2025-10-10T23:20:58.510870Z"
    },
    "papermill": {
     "duration": 0.009824,
     "end_time": "2025-10-10T23:20:58.512563",
     "exception": false,
     "start_time": "2025-10-10T23:20:58.502739",
     "status": "completed"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "parameters",
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameter Defaults\n",
    "# parameters are set in CUPiD's config.yml file\n",
    "# when running interactively, manually set the parameters below\n",
    "\n",
    "CESM_output_dir = \"\"\n",
    "case_name = None  # case name: e.g., \"b.e30_beta02.BLT1850.ne30_t232.104\"\n",
    "base_case_name = None  # base case name: e.g., \"b.e23_alpha17f.BLT1850.ne30_t232.092\"\n",
    "start_date = \"\"\n",
    "end_date = \"\"\n",
    "base_start_date = \"\"  # base simulation starting date: \"0001-01-01\"\n",
    "base_end_date = \"\"  # base simulation ending date: \"0100-01-01\"\n",
    "serial = True  # use dask LocalCluster\n",
    "lc_kwargs = {}\n",
    "\n",
    "hist_str = \"\"  # Used for hist file tag\n",
    "analysis_name = \"\"  # Used for Figure png names\n",
    "climo_nyears = 10  # number of years to compute the climatology\n",
    "grid_name = \"f09_f09_mosart\"  # ROF grid name used in case\n",
    "base_grid_name = (\n",
    "    grid_name  # spcify ROF grid name for base_case in config.yml if different than case\n",
    ")\n",
    "figureSave = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bbc1e852",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-10-10T23:20:58.525565Z",
     "iopub.status.busy": "2025-10-10T23:20:58.525375Z",
     "iopub.status.idle": "2025-10-10T23:20:58.528841Z",
     "shell.execute_reply": "2025-10-10T23:20:58.528465Z"
    },
    "papermill": {
     "duration": 0.008341,
     "end_time": "2025-10-10T23:20:58.529463",
     "exception": false,
     "start_time": "2025-10-10T23:20:58.521122",
     "status": "completed"
    },
    "tags": [
     "injected-parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameters\n",
    "case_name = \"b.e30_beta02.BLT1850.ne30_t232.104\"\n",
    "base_case_name = \"b.e23_alpha17f.BLT1850.ne30_t232.092\"\n",
    "CESM_output_dir = \"/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CESM_output_for_testing\"\n",
    "start_date = \"0001-01-01\"\n",
    "end_date = \"0101-01-01\"\n",
    "base_start_date = \"0001-01-01\"\n",
    "base_end_date = \"0101-01-01\"\n",
    "obs_data_dir = (\n",
    "    \"/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CUPiD_obs_data\"\n",
    ")\n",
    "ts_dir = None\n",
    "lc_kwargs = {\"threads_per_worker\": 1}\n",
    "serial = False\n",
    "hist_str = \"h0a\"\n",
    "analysis_name = \"\"\n",
    "grid_name = \"f09_f09_mosart\"\n",
    "climo_nyears = 10\n",
    "figureSave = False\n",
    "subset_kwargs = {}\n",
    "product = \"/glade/work/richling/CUPid_pr_test/CUPiD/examples/key_metrics/computed_notebooks//rof/global_discharge_ocean_compare_obs.ipynb\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a571a45",
   "metadata": {
    "tags": [
     "papermill-error-cell-tag"
    ]
   },
   "source": [
    "<span id=\"papermill-error-cell\" style=\"color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;\">Execution using papermill encountered an exception here and stopped:</span>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "eb6b285c",
   "metadata": {
    "editable": true,
    "execution": {
     "iopub.execute_input": "2025-10-10T23:20:58.538998Z",
     "iopub.status.busy": "2025-10-10T23:20:58.538770Z",
     "iopub.status.idle": "2025-10-10T23:20:58.934596Z",
     "shell.execute_reply": "2025-10-10T23:20:58.933487Z"
    },
    "papermill": {
     "duration": 0.402188,
     "end_time": "2025-10-10T23:20:58.935458",
     "exception": true,
     "start_time": "2025-10-10T23:20:58.533270",
     "status": "failed"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "unsupported operand type(s) for +=: 'dict' and 'dict'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mTypeError\u001b[39m                                 Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 27\u001b[39m\n\u001b[32m     19\u001b[39m case_dic = {\n\u001b[32m     20\u001b[39m     case_name: {\n\u001b[32m     21\u001b[39m         \u001b[33m\"\u001b[39m\u001b[33mgrid\u001b[39m\u001b[33m\"\u001b[39m: grid_name,\n\u001b[32m   (...)\u001b[39m\u001b[32m     24\u001b[39m     }\n\u001b[32m     25\u001b[39m }\n\u001b[32m     26\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m base_case_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m27\u001b[39m     \u001b[43mcase_dic\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m     28\u001b[39m \u001b[43m        \u001b[49m\u001b[43mbase_case_name\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m     29\u001b[39m \u001b[43m            \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgrid\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrid_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     30\u001b[39m \u001b[43m            \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43msim_period\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mslice\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mbase_start_date\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mbase_end_date\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     31\u001b[39m \u001b[43m            \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mclimo_nyrs\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mmin\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[32m     32\u001b[39m \u001b[43m                \u001b[49m\u001b[43mclimo_nyears\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mbase_end_date\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[32;43m4\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m-\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mbase_start_date\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[32;43m4\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m1\u001b[39;49m\n\u001b[32m     33\u001b[39m \u001b[43m            \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m     34\u001b[39m \u001b[43m        \u001b[49m\u001b[43m}\u001b[49m\n\u001b[32m     35\u001b[39m \u001b[43m    \u001b[49m\u001b[43m}\u001b[49m\n\u001b[32m     37\u001b[39m oceans_list = [\n\u001b[32m     38\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33marctic\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m     39\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33matlantic\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m   (...)\u001b[39m\u001b[32m     44\u001b[39m     \u001b[33m\"\u001b[39m\u001b[33mglobal\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m     45\u001b[39m ]\n",
      "\u001b[31mTypeError\u001b[39m: unsupported operand type(s) for +=: 'dict' and 'dict'"
     ]
    }
   ],
   "source": [
    "# ROF additional setup\n",
    "setup = load_yaml(\"./setup/setup.yaml\")\n",
    "\n",
    "domain_dir = setup[\n",
    "    \"ancillary_dir\"\n",
    "]  # ancillary directory including such as ROF domain, river network data\n",
    "geospatial_dir = setup[\"geospatial_dir\"]  # including shapefiles or geopackages\n",
    "ref_flow_dir = setup[\"ref_flow_dir\"]  # including observed or reference flow data\n",
    "case_meta = setup[\"case_meta\"]  # RO grid meta\n",
    "catch_gpkg = setup[\"catch_gpkg\"]  # catchment geopackage meta\n",
    "reach_gpkg = setup[\"reach_gpkg\"]  # reach geopackage meta\n",
    "network_nc = setup[\"river_network\"]  # river network meta\n",
    "\n",
    "if not analysis_name:\n",
    "    analysis_name = case_name\n",
    "if base_grid_name:\n",
    "    base_grid_name = grid_name\n",
    "\n",
    "case_dic = {\n",
    "    case_name: {\n",
    "        \"grid\": grid_name,\n",
    "        \"sim_period\": slice(f\"{start_date}\", f\"{end_date}\"),\n",
    "        \"climo_nyrs\": min(climo_nyears, int(end_date[:4]) - int(start_date[:4]) + 1),\n",
    "    }\n",
    "}\n",
    "if base_case_name is not None:\n",
    "    case_dic += {\n",
    "        base_case_name: {\n",
    "            \"grid\": grid_name,\n",
    "            \"sim_period\": slice(f\"{base_start_date}\", f\"{base_end_date}\"),\n",
    "            \"climo_nyrs\": min(\n",
    "                climo_nyears, int(base_end_date[:4]) - int(base_start_date[:4]) + 1\n",
    "            ),\n",
    "        }\n",
    "    }\n",
    "\n",
    "oceans_list = [\n",
    "    \"arctic\",\n",
    "    \"atlantic\",\n",
    "    \"indian\",\n",
    "    \"mediterranean\",\n",
    "    \"pacific\",\n",
    "    \"south_china\",\n",
    "    \"global\",\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02d6bfe9",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "-----\n",
    "### dasks (optional)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63414608",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# When running interactively, cupid_run should be set to False\n",
    "cupid_run = True\n",
    "\n",
    "client = None\n",
    "if cupid_run:\n",
    "    # Spin up cluster (if running in parallel)\n",
    "    if not serial:\n",
    "        from dask.distributed import Client, LocalCluster\n",
    "\n",
    "        cluster = LocalCluster(**lc_kwargs)\n",
    "        client = Client(cluster)\n",
    "else:\n",
    "    if not serial:\n",
    "        from dask.distributed import Client\n",
    "        from dask_jobqueue import PBSCluster\n",
    "\n",
    "        cluster = PBSCluster(\n",
    "            cores=1,\n",
    "            processes=1,\n",
    "            memory=\"50GB\",\n",
    "            queue=\"casper\",\n",
    "            walltime=\"01:00:00\",\n",
    "        )\n",
    "        cluster.scale(jobs=10)\n",
    "        client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "820fd574",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "## 2. Loading discharge data <a id='load_discharge_data'></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd7f6daa",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 2.1. Monthly/annual flow netCDFs\n",
    "- month_data (xr dataset)\n",
    "- year_data (xr dataset)\n",
    "- seas_data (xr dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5399cfd",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "reachID = {}\n",
    "month_data = {}\n",
    "year_data = {}\n",
    "seas_data = {}\n",
    "for case, meta in case_dic.items():\n",
    "    in_dire = os.path.join(CESM_output_dir, case, \"rof/hist\")\n",
    "    model = case_meta[meta[\"grid\"]][\"model\"]\n",
    "    domain = case_meta[meta[\"grid\"]][\"domain_nc\"]\n",
    "    var_list = case_meta[meta[\"grid\"]][\"vars_read\"]\n",
    "\n",
    "    def preprocess(ds):\n",
    "        return ds[var_list]\n",
    "\n",
    "    year_list = [\n",
    "        \"{:04d}\".format(yr)\n",
    "        for yr in np.arange(\n",
    "            int(meta[\"sim_period\"].start[0:4]), int(meta[\"sim_period\"].stop[0:4]) + 1\n",
    "        )\n",
    "    ]\n",
    "\n",
    "    nc_list = []\n",
    "    for nc_path in sorted(glob.glob(f\"{in_dire}/{case}.{model}.{hist_str}.????-*.nc\")):\n",
    "        for yr in year_list:\n",
    "            if yr in os.path.basename(nc_path):\n",
    "                nc_list.append(nc_path)\n",
    "\n",
    "    # load data\n",
    "    ds = xr.open_mfdataset(\n",
    "        nc_list,\n",
    "        data_vars=\"minimal\",\n",
    "        parallel=True,\n",
    "        preprocess=preprocess,\n",
    "    ).sel(time=meta[\"sim_period\"])\n",
    "\n",
    "    # monthly\n",
    "    month_data[case] = ds.isel(time=slice(-meta[\"climo_nyrs\"] * 12, None))\n",
    "    # annual\n",
    "    year_data[case] = (\n",
    "        ds.isel(time=slice(-meta[\"climo_nyrs\"] * 12, None))\n",
    "        .resample(time=\"YS\")\n",
    "        .mean(dim=\"time\")\n",
    "        .load()\n",
    "    )\n",
    "    # seasonal (compute here instead of reading for conisistent analysis period)\n",
    "    seas_data[case] = (\n",
    "        ds.isel(time=slice(-meta[\"climo_nyrs\"] * 12, None))\n",
    "        .groupby(\"time.month\")\n",
    "        .mean(\"time\")\n",
    "        .load()\n",
    "    )\n",
    "    vars_no_time = no_time_variable(month_data[case])\n",
    "    if vars_no_time:\n",
    "        seas_data[case][vars_no_time] = seas_data[case][vars_no_time].isel(\n",
    "            month=0, drop=True\n",
    "        )\n",
    "    mon_time = month_data[case].time.values\n",
    "    if domain == \"None\":\n",
    "        reachID[case] = month_data[case][\"reachID\"].values\n",
    "    else:\n",
    "        reachID[case] = (\n",
    "            xr.open_dataset(f\"{domain_dir}/{domain}\")[\"reachID\"]\n",
    "            .stack(seg=(\"lat\", \"lon\"))\n",
    "            .values\n",
    "        )\n",
    "    print(f\"Finished loading {case}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b1b69aaf",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 2.2 D19 discharge data\n",
    "- ds_q_obs_mon (xr datasets)\n",
    "- ds_q_obs_yr (xr datasets)\n",
    "- dr_q_obs_seasonal (xr datasets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9e52315",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# read monthly data\n",
    "ds_q = xr.open_dataset(\n",
    "    \"%s/D09/coastal-stns-Vol-monthly.updated-May2019.mod.nc\" % (ref_flow_dir),\n",
    "    decode_times=False,\n",
    ")\n",
    "ds_q[\"time\"] = xr.cftime_range(\n",
    "    start=\"1900-01-01\", end=\"2018-12-01\", freq=\"MS\", calendar=\"standard\"\n",
    ")\n",
    "\n",
    "# monthly- if time_period is outside observation period, use the entire obs period\n",
    "obs_available = True\n",
    "if ds_q[\"time\"].sel(time=slice(str(mon_time[0]), str(mon_time[-1]))).values.size == 0:\n",
    "    obs_available = False\n",
    "    ds_q_obs_mon = ds_q[\"FLOW\"]\n",
    "else:\n",
    "    ds_q_obs_mon = ds_q[\"FLOW\"].sel(time=slice(str(mon_time[0]), str(mon_time[-1])))\n",
    "# compute annual flow from monthly\n",
    "ds_q_obs_yr = ds_q_obs_mon.resample(time=\"YE\").mean(dim=\"time\")\n",
    "# compute annual cycle at monthly scale\n",
    "dr_q_obs_seasonal = ds_q_obs_mon.groupby(\"time.month\").mean(\"time\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d3ef6a11",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "## 3. Reading river, catchment, gauge infomation  <a id='read_ancillary'></a>\n",
    "\n",
    "- gauge-catchment (or grid box) link (csv)\n",
    "- gauge point (geopackage)\n",
    "- ocean polygon (geopackage)\n",
    "- catchment polygon (geopackage)\n",
    "- outlet reach information (netCDF)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8807367c",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "### 3.1. reach-D19 gauge link csv\n",
    "- gauge_reach_lnk (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e663529f",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "gauge_reach_lnk = {}\n",
    "for case, meta in case_dic.items():\n",
    "    gauge_reach_lnk[case] = pd.read_csv(\n",
    "        \"%s/D09/D09_925.%s.asc\" % (ref_flow_dir, case_meta[meta[\"grid\"]][\"network\"])\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ea335ec",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "### 3.2 D19 flow site geopackage\n",
    "- gauge_shp (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69a19bb6",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "gauge_shp = gpd.read_file(\n",
    "    os.path.join(ref_flow_dir, \"D09\", \"geospatial\", \"D09_925.gpkg\")\n",
    ")\n",
    "gauge_shp = gauge_shp[gauge_shp[\"id\"] != 9999999]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "94f25252",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "### 3.3 Ocean polygon geopackage\n",
    "- ocean_shp (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a904520a",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "ocean_shp = gpd.read_file(os.path.join(geospatial_dir, \"oceans.gpkg\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0262c6ec",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "### 3.3 Read river network information\n",
    "- gdf_cat (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "625d5f2f",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "## read catchment geopackage\n",
    "gdf_cat = {}\n",
    "for case, meta in case_dic.items():\n",
    "    cat_gpkg = os.path.join(\n",
    "        geospatial_dir, catch_gpkg[meta[\"grid\"]][\"file_name\"]\n",
    "    )  # geopackage name\n",
    "    id_name_cat = catch_gpkg[meta[\"grid\"]][\"id_name\"]  # reach ID in geopackage\n",
    "    var_list = [id_name_cat]\n",
    "    if \"lk\" in grid_name:\n",
    "        var_list.append(\"lake\")\n",
    "    gdf_cat[case] = read_shps([cat_gpkg], var_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "240a7708",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "### 3.4 Read river outlet information\n",
    "- Apppend into gdf_cat (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b53ee36b",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# read river outlet netcdf\n",
    "riv_ocean = {}\n",
    "for case, meta in case_dic.items():\n",
    "    riv_ocean_file = os.path.join(\n",
    "        domain_dir,\n",
    "        network_nc[meta[\"grid\"]][\"file_name\"].replace(\".aug.nc\", \".outlet.nc\"),\n",
    "    )  # network netcdf name\n",
    "    ds_rn_ocean = xr.open_dataset(riv_ocean_file).set_index(seg=\"seg_id\")\n",
    "    df_tmp = ds_rn_ocean.to_dataframe()\n",
    "    riv_ocean[case] = pd.merge(\n",
    "        gdf_cat[case],\n",
    "        df_tmp,\n",
    "        left_on=catch_gpkg[meta[\"grid\"]][\"id_name\"],\n",
    "        right_index=True,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "58a4281c",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "source": [
    "### 2.6 Merge gauge, outlet catchment dataframe\n",
    "\n",
    "- gauge_shp1 (dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2913723e",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# Merge gauge_reach lnk (dataframe) into gauge shapefile\n",
    "gauge_shp1 = {}\n",
    "for case, meta in case_dic.items():\n",
    "    df = gauge_reach_lnk[case]\n",
    "\n",
    "    # df = df.loc[(df['flag'] == 0)]\n",
    "    df1 = df.drop(columns=[\"riv_name\"])\n",
    "    df2 = pd.merge(gauge_shp, df1, how=\"inner\", left_on=\"id\", right_on=\"gauge_id\")\n",
    "    gauge_shp1[case] = pd.merge(\n",
    "        df2,\n",
    "        riv_ocean[case],\n",
    "        how=\"inner\",\n",
    "        left_on=\"route_id\",\n",
    "        right_on=catch_gpkg[meta[\"grid\"]][\"id_name\"],\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b8cf8573",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "------\n",
    "## 3. Plot annual cycle for global oceans <a id='24_large_rivers'></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f86354c",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "%time\n",
    "\n",
    "nrows = 4\n",
    "ncols = 2\n",
    "fig, axes = plt.subplots(nrows, ncols, figsize=(7.25, 6.5))\n",
    "plt.subplots_adjust(\n",
    "    top=0.95, bottom=0.065, right=0.98, left=0.10, hspace=0.225, wspace=0.250\n",
    ")  # create some space below the plots by increasing the bottom-value\n",
    "\n",
    "for ix, ocean_name in enumerate(oceans_list):\n",
    "    row = ix // 2\n",
    "    col = ix % 2\n",
    "    for case, meta in case_dic.items():\n",
    "\n",
    "        q_name = case_meta[meta[\"grid\"]][\"flow_name\"]\n",
    "\n",
    "        if case_meta[meta[\"grid\"]][\"network_type\"] == \"vector\":\n",
    "            if ocean_name == \"global\":\n",
    "                id_list = gauge_shp1[case][\"route_id\"].values\n",
    "            else:\n",
    "                id_list = gauge_shp1[case][gauge_shp1[case][\"ocean\"] == ocean_name][\n",
    "                    \"route_id\"\n",
    "                ].values\n",
    "            reach_index = get_index_array(reachID[case], id_list)\n",
    "            dr_flow = seas_data[case][q_name].isel(seg=reach_index).sum(dim=\"seg\")\n",
    "            dr_flow.plot(ax=axes[row, col], linestyle=\"-\", lw=0.75, label=case)\n",
    "\n",
    "        elif case_meta[grid_name][\"network_type\"] == \"grid\":  # means 2d grid\n",
    "            if ocean_name == \"global\":\n",
    "                id_list = gauge_shp1[case][\"route_id\"].values\n",
    "            else:\n",
    "                id_list = gauge_shp1[case][gauge_shp1[case][\"ocean\"] == ocean_name][\n",
    "                    \"route_id\"\n",
    "                ].values\n",
    "\n",
    "            reach_index = get_index_array(reachID[case], id_list)\n",
    "            seas_data_vector = seas_data[case][q_name].stack(seg=(\"lat\", \"lon\"))\n",
    "            dr_flow = seas_data_vector.isel(seg=reach_index).sum(dim=\"seg\")\n",
    "            dr_flow.plot(ax=axes[row, col], linestyle=\"-\", lw=0.75, label=case)\n",
    "\n",
    "    # reference data\n",
    "    if obs_available:\n",
    "        if ocean_name == \"global\":\n",
    "            id_list = gauge_shp1[case][\"id\"].values\n",
    "        else:\n",
    "            id_list = gauge_shp1[case][gauge_shp1[case][\"ocean\"] == ocean_name][\n",
    "                \"id\"\n",
    "            ].values\n",
    "        gauge_index = get_index_array(ds_q[\"id\"].values, id_list)\n",
    "        dr_obs = dr_q_obs_seasonal.isel(station=gauge_index).sum(dim=\"station\")\n",
    "        dr_obs.plot(\n",
    "            ax=axes[row, col],\n",
    "            linestyle=\"None\",\n",
    "            marker=\"o\",\n",
    "            markersize=2,\n",
    "            c=\"k\",\n",
    "            label=\"D19\",\n",
    "        )\n",
    "\n",
    "    axes[row, col].set_title(\"%d %s\" % (ix + 1, ocean_name), fontsize=9)\n",
    "    axes[row, col].set_xlabel(\"\")\n",
    "    if row < 7:\n",
    "        axes[row, col].set_xticklabels(\"\")\n",
    "    if col == 0:\n",
    "        axes[row, col].set_ylabel(\"Mon. flow [m$^3$/s]\", fontsize=9)\n",
    "    else:\n",
    "        axes[row, col].set_ylabel(\"\")\n",
    "    axes[row, col].tick_params(\"both\", labelsize=\"x-small\")\n",
    "\n",
    "# Legend- make space below the plot-raise bottom. there will be an label below the second last (bottom middle) ax, thanks to the bbox_to_anchor=(x, y) with a negative y-value.\n",
    "axes[row, col].legend(\n",
    "    loc=\"center left\", bbox_to_anchor=(1.10, 0.40, 0.75, 0.1), ncol=1, fontsize=\"small\"\n",
    ")\n",
    "\n",
    "for jx in range(ix + 1, nrows * ncols):\n",
    "    row = jx // 2\n",
    "    col = jx % 2\n",
    "    fig.delaxes(axes[row][col])\n",
    "\n",
    "if figureSave:\n",
    "    plt.savefig(f\"./NB2_Fig1_ocean_discharge_season_{analysis_name}.png\", dpi=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2866ba70",
   "metadata": {
    "editable": true,
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "if client:\n",
    "    client.shutdown()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cupid-analysis",
   "language": "python",
   "name": "cupid-analysis"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "papermill": {
   "duration": 3.338264,
   "end_time": "2025-10-10T23:20:59.563087",
   "exception": true,
   "input_path": "/glade/derecho/scratch/richling/tmp/tmpr1qvkolk.ipynb",
   "output_path": "/glade/work/richling/CUPid_pr_test/CUPiD/examples/key_metrics/computed_notebooks/rof/global_discharge_ocean_compare_obs.ipynb",
   "parameters": {
    "CESM_output_dir": "/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CESM_output_for_testing",
    "analysis_name": "",
    "base_case_name": "b.e23_alpha17f.BLT1850.ne30_t232.092",
    "base_end_date": "0101-01-01",
    "base_start_date": "0001-01-01",
    "case_name": "b.e30_beta02.BLT1850.ne30_t232.104",
    "climo_nyears": 10,
    "end_date": "0101-01-01",
    "figureSave": false,
    "grid_name": "f09_f09_mosart",
    "hist_str": "h0a",
    "lc_kwargs": {
     "threads_per_worker": 1
    },
    "obs_data_dir": "/glade/campaign/cesm/development/cross-wg/diagnostic_framework/CUPiD_obs_data",
    "product": "/glade/work/richling/CUPid_pr_test/CUPiD/examples/key_metrics/computed_notebooks//rof/global_discharge_ocean_compare_obs.ipynb",
    "serial": false,
    "start_date": "0001-01-01",
    "subset_kwargs": {},
    "ts_dir": null
   },
   "start_time": "2025-10-10T23:20:56.224823"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}