azuredatastudio/extensions/arc/notebooks/arcDataServices/content/postgres/tsg100-troubleshoot-postgres.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "TSG100 - The Azure Arc-enabled PostgreSQL Hyperscale troubleshooter\n",
    "===================================================================\n",
    "\n",
    "Description\n",
    "-----------\n",
    "\n",
    "Follow these steps to troubleshoot an Azure Arc-enabled PostgreSQL\n",
    "Hyperscale Server.\n",
    "\n",
    "Steps\n",
    "-----\n",
    "\n",
    "### Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# The server to troubleshoot. If not provided\n",
    "# the user will be prompted to select a server.\n",
    "namespace = os.environ.get('POSTGRES_SERVER_NAMESPACE')\n",
    "name = os.environ.get('POSTGRES_SERVER_NAME')\n",
    "\n",
    "tail_lines = 50"
   ]
  },
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "### Common functions\n",
    "\n",
    "Define helper functions used in this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "hide_input"
    ]
   },
   "outputs": [],
   "source": [
    "# Define `run` function for transient fault handling, suggestions on error, and scrolling updates on Windows\n",
    "import sys\n",
    "import os\n",
    "import re\n",
    "import platform\n",
    "import shlex\n",
    "import shutil\n",
    "import datetime\n",
    "\n",
    "from subprocess import Popen, PIPE\n",
    "from IPython.display import Markdown\n",
    "\n",
    "retry_hints = {} # Output in stderr known to be transient, therefore automatically retry\n",
    "error_hints = {} # Output in stderr where a known SOP/TSG exists which will be HINTed for further help\n",
    "install_hint = {} # The SOP to help install the executable if it cannot be found\n",
    "\n",
    "def run(cmd, return_output=False, no_output=False, retry_count=0, base64_decode=False, return_as_json=False):\n",
    "    \"\"\"Run shell command, stream stdout, print stderr and optionally return output\n",
    "\n",
    "    NOTES:\n",
    "\n",
    "    1.  Commands that need this kind of ' quoting on Windows e.g.:\n",
    "\n",
    "            kubectl get nodes -o jsonpath={.items[?(@.metadata.annotations.pv-candidate=='data-pool')].metadata.name}\n",
    "\n",
    "        Need to actually pass in as '\"':\n",
    "\n",
    "            kubectl get nodes -o jsonpath={.items[?(@.metadata.annotations.pv-candidate=='\"'data-pool'\"')].metadata.name}\n",
    "\n",
    "        The ' quote approach, although correct when pasting into Windows cmd, will hang at the line:\n",
    "        \n",
    "            `iter(p.stdout.readline, b'')`\n",
    "\n",
    "        The shlex.split call does the right thing for each platform, just use the '\"' pattern for a '\n",
    "    \"\"\"\n",
    "    MAX_RETRIES = 5\n",
    "    output = \"\"\n",
    "    retry = False\n",
    "\n",
    "    # When running `azdata sql query` on Windows, replace any \\n in \"\"\" strings, with \" \", otherwise we see:\n",
    "    #\n",
    "    #    ('HY090', '[HY090] [Microsoft][ODBC Driver Manager] Invalid string or buffer length (0) (SQLExecDirectW)')\n",
    "    #\n",
    "    if platform.system() == \"Windows\" and cmd.startswith(\"azdata sql query\"):\n",
    "        cmd = cmd.replace(\"\\n\", \" \")\n",
    "\n",
    "    # shlex.split is required on bash and for Windows paths with spaces\n",
    "    #\n",
    "    cmd_actual = shlex.split(cmd)\n",
    "\n",
    "    # Store this (i.e. kubectl, python etc.) to support binary context aware error_hints and retries\n",
    "    #\n",
    "    user_provided_exe_name = cmd_actual[0].lower()\n",
    "\n",
    "    # When running python, use the python in the ADS sandbox ({sys.executable})\n",
    "    #\n",
    "    if cmd.startswith(\"python \"):\n",
    "        cmd_actual[0] = cmd_actual[0].replace(\"python\", sys.executable)\n",
    "\n",
    "        # On Mac, when ADS is not launched from terminal, LC_ALL may not be set, which causes pip installs to fail\n",
    "        # with:\n",
    "        #\n",
    "        #    UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 4969: ordinal not in range(128)\n",
    "        #\n",
    "        # Setting it to a default value of \"en_US.UTF-8\" enables pip install to complete\n",
    "        #\n",
    "        if platform.system() == \"Darwin\" and \"LC_ALL\" not in os.environ:\n",
    "            os.environ[\"LC_ALL\"] = \"en_US.UTF-8\"\n",
    "\n",
    "    # When running `kubectl`, if AZDATA_OPENSHIFT is set, use `oc`\n",
    "    #\n",
    "    if cmd.startswith(\"kubectl \") and \"AZDATA_OPENSHIFT\" in os.environ:\n",
    "        cmd_actual[0] = cmd_actual[0].replace(\"kubectl\", \"oc\")\n",
    "\n",
    "    # To aid supportability, determine which binary file will actually be executed on the machine\n",
    "    #\n",
    "    which_binary = None\n",
    "\n",
    "    # Special case for CURL on Windows.  The version of CURL in Windows System32 does not work to\n",
    "    # get JWT tokens, it returns \"(56) Failure when receiving data from the peer\".  If another instance\n",
    "    # of CURL exists on the machine use that one.  (Unfortunately the curl.exe in System32 is almost\n",
    "    # always the first curl.exe in the path, and it can't be uninstalled from System32, so here we\n",
    "    # look for the 2nd installation of CURL in the path)\n",
    "    if platform.system() == \"Windows\" and cmd.startswith(\"curl \"):\n",
    "        path = os.getenv('PATH')\n",
    "        for p in path.split(os.path.pathsep):\n",
    "            p = os.path.join(p, \"curl.exe\")\n",
    "            if os.path.exists(p) and os.access(p, os.X_OK):\n",
    "                if p.lower().find(\"system32\") == -1:\n",
    "                    cmd_actual[0] = p\n",
    "                    which_binary = p\n",
    "                    break\n",
    "\n",
    "    # Find the path based location (shutil.which) of the executable that will be run (and display it to aid supportability), this\n",
    "    # seems to be required for .msi installs of azdata.cmd/az.cmd.  (otherwise Popen returns FileNotFound) \n",
    "    #\n",
    "    # NOTE: Bash needs cmd to be the list of the space separated values hence shlex.split.\n",
    "    #\n",
    "    if which_binary == None:\n",
    "        which_binary = shutil.which(cmd_actual[0])\n",
    "\n",
    "    # Display an install HINT, so the user can click on a SOP to install the missing binary\n",
    "    #\n",
    "    if which_binary == None:\n",
    "        print(f\"The path used to search for '{cmd_actual[0]}' was:\")\n",
    "        print(sys.path)\n",
    "\n",
    "        if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
    "            display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
    "\n",
    "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\")\n",
    "    else:   \n",
    "        cmd_actual[0] = which_binary\n",
    "\n",
    "    start_time = datetime.datetime.now().replace(microsecond=0)\n",
    "\n",
    "    print(f\"START: {cmd} @ {start_time} ({datetime.datetime.utcnow().replace(microsecond=0)} UTC)\")\n",
    "    print(f\"       using: {which_binary} ({platform.system()} {platform.release()} on {platform.machine()})\")\n",
    "    print(f\"       cwd: {os.getcwd()}\")\n",
    "\n",
    "    # Command-line tools such as CURL and AZDATA HDFS commands output\n",
    "    # scrolling progress bars, which causes Jupyter to hang forever, to\n",
    "    # workaround this, use no_output=True\n",
    "    #\n",
    "\n",
    "    # Work around a infinite hang when a notebook generates a non-zero return code, break out, and do not wait\n",
    "    #\n",
    "    wait = True \n",
    "\n",
    "    try:\n",
    "        if no_output:\n",
    "            p = Popen(cmd_actual)\n",
    "        else:\n",
    "            p = Popen(cmd_actual, stdout=PIPE, stderr=PIPE, bufsize=1)\n",
    "            with p.stdout:\n",
    "                for line in iter(p.stdout.readline, b''):\n",
    "                    line = line.decode()\n",
    "                    if return_output:\n",
    "                        output = output + line\n",
    "                    else:\n",
    "                        if cmd.startswith(\"azdata notebook run\"): # Hyperlink the .ipynb file\n",
    "                            regex = re.compile('  \"(.*)\"\\: \"(.*)\"') \n",
    "                            match = regex.match(line)\n",
    "                            if match:\n",
    "                                if match.group(1).find(\"HTML\") != -1:\n",
    "                                    display(Markdown(f' - \"{match.group(1)}\": \"{match.group(2)}\"'))\n",
    "                                else:\n",
    "                                    display(Markdown(f' - \"{match.group(1)}\": \"[{match.group(2)}]({match.group(2)})\"'))\n",
    "\n",
    "                                    wait = False\n",
    "                                    break # otherwise infinite hang, have not worked out why yet.\n",
    "                        else:\n",
    "                            print(line, end='')\n",
    "\n",
    "        if wait:\n",
    "            p.wait()\n",
    "    except FileNotFoundError as e:\n",
    "        if install_hint is not None:\n",
    "            display(Markdown(f'HINT: Use {install_hint} to resolve this issue.'))\n",
    "\n",
    "        raise FileNotFoundError(f\"Executable '{cmd_actual[0]}' not found in path (where/which)\") from e\n",
    "\n",
    "    exit_code_workaround = 0 # WORKAROUND: azdata hangs on exception from notebook on p.wait()\n",
    "\n",
    "    if not no_output:\n",
    "        for line in iter(p.stderr.readline, b''):\n",
    "            try:\n",
    "                line_decoded = line.decode()\n",
    "            except UnicodeDecodeError:\n",
    "                # NOTE: Sometimes we get characters back that cannot be decoded(), e.g.\n",
    "                #\n",
    "                #   \\xa0\n",
    "                #\n",
    "                # For example see this in the response from `az group create`:\n",
    "                #\n",
    "                # ERROR: Get Token request returned http error: 400 and server \n",
    "                # response: {\"error\":\"invalid_grant\",# \"error_description\":\"AADSTS700082: \n",
    "                # The refresh token has expired due to inactivity.\\xa0The token was \n",
    "                # issued on 2018-10-25T23:35:11.9832872Z\n",
    "                #\n",
    "                # which generates the exception:\n",
    "                #\n",
    "                # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 179: invalid start byte\n",
    "                #\n",
    "                print(\"WARNING: Unable to decode stderr line, printing raw bytes:\")\n",
    "                print(line)\n",
    "                line_decoded = \"\"\n",
    "                pass\n",
    "            else:\n",
    "\n",
    "                # azdata emits a single empty line to stderr when doing an hdfs cp, don't\n",
    "                # print this empty \"ERR:\" as it confuses.\n",
    "                #\n",
    "                if line_decoded == \"\":\n",
    "                    continue\n",
    "                \n",
    "                print(f\"STDERR: {line_decoded}\", end='')\n",
    "\n",
    "                if line_decoded.startswith(\"An exception has occurred\") or line_decoded.startswith(\"ERROR: An error occurred while executing the following cell\"):\n",
    "                    exit_code_workaround = 1\n",
    "\n",
    "                # inject HINTs to next TSG/SOP based on output in stderr\n",
    "                #\n",
    "                if user_provided_exe_name in error_hints:\n",
    "                    for error_hint in error_hints[user_provided_exe_name]:\n",
    "                        if line_decoded.find(error_hint[0]) != -1:\n",
    "                            display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
    "\n",
    "                # Verify if a transient error, if so automatically retry (recursive)\n",
    "                #\n",
    "                if user_provided_exe_name in retry_hints:\n",
    "                    for retry_hint in retry_hints[user_provided_exe_name]:\n",
    "                        if line_decoded.find(retry_hint) != -1:\n",
    "                            if retry_count \u003c MAX_RETRIES:\n",
    "                                print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
    "                                retry_count = retry_count + 1\n",
    "                                output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
    "\n",
    "                                if return_output:\n",
    "                                    if base64_decode:\n",
    "                                        import base64\n",
    "                                        return base64.b64decode(output).decode('utf-8')\n",
    "                                    else:\n",
    "                                        return output\n",
    "\n",
    "    elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
    "\n",
    "    # WORKAROUND: We avoid infinite hang above in the `azdata notebook run` failure case, by inferring success (from stdout output), so\n",
    "    # don't wait here, if success known above\n",
    "    #\n",
    "    if wait: \n",
    "        if p.returncode != 0:\n",
    "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(p.returncode)}.\\n')\n",
    "    else:\n",
    "        if exit_code_workaround !=0 :\n",
    "            raise SystemExit(f'Shell command:\\n\\n\\t{cmd} ({elapsed}s elapsed)\\n\\nreturned non-zero exit code: {str(exit_code_workaround)}.\\n')\n",
    "\n",
    "    print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
    "\n",
    "    if return_output:\n",
    "        if base64_decode:\n",
    "            import base64\n",
    "            return base64.b64decode(output).decode('utf-8')\n",
    "        else:\n",
    "            return output\n",
    "\n",
    "\n",
    "\n",
    "# Hints for tool retry (on transient fault), known errors and install guide\n",
    "#\n",
    "retry_hints = {}\n",
    "error_hints = {}\n",
    "install_hint = {}\n",
    "\n",
    "\n",
    "print('Common functions defined successfully.')"
   ]
  },
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "### Get Postgres server"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "hide_input"
    ]
   },
   "outputs": [],
   "source": [
    "# Sets the 'server' variable to the spec of the Postgres server\n",
    "\n",
    "import math\n",
    "import json\n",
    "\n",
    "# If a server was provided, get it\n",
    "if namespace and name:\n",
    "    server = json.loads(run(f'kubectl get postgresqls -n {namespace} {name} -o json', return_output=True))\n",
    "else:\n",
    "    # Otherwise prompt the user to select a server\n",
    "    servers = json.loads(run(f'kubectl get postgresqls --all-namespaces -o json', return_output=True))['items']\n",
    "    if not servers:\n",
    "        raise Exception('No Postgres servers found')\n",
    "\n",
    "    full_name = lambda s: f'{s[\"metadata\"][\"namespace\"]}.{s[\"metadata\"][\"name\"]}'\n",
    "    servers.sort(key=lambda s: full_name(s))\n",
    "\n",
    "    pad = math.floor(math.log10(len(servers)) + 1) + 3\n",
    "    for i, s in enumerate(servers):\n",
    "        print(f'{f\"[{i+1}]\":\u003c{pad}}{full_name(s)}')\n",
    "\n",
    "    while True:\n",
    "        try:\n",
    "            i = int(input('Enter the index of a server'))\n",
    "        except ValueError:\n",
    "            continue\n",
    "\n",
    "        if i \u003e= 1 and i \u003c= len(servers):\n",
    "            server = servers[i-1]\n",
    "            namespace = server['metadata']['namespace']\n",
    "            name = server['metadata']['name']\n",
    "            break\n",
    "\n",
    "display(Markdown(f'#### Got server {namespace}.{name}'))"
   ]
  },
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "### Summarize all resources"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "uid = server['metadata']['uid']\n",
    "\n",
    "display(Markdown(f'#### Server summary'))\n",
    "run(f'kubectl get postgresqls -n {namespace} {name}')\n",
    "\n",
    "display(Markdown(f'#### Resource summary'))\n",
    "run(f'kubectl get sts,pods,pvc,svc,ep -n {namespace} -l postgresqls.arcdata.microsoft.com/cluster-id={uid}')"
   ]
  },
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "### Troubleshoot the server"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "display(Markdown(f'#### Troubleshooting server {namespace}.{name}'))\n",
    "run(f'kubectl describe postgresqls -n {namespace} {name}')"
   ]
  },
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "### Troubleshoot the pods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "pods = json.loads(run(f'kubectl get pods -n {namespace} -l postgresqls.arcdata.microsoft.com/cluster-id={uid} -o json', return_output=True))['items']\n",
    "\n",
    "# Summarize and describe each pod\n",
    "for pod in pods:\n",
    "    pod_name = pod['metadata']['name']\n",
    "    pod_phase = pod['status']['phase']\n",
    "    con_statuses = pod['status'].get('containerStatuses', [])\n",
    "    num_con_ready = sum(1 for c in con_statuses if c['ready'])\n",
    "    num_cons = len(pod['spec']['containers'])\n",
    "    num_restarts = sum(c['restartCount'] for c in con_statuses)\n",
    "\n",
    "    display(Markdown(f'#### Troubleshooting pod {namespace}.{pod_name}\\n'\n",
    "                     f'#### {pod_phase} with {num_con_ready}/{num_cons} '\n",
    "                     f'containers ready and {num_restarts} restarts'))\n",
    "\n",
    "    run(f'kubectl describe pod -n {namespace} {pod_name}')"
   ]
  },
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "### Troubleshoot the containers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Summarize and get logs from each container\n",
    "for pod in pods:\n",
    "    pod_name = pod['metadata']['name']\n",
    "    cons = pod['spec']['containers']\n",
    "    con_statuses = pod['status'].get('containerStatuses', [])\n",
    "    display(Markdown(f'#### Troubleshooting {len(cons)} container{\"\" if len(cons) \u003c 2 else \"s\"} '\n",
    "                     f'containers for pod {namespace}.{pod_name}'))\n",
    "\n",
    "    for i, con in enumerate(cons):\n",
    "        con_name = con['name']\n",
    "        con_status = next((c for c in con_statuses if c['name'] == con_name), {})\n",
    "        con_started = con_status.get('started', False)\n",
    "        con_ready = con_status.get('ready', False)\n",
    "        con_restarts = con_status.get('restartCount', 0)\n",
    "\n",
    "        display(Markdown(f'#### Troubleshooting container {namespace}.{pod_name}/{con_name} ({i+1}/{len(cons)})\\n'\n",
    "                         f'#### {\"R\" if con_ready else \"Not r\"}eady with {con_restarts} restarts'))\n",
    "\n",
    "        run(f'kubectl logs -n {namespace} {pod_name} {con_name} --tail {tail_lines}')\n",
    "\n",
    "        # Get logs from the previous terminated container if one exists\n",
    "        if con_restarts \u003e 0:\n",
    "            display(Markdown(f'#### Logs from previous terminated container {namespace}.{pod_name}/{con_name}'))\n",
    "            run(f'kubectl logs -n {namespace} {pod_name} {con_name} --tail {tail_lines} --previous')"
   ]
  },
  {
   "cell_type": "markdown",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "### Troubleshoot the PersistentVolumeClaims"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "display(Markdown(f'#### Troubleshooting PersistentVolumeClaims'))\n",
    "run(f'kubectl describe pvc -n {namespace} -l postgresqls.arcdata.microsoft.com/cluster-id={uid}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(\"Notebook execution is complete.\")"
   ]
  }
 ],
 "nbformat": 4,
 "nbformat_minor": 5,
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "pansop": {
   "related": "",
   "test": {
    "strategy": "",
    "types": null,
    "disable": {
     "reason": "",
     "workitems": null,
     "types": null
    }
   },
   "target": {
    "current": "public",
    "final": "public"
   },
   "internal": {
    "parameters": null,
    "symlink": false
   },
   "timeout": "0"
  },
  "language_info": {
   "codemirror_mode": "{ Name: \"\", Version: \"\"}",
   "file_extension": "",
   "mimetype": "",
   "name": "",
   "nbconvert_exporter": "",
   "pygments_lexer": "",
   "version": ""
  },
  "widgets": []
 }
}