Arc updates for March release (#14970)

* Updated Postgres Spec for where to find engine version, removed calling -ev in edit commands (#14735)

* Added spec.engine.version, took out calling engine version with edit calls

* Added text in wrong place

* missed updates

* PR fix

* Update Arc Postgres troubleshooting notebook

Co-authored-by: Brian Bergeron <brberger@microsoft.com>

* Remove AzdataSession from azdata commands (#14856)

* remove session

* Add in controller-context support

* Revert "Add in controller-context support"

This reverts commit 3b39b968efbf6054041cb01cb2d8443532643a82.

* Add azdataContext to login

* Undo book change

* Undo change correctly

* Add controller context support (#14862)

* remove session

* Add in controller-context support

* Add params to fake

* Fix tests

* Add info and placeholder for controller URL/name (#14887)

* Add info and placeholder for controller URL

* add period + update name

* update memento and allow editing of namespace/URL

* vBump

* vBump

* Fix tests

Co-authored-by: nasc17 <69922333+nasc17@users.noreply.github.com>
Co-authored-by: Brian Bergeron <brian.e.bergeron@gmail.com>
Co-authored-by: Brian Bergeron <brberger@microsoft.com>
This commit is contained in:
Charles Gagnon
2021-04-05 11:47:36 -07:00
committed by GitHub
parent 71b91c3890
commit febd8b29c9
44 changed files with 525 additions and 740 deletions

View File

@@ -2,6 +2,10 @@
- This chapter contains notebooks for troubleshooting Postgres on Azure Arc
## Notebooks in this Chapter
- [TSG100 - The Azure Arc enabled PostgreSQL Hyperscale troubleshooter](tsg100-troubleshoot-postgres.ipynb)
[Home](../readme.md)
## Notebooks in this Chapter
- [TSG100 - The Azure Arc enabled PostgreSQL Hyperscale troubleshooter](../postgres/tsg100-troubleshoot-postgres.ipynb)

View File

@@ -1,7 +0,0 @@
- title: Postgres
url: /postgres/readme
not_numbered: true
expand_sections: true
sections:
- title: TSG100 - The Azure Arc enabled PostgreSQL Hyperscale troubleshooter
url: postgres/tsg100-troubleshoot-postgres

View File

@@ -2,7 +2,11 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"TSG100 - The Azure Arc enabled PostgreSQL Hyperscale troubleshooter\n",
"===================================================================\n",
@@ -35,14 +39,17 @@
"# the user will be prompted to select a server.\n",
"namespace = os.environ.get('POSTGRES_SERVER_NAMESPACE')\n",
"name = os.environ.get('POSTGRES_SERVER_NAME')\n",
"version = os.environ.get('POSTGRES_SERVER_VERSION')\n",
"\n",
"tail_lines = 50"
]
},
{
"cell_type": "markdown",
"metadata": {},
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"### Common functions\n",
"\n",
@@ -63,7 +70,6 @@
"import sys\n",
"import os\n",
"import re\n",
"import json\n",
"import platform\n",
"import shlex\n",
"import shutil\n",
@@ -76,11 +82,7 @@
"error_hints = {} # Output in stderr where a known SOP/TSG exists which will be HINTed for further help\n",
"install_hint = {} # The SOP to help install the executable if it cannot be found\n",
"\n",
"first_run = True\n",
"rules = None\n",
"debug_logging = False\n",
"\n",
"def run(cmd, return_output=False, no_output=False, retry_count=0):\n",
"def run(cmd, return_output=False, no_output=False, retry_count=0, base64_decode=False, return_as_json=False):\n",
" \"\"\"Run shell command, stream stdout, print stderr and optionally return output\n",
"\n",
" NOTES:\n",
@@ -103,13 +105,6 @@
" output = \"\"\n",
" retry = False\n",
"\n",
" global first_run\n",
" global rules\n",
"\n",
" if first_run:\n",
" first_run = False\n",
" rules = load_rules()\n",
"\n",
" # When running `azdata sql query` on Windows, replace any \\n in \"\"\" strings, with \" \", otherwise we see:\n",
" #\n",
" # ('HY090', '[HY090] [Microsoft][ODBC Driver Manager] Invalid string or buffer length (0) (SQLExecDirectW)')\n",
@@ -172,7 +167,12 @@
" if which_binary == None:\n",
" which_binary = shutil.which(cmd_actual[0])\n",
"\n",
" # Display an install HINT, so the user can click on a SOP to install the missing binary\n",
" #\n",
" if which_binary == None:\n",
" print(f\"The path used to search for '{cmd_actual[0]}' was:\")\n",
" print(sys.path)\n",
"\n",
" if user_provided_exe_name in install_hint and install_hint[user_provided_exe_name] is not None:\n",
" display(Markdown(f'HINT: Use [{install_hint[user_provided_exe_name][0]}]({install_hint[user_provided_exe_name][1]}) to resolve this issue.'))\n",
"\n",
@@ -219,8 +219,6 @@
" break # otherwise infinite hang, have not worked out why yet.\n",
" else:\n",
" print(line, end='')\n",
" if rules is not None:\n",
" apply_expert_rules(line)\n",
"\n",
" if wait:\n",
" p.wait()\n",
@@ -276,25 +274,22 @@
" if line_decoded.find(error_hint[0]) != -1:\n",
" display(Markdown(f'HINT: Use [{error_hint[1]}]({error_hint[2]}) to resolve this issue.'))\n",
"\n",
" # apply expert rules (to run follow-on notebooks), based on output\n",
" #\n",
" if rules is not None:\n",
" apply_expert_rules(line_decoded)\n",
"\n",
" # Verify if a transient error, if so automatically retry (recursive)\n",
" #\n",
" if user_provided_exe_name in retry_hints:\n",
" for retry_hint in retry_hints[user_provided_exe_name]:\n",
" if line_decoded.find(retry_hint) != -1:\n",
" if retry_count < MAX_RETRIES:\n",
" if retry_count \u003c MAX_RETRIES:\n",
" print(f\"RETRY: {retry_count} (due to: {retry_hint})\")\n",
" retry_count = retry_count + 1\n",
" output = run(cmd, return_output=return_output, retry_count=retry_count)\n",
"\n",
" if return_output:\n",
" return output\n",
" else:\n",
" return\n",
" if base64_decode:\n",
" import base64\n",
" return base64.b64decode(output).decode('utf-8')\n",
" else:\n",
" return output\n",
"\n",
" elapsed = datetime.datetime.now().replace(microsecond=0) - start_time\n",
"\n",
@@ -311,78 +306,31 @@
" print(f'\\nSUCCESS: {elapsed}s elapsed.\\n')\n",
"\n",
" if return_output:\n",
" return output\n",
"\n",
"def load_json(filename):\n",
" \"\"\"Load a json file from disk and return the contents\"\"\"\n",
"\n",
" with open(filename, encoding=\"utf8\") as json_file:\n",
" return json.load(json_file)\n",
"\n",
"def load_rules():\n",
" \"\"\"Load any 'expert rules' from the metadata of this notebook (.ipynb) that should be applied to the stderr of the running executable\"\"\"\n",
"\n",
" # Load this notebook as json to get access to the expert rules in the notebook metadata.\n",
" #\n",
" try:\n",
" j = load_json(\"tsg100-troubleshoot-postgres.ipynb\")\n",
" except:\n",
" pass # If the user has renamed the book, we can't load ourself. NOTE: Is there a way in Jupyter, to know your own filename?\n",
" else:\n",
" if \"metadata\" in j and \\\n",
" \"azdata\" in j[\"metadata\"] and \\\n",
" \"expert\" in j[\"metadata\"][\"azdata\"] and \\\n",
" \"expanded_rules\" in j[\"metadata\"][\"azdata\"][\"expert\"]:\n",
"\n",
" rules = j[\"metadata\"][\"azdata\"][\"expert\"][\"expanded_rules\"]\n",
"\n",
" rules.sort() # Sort rules, so they run in priority order (the [0] element). Lowest value first.\n",
"\n",
" # print (f\"EXPERT: There are {len(rules)} rules to evaluate.\")\n",
"\n",
" return rules\n",
"\n",
"def apply_expert_rules(line):\n",
" \"\"\"Determine if the stderr line passed in, matches the regular expressions for any of the 'expert rules', if so\n",
" inject a 'HINT' to the follow-on SOP/TSG to run\"\"\"\n",
"\n",
" global rules\n",
"\n",
" for rule in rules:\n",
" notebook = rule[1]\n",
" cell_type = rule[2]\n",
" output_type = rule[3] # i.e. stream or error\n",
" output_type_name = rule[4] # i.e. ename or name \n",
" output_type_value = rule[5] # i.e. SystemExit or stdout\n",
" details_name = rule[6] # i.e. evalue or text \n",
" expression = rule[7].replace(\"\\\\*\", \"*\") # Something escaped *, and put a \\ in front of it!\n",
"\n",
" if debug_logging:\n",
" print(f\"EXPERT: If rule '{expression}' satisfied', run '{notebook}'.\")\n",
"\n",
" if re.match(expression, line, re.DOTALL):\n",
"\n",
" if debug_logging:\n",
" print(\"EXPERT: MATCH: name = value: '{0}' = '{1}' matched expression '{2}', therefore HINT '{4}'\".format(output_type_name, output_type_value, expression, notebook))\n",
"\n",
" match_found = True\n",
"\n",
" display(Markdown(f'HINT: Use [{notebook}]({notebook}) to resolve this issue.'))\n",
" if base64_decode:\n",
" import base64\n",
" return base64.b64decode(output).decode('utf-8')\n",
" else:\n",
" return output\n",
"\n",
"\n",
"\n",
"print('Common functions defined successfully.')\n",
"\n",
"# Hints for binary (transient fault) retry, (known) error and install guide\n",
"# Hints for tool retry (on transient fault), known errors and install guide\n",
"#\n",
"retry_hints = {'kubectl': ['A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond']}\n",
"error_hints = {'kubectl': [['no such host', 'TSG010 - Get configuration contexts', '../monitor-k8s/tsg010-get-kubernetes-contexts.ipynb'], ['No connection could be made because the target machine actively refused it', 'TSG056 - Kubectl fails with No connection could be made because the target machine actively refused it', '../repair/tsg056-kubectl-no-connection-could-be-made.ipynb']]}\n",
"install_hint = {'kubectl': ['SOP036 - Install kubectl command line interface', '../install/sop036-install-kubectl.ipynb']}"
"retry_hints = {}\n",
"error_hints = {}\n",
"install_hint = {}\n",
"\n",
"\n",
"print('Common functions defined successfully.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"### Get Postgres server"
]
@@ -400,10 +348,11 @@
"# Sets the 'server' variable to the spec of the Postgres server\n",
"\n",
"import math\n",
"import json\n",
"\n",
"# If a server was provided, get it\n",
"if namespace and name and version:\n",
" server = json.loads(run(f'kubectl get postgresql-{version} -n {namespace} {name} -o json', return_output=True))\n",
"if namespace and name:\n",
" server = json.loads(run(f'kubectl get postgresqls -n {namespace} {name} -o json', return_output=True))\n",
"else:\n",
" # Otherwise prompt the user to select a server\n",
" servers = json.loads(run(f'kubectl get postgresqls --all-namespaces -o json', return_output=True))['items']\n",
@@ -415,19 +364,18 @@
"\n",
" pad = math.floor(math.log10(len(servers)) + 1) + 3\n",
" for i, s in enumerate(servers):\n",
" print(f'{f\"[{i+1}]\":<{pad}}{full_name(s)}')\n",
" print(f'{f\"[{i+1}]\":\u003c{pad}}{full_name(s)}')\n",
"\n",
" while True:\n",
" try:\n",
" i = int(input('Enter the index of a server to troubleshoot: '))\n",
" i = int(input('Enter the index of a server'))\n",
" except ValueError:\n",
" continue\n",
"\n",
" if i >= 1 and i <= len(servers):\n",
" if i \u003e= 1 and i \u003c= len(servers):\n",
" server = servers[i-1]\n",
" namespace = server['metadata']['namespace']\n",
" name = server['metadata']['name']\n",
" version = server['kind'][len('postgresql-'):]\n",
" break\n",
"\n",
"display(Markdown(f'#### Got server {namespace}.{name}'))"
@@ -435,7 +383,11 @@
},
{
"cell_type": "markdown",
"metadata": {},
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"### Summarize all resources"
]
@@ -443,13 +395,15 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"uid = server['metadata']['uid']\n",
"\n",
"display(Markdown(f'#### Server summary'))\n",
"run(f'kubectl get postgresql-{version} -n {namespace} {name}')\n",
"run(f'kubectl get postgresqls -n {namespace} {name}')\n",
"\n",
"display(Markdown(f'#### Resource summary'))\n",
"run(f'kubectl get sts,pods,pvc,svc,ep -n {namespace} -l postgresqls.arcdata.microsoft.com/cluster-id={uid}')"
@@ -457,7 +411,11 @@
},
{
"cell_type": "markdown",
"metadata": {},
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"### Troubleshoot the server"
]
@@ -465,16 +423,22 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"display(Markdown(f'#### Troubleshooting server {namespace}.{name}'))\n",
"run(f'kubectl describe postgresql-{version} -n {namespace} {name}')"
"run(f'kubectl describe postgresqls -n {namespace} {name}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"### Troubleshoot the pods"
]
@@ -482,7 +446,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"pods = json.loads(run(f'kubectl get pods -n {namespace} -l postgresqls.arcdata.microsoft.com/cluster-id={uid} -o json', return_output=True))['items']\n",
@@ -505,7 +471,11 @@
},
{
"cell_type": "markdown",
"metadata": {},
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"### Troubleshoot the containers"
]
@@ -513,7 +483,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Summarize and get logs from each container\n",
@@ -521,7 +493,7 @@
" pod_name = pod['metadata']['name']\n",
" cons = pod['spec']['containers']\n",
" con_statuses = pod['status'].get('containerStatuses', [])\n",
" display(Markdown(f'#### Troubleshooting {len(cons)} container{\"\" if len(cons) < 2 else \"s\"} '\n",
" display(Markdown(f'#### Troubleshooting {len(cons)} container{\"\" if len(cons) \u003c 2 else \"s\"} '\n",
" f'containers for pod {namespace}.{pod_name}'))\n",
"\n",
" for i, con in enumerate(cons):\n",
@@ -537,14 +509,18 @@
" run(f'kubectl logs -n {namespace} {pod_name} {con_name} --tail {tail_lines}')\n",
"\n",
" # Get logs from the previous terminated container if one exists\n",
" if con_restarts > 0:\n",
" if con_restarts \u003e 0:\n",
" display(Markdown(f'#### Logs from previous terminated container {namespace}.{pod_name}/{con_name}'))\n",
" run(f'kubectl logs -n {namespace} {pod_name} {con_name} --tail {tail_lines} --previous')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"### Troubleshoot the PersistentVolumeClaims"
]
@@ -552,7 +528,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"display(Markdown(f'#### Troubleshooting PersistentVolumeClaims'))\n",
@@ -562,10 +540,12 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"print('Notebook execution complete.')"
"print(\"Notebook execution is complete.\")"
]
}
],
@@ -576,20 +556,36 @@
"name": "python3",
"display_name": "Python 3"
},
"azdata": {
"pansop": {
"related": "",
"test": {
"ci": false,
"gci": false
},
"contract": {
"requires": {
"kubectl": {
"installed": true
}
"strategy": "",
"types": null,
"disable": {
"reason": "",
"workitems": null,
"types": null
}
},
"side_effects": false
}
"target": {
"current": "public",
"final": "public"
},
"internal": {
"parameters": null,
"symlink": false
},
"timeout": "0"
},
"language_info": {
"codemirror_mode": "{ Name: \"\", Version: \"\"}",
"file_extension": "",
"mimetype": "",
"name": "",
"nbconvert_exporter": "",
"pygments_lexer": "",
"version": ""
},
"widgets": []
}
}