From 6f8ee325bc204de71d05fd2637bfb0567b799443 Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Fri, 26 Jan 2024 18:11:12 +0100 Subject: [PATCH 01/10] Add sosreport logging --- .github/workflows/integration_test_charm.yaml | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/.github/workflows/integration_test_charm.yaml b/.github/workflows/integration_test_charm.yaml index 5d81fc9b..061fa8a0 100644 --- a/.github/workflows/integration_test_charm.yaml +++ b/.github/workflows/integration_test_charm.yaml @@ -328,6 +328,41 @@ jobs: run: | juju switch test mkdir ~/logs/ + - name: Run SOS reports + if: ${{ failure() && steps.tests.outcome == 'failure' }} + run: | + sudo snap install sosreport --channel=latest/stable --classic + # Needed as sosreport does not like 100+ char long paths + mkdir /tmp/sos + sudo sos report \ + --only-plugins kubernetes,systemd,logs \ + --enable-plugins kubernetes \ + -k kubernetes.describe=true -k kubernetes.podlogs=true -k kubernetes.all=true -k logs.all_logs=true \ + --batch \ + --clean \ + --tmp-dir=./tmp/sos \ + -z gzip + - name: Run SOS in LXCs if Needed + if: ${{ inputs.cloud == 'lxd' && (failure() && steps.tests.outcome == 'failure') }} + run: | + juju exec --parallel=true --all -- sudo snap install sosreport --channel=latest/stable --classic + sudo snap install jq + export NODES=$(juju status --format=json | jq -r '.machines[]|."ip-addresses"[0]' | paste -s -d, -) + sudo sos collect \ + -i ~/.local/share/juju/ssh/juju_id_rsa --ssh-user ubuntu \ + --ssh-user ubuntu --nodes "$NODES" \ + --only-plugins systemd,logs \ + logs.all_logs=true \ + --batch \ + --clean \ + --tmp-dir=./tmp/sos \ + -z gzip + - name: Prepare upload - local reports + if: ${{ failure() && steps.tests.outcome == 'failure' }} + run: | + I="$(whoami)" + sudo -e chown -R "$I" /tmp/sos + mv /tmp/sos/* ~/logs/ - name: juju status if: ${{ success() || (failure() && steps.tests.outcome == 'failure') }} run: juju status --color --relations | tee 
~/logs/juju-status.txt From 4c8d1ba671abb7765b362cfc953c58198d3a08a6 Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Fri, 26 Jan 2024 19:16:00 +0100 Subject: [PATCH 02/10] Add fixes to sosreport cmd --- .github/workflows/integration_test_charm.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration_test_charm.yaml b/.github/workflows/integration_test_charm.yaml index 061fa8a0..2574b549 100644 --- a/.github/workflows/integration_test_charm.yaml +++ b/.github/workflows/integration_test_charm.yaml @@ -340,7 +340,7 @@ jobs: -k kubernetes.describe=true -k kubernetes.podlogs=true -k kubernetes.all=true -k logs.all_logs=true \ --batch \ --clean \ - --tmp-dir=./tmp/sos \ + --tmp-dir=/tmp/sos \ -z gzip - name: Run SOS in LXCs if Needed if: ${{ inputs.cloud == 'lxd' && (failure() && steps.tests.outcome == 'failure') }} @@ -350,18 +350,16 @@ jobs: export NODES=$(juju status --format=json | jq -r '.machines[]|."ip-addresses"[0]' | paste -s -d, -) sudo sos collect \ -i ~/.local/share/juju/ssh/juju_id_rsa --ssh-user ubuntu \ - --ssh-user ubuntu --nodes "$NODES" \ + --nodes "$(juju status --format=json | jq -r '.machines[]|."ip-addresses"[0]' | paste -s -d, -)" \ --only-plugins systemd,logs \ - logs.all_logs=true \ + -k logs.all_logs=true \ --batch \ --clean \ - --tmp-dir=./tmp/sos \ + --tmp-dir=/tmp/sos \ -z gzip - name: Prepare upload - local reports if: ${{ failure() && steps.tests.outcome == 'failure' }} run: | - I="$(whoami)" - sudo -e chown -R "$I" /tmp/sos mv /tmp/sos/* ~/logs/ - name: juju status if: ${{ success() || (failure() && steps.tests.outcome == 'failure') }} From fa34cd80ccee7a0ab0983a9791c2ed5ab44d189f Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Fri, 26 Jan 2024 19:50:08 +0100 Subject: [PATCH 03/10] removing all_logs option --- .github/workflows/integration_test_charm.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/integration_test_charm.yaml 
b/.github/workflows/integration_test_charm.yaml index 2574b549..ee3c19fd 100644 --- a/.github/workflows/integration_test_charm.yaml +++ b/.github/workflows/integration_test_charm.yaml @@ -337,7 +337,7 @@ jobs: sudo sos report \ --only-plugins kubernetes,systemd,logs \ --enable-plugins kubernetes \ - -k kubernetes.describe=true -k kubernetes.podlogs=true -k kubernetes.all=true -k logs.all_logs=true \ + -k kubernetes.describe=true -k kubernetes.podlogs=true -k kubernetes.all=true \ --batch \ --clean \ --tmp-dir=/tmp/sos \ @@ -352,7 +352,6 @@ jobs: -i ~/.local/share/juju/ssh/juju_id_rsa --ssh-user ubuntu \ --nodes "$(juju status --format=json | jq -r '.machines[]|."ip-addresses"[0]' | paste -s -d, -)" \ --only-plugins systemd,logs \ - -k logs.all_logs=true \ --batch \ --clean \ --tmp-dir=/tmp/sos \ From 0a6f10c532cd83e6189e8f9f1ac94fae6cda57aa Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Fri, 26 Jan 2024 23:13:16 +0100 Subject: [PATCH 04/10] Add space check and permission management --- .github/workflows/integration_test_charm.yaml | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_test_charm.yaml b/.github/workflows/integration_test_charm.yaml index ee3c19fd..cbe958b2 100644 --- a/.github/workflows/integration_test_charm.yaml +++ b/.github/workflows/integration_test_charm.yaml @@ -341,13 +341,18 @@ jobs: --batch \ --clean \ --tmp-dir=/tmp/sos \ - -z gzip + -z gzip -j 1 - name: Run SOS in LXCs if Needed if: ${{ inputs.cloud == 'lxd' && (failure() && steps.tests.outcome == 'failure') }} run: | juju exec --parallel=true --all -- sudo snap install sosreport --channel=latest/stable --classic sudo snap install jq export NODES=$(juju status --format=json | jq -r '.machines[]|."ip-addresses"[0]' | paste -s -d, -) + echo "Found nodes: $NODES" + + echo "Total space before running command:" + sudo df -h + sudo sos collect \ -i ~/.local/share/juju/ssh/juju_id_rsa --ssh-user ubuntu \ --nodes "$(juju status 
--format=json | jq -r '.machines[]|."ip-addresses"[0]' | paste -s -d, -)" \ @@ -355,11 +360,19 @@ jobs: --batch \ --clean \ --tmp-dir=/tmp/sos \ - -z gzip + -z gzip -j 1 - name: Prepare upload - local reports if: ${{ failure() && steps.tests.outcome == 'failure' }} run: | + I=$(whoami) + sudo chown -R $I /tmp/sos/ mv /tmp/sos/* ~/logs/ + + - name: Print kernel messages + if: ${{ failure() }} + run: | + sudo dmesg + - name: juju status if: ${{ success() || (failure() && steps.tests.outcome == 'failure') }} run: juju status --color --relations | tee ~/logs/juju-status.txt From 02d87e7cca976f0cb9922e97a1606badd60a3b31 Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Sat, 27 Jan 2024 09:48:58 +0100 Subject: [PATCH 05/10] Fix sos report cmd and add --no-local to collect, as we are already collecting more data from localhost --- .github/workflows/integration_test_charm.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration_test_charm.yaml b/.github/workflows/integration_test_charm.yaml index cbe958b2..29b9d9c8 100644 --- a/.github/workflows/integration_test_charm.yaml +++ b/.github/workflows/integration_test_charm.yaml @@ -341,7 +341,7 @@ jobs: --batch \ --clean \ --tmp-dir=/tmp/sos \ - -z gzip -j 1 + -z gzip - name: Run SOS in LXCs if Needed if: ${{ inputs.cloud == 'lxd' && (failure() && steps.tests.outcome == 'failure') }} run: | @@ -354,7 +354,7 @@ jobs: sudo df -h sudo sos collect \ - -i ~/.local/share/juju/ssh/juju_id_rsa --ssh-user ubuntu \ + -i ~/.local/share/juju/ssh/juju_id_rsa --ssh-user ubuntu --no-local \ --nodes "$(juju status --format=json | jq -r '.machines[]|."ip-addresses"[0]' | paste -s -d, -)" \ --only-plugins systemd,logs \ --batch \ From 127e315f202edd50b5a471cb6d0a90035496195d Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Tue, 30 Jan 2024 10:50:55 +0100 Subject: [PATCH 06/10] Add integration tests --- .github/workflows/integration_test_charm.yaml | 9 +++++---- 1 file changed, 5 
insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration_test_charm.yaml b/.github/workflows/integration_test_charm.yaml index 29b9d9c8..6a97fcc8 100644 --- a/.github/workflows/integration_test_charm.yaml +++ b/.github/workflows/integration_test_charm.yaml @@ -335,8 +335,8 @@ jobs: # Needed as sosreport does not like 100+ char long paths mkdir /tmp/sos sudo sos report \ - --only-plugins kubernetes,systemd,logs \ - --enable-plugins kubernetes \ + --only-plugins kubernetes,systemd,logs,juju \ + --enable-plugins kubernetes,juju \ -k kubernetes.describe=true -k kubernetes.podlogs=true -k kubernetes.all=true \ --batch \ --clean \ @@ -355,8 +355,9 @@ jobs: sudo sos collect \ -i ~/.local/share/juju/ssh/juju_id_rsa --ssh-user ubuntu --no-local \ - --nodes "$(juju status --format=json | jq -r '.machines[]|."ip-addresses"[0]' | paste -s -d, -)" \ + --nodes "$NODES" \ --only-plugins systemd,logs \ + -k logs.all_logs=true \ --batch \ --clean \ --tmp-dir=/tmp/sos \ @@ -366,7 +367,7 @@ jobs: run: | I=$(whoami) sudo chown -R $I /tmp/sos/ - mv /tmp/sos/* ~/logs/ + mv /tmp/sos/*.tar.gz ~/logs/ - name: Print kernel messages if: ${{ failure() }} From 5a34e1a1ff1b6e2ad864464ec488964a56311734 Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Tue, 30 Jan 2024 10:55:01 +0100 Subject: [PATCH 07/10] Fix lint --- .github/workflows/integration_test_charm.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_test_charm.yaml b/.github/workflows/integration_test_charm.yaml index 6a97fcc8..53ffea45 100644 --- a/.github/workflows/integration_test_charm.yaml +++ b/.github/workflows/integration_test_charm.yaml @@ -347,7 +347,8 @@ jobs: run: | juju exec --parallel=true --all -- sudo snap install sosreport --channel=latest/stable --classic sudo snap install jq - export NODES=$(juju status --format=json | jq -r '.machines[]|."ip-addresses"[0]' | paste -s -d, -) + export NODES + NODES="$(juju status --format=json | jq -r 
'.machines[]|."ip-addresses"[0]' | paste -s -d, -)" echo "Found nodes: $NODES" echo "Total space before running command:" @@ -365,8 +366,8 @@ jobs: - name: Prepare upload - local reports if: ${{ failure() && steps.tests.outcome == 'failure' }} run: | - I=$(whoami) - sudo chown -R $I /tmp/sos/ + I="$(whoami)" + sudo chown -R "$I" /tmp/sos/ mv /tmp/sos/*.tar.gz ~/logs/ - name: Print kernel messages From ccd83ce1920f2da92acad6c912b05077bd41036c Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Wed, 31 Jan 2024 10:51:01 +0100 Subject: [PATCH 08/10] Add documentation for the log structure --- .github/workflows/integration_test_charm.yaml | 2 +- README.md | 3 +- TROUBLESHOOTING_CHARMS.md | 58 +++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 TROUBLESHOOTING_CHARMS.md diff --git a/.github/workflows/integration_test_charm.yaml b/.github/workflows/integration_test_charm.yaml index 53ffea45..c90fd213 100644 --- a/.github/workflows/integration_test_charm.yaml +++ b/.github/workflows/integration_test_charm.yaml @@ -357,7 +357,7 @@ jobs: sudo sos collect \ -i ~/.local/share/juju/ssh/juju_id_rsa --ssh-user ubuntu --no-local \ --nodes "$NODES" \ - --only-plugins systemd,logs \ + --only-plugins systemd,logs,juju \ -k logs.all_logs=true \ --batch \ --clean \ diff --git a/README.md b/README.md index cb65932a..a36fb43b 100644 --- a/README.md +++ b/README.md @@ -58,4 +58,5 @@ Note: all workflows in this repository share a version number. If a breaking cha If you do not want to use Renovate, pin to the latest major version (e.g. `v1`). 
## Contributing -See [CONTRIBUTING.md](CONTRIBUTING.md) \ No newline at end of file +See [CONTRIBUTING.md](CONTRIBUTING.md) +See [TROUBLESHOOTING_CHARMS.md](TROUBLESHOOTING_CHARMS.md) diff --git a/TROUBLESHOOTING_CHARMS.md b/TROUBLESHOOTING_CHARMS.md new file mode 100644 index 00000000..6442d404 --- /dev/null +++ b/TROUBLESHOOTING_CHARMS.md @@ -0,0 +1,58 @@ +Whenever a test fails, data-platform-workflows will capture that run's logs using [sosreport](https://github.com/sosreport/sos). + +The logs can be downloaded from the run's "Summary" page. + +The sosreport is run on the actual runner and captures logs from the host itself as well as the model's containers (LXC / k8s). + +# Log structure + +``` +/ +| ++---- juju-debug-log.txt: captured at the end of the run +| ++---- juju-status.txt: captured at the end of the run +| ++---- sos-collector-... +| ++---- sosreport-... +``` + +## GitHub Runner logs + +The tarball `sosreport-` contains all the host logs. It will hold its syslog, journal and kernel logs. + +Relevant logs: +* /var/log/{kern,syslog}.log: OS-related logs, including kernel +* /sos_commands/kubernetes/: logs related to the k8s infra and its pods +* /sos_commands/logs/: journalctl outputs + +## LXC logs + +The workflow also runs `sos collect` against each of the LXC containers, if they are available in the model. + +The goal is to collect system-level logs of the containers, as well as juju's. + +These logs will be in the `sos-collector-...` tarball. In that tarball, each container will have its own `sosreport-...`. + +Each tarball will contain a subset of the logs mentioned in the previous section (since logs such as kern.log or k8s +do not make sense within LXC containers). + +# Missing any extra logs? + +If any logs are missing, e.g. logs in specific folders of /var/snap, then the steps are: +1) Extend or add a new plugin to the sosreport +2) Add it as an extra plugin (if needed) to the `integration_test_charm.yaml`. 
+ +It is important not only that the sosreport PR has been merged upstream, but also that the change has made its way into the +sosreport's official snap and the [packages in Ubuntu](https://packages.ubuntu.com/search?suite=all&arch=any&searchon=names&keywords=sosreport). + +# Notes + +It is important to state these commands are run at the end of the test, and only if it fails; therefore, if a container has been +created and destroyed during the test, it will not show in the sosreports. However, juju debug logs will contain every +log exchanged with the controller, and hence, even the history of destroyed units. + +If the syslog file has no recent logs, then check the /sos_commands/logs for the journalctl outputs. Normally, they will +correspond to the same logs but journalctl may be more up-to-date. + From fbb900c0b7c62a4418c703166e28ffe429e0768b Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Thu, 1 Feb 2024 08:37:27 +0100 Subject: [PATCH 09/10] Add a check for empty model before sos collect --- .github/workflows/integration_test_charm.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/integration_test_charm.yaml b/.github/workflows/integration_test_charm.yaml index c90fd213..57ed6259 100644 --- a/.github/workflows/integration_test_charm.yaml +++ b/.github/workflows/integration_test_charm.yaml @@ -345,6 +345,11 @@ jobs: - name: Run SOS in LXCs if Needed if: ${{ inputs.cloud == 'lxd' && (failure() && steps.tests.outcome == 'failure') }} run: | + if [ -z $(sudo lxc list -f csv | wc -l) ]; then + echo "No containers available, nothing to collect logs for..." 
+ exit 0 + fi + + juju exec --parallel=true --all -- sudo snap install sosreport --channel=latest/stable --classic sudo snap install jq export NODES From d53a0853edcd8649891a7fb323301a5b4fa275f5 Mon Sep 17 00:00:00 2001 From: Pedro Guimaraes Date: Thu, 1 Feb 2024 08:46:33 +0100 Subject: [PATCH 10/10] Add double-quotes --- .github/workflows/integration_test_charm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_charm.yaml b/.github/workflows/integration_test_charm.yaml index 57ed6259..dbad6b61 100644 --- a/.github/workflows/integration_test_charm.yaml +++ b/.github/workflows/integration_test_charm.yaml @@ -345,7 +345,7 @@ jobs: - name: Run SOS in LXCs if Needed if: ${{ inputs.cloud == 'lxd' && (failure() && steps.tests.outcome == 'failure') }} run: | - if [ -z $(sudo lxc list -f csv | wc -l) ]; then + if [ "$(sudo lxc list -f csv | wc -l)" -eq 0 ]; then echo "No containers available, nothing to collect logs for..." exit 0 fi