From f3626a5532dbe0d1cf2b0867f41f3e540568a8ee Mon Sep 17 00:00:00 2001
From: Maksym Kutsevol
Date: Mon, 16 Dec 2024 10:06:08 -0800
Subject: [PATCH] Fix fbkpatch override reference script too early

Summary:
Fixes the order of creation for the fbkpatch script and the override that uses it.
Example failures in https://fburl.com/scuba/chef/rykzxxbw

Moves the script to fb_kpatch, so it's closer to the override.

Differential Revision: D67226415

fbshipit-source-id: 0dce8b171ed8656f0416be0453cba85324fe090a
---
 cookbooks/fb_kpatch/files/default/fbkpatch | 105 +++++++++++++++++++++
 cookbooks/fb_kpatch/recipes/default.rb     |  19 +++-
 2 files changed, 119 insertions(+), 5 deletions(-)
 create mode 100755 cookbooks/fb_kpatch/files/default/fbkpatch

diff --git a/cookbooks/fb_kpatch/files/default/fbkpatch b/cookbooks/fb_kpatch/files/default/fbkpatch
new file mode 100755
index 00000000..d47c0061
--- /dev/null
+++ b/cookbooks/fb_kpatch/files/default/fbkpatch
@@ -0,0 +1,105 @@
+#!/bin/bash
+#
+# Execute the kpatch script to load the next kernel live patch hotfix,
+# and log the output of the kpatch script to scribe.
+# This also executes /usr/local/bin/klp_netcons.sh to populate the netconsole
+# dictionary
+
+# You probably want to call kpatch directly
+if [ -z "$1" ] || [ "$1" == "--help" ] || [ "$1" == "help" ] || [ "$1" == "-h" ] ; then
+  echo "$0 is a wrapper for kpatch, used from chef to log results to"
+  echo "kernel_livepatch logview. Please use kpatch directly from the shell."
+  exit 0
+fi
+
+function report_to_scuba {
+  # Submit JSON ($1) to scribe, retrying every second for up to $2 seconds (default 600).
+  # Progress goes to `journalctl -t kpatch.service`; beware, that journal rotates aggressively.
+  local JSON=$1
+  local TIMEOUT=${2:-600}
+  local START_TIME
+  START_TIME=$(/bin/date +%s)
+  echo Attempt to submit data to scribe | systemd-cat -t kpatch.service
+  until echo "$JSON" | /usr/local/bin/scribe_cat --sync --check-non-ok-result errorlog_kpatch; do
+    trying_for_seconds=$(($(/bin/date +%s)-START_TIME))
+    echo Failed to submit data, sleeping. | systemd-cat -t kpatch.service
+    sleep 1;
+    if [ "${trying_for_seconds}" -gt "$TIMEOUT" ]; then
+      echo Timeout reached for data submission. Failing. | systemd-cat -t kpatch.service
+      break
+    fi
+  done
+
+}
+
+# Gather some basic information
+HOSTNAME=$(hostname)
+KVER=$(uname -r)
+TIME=$(/bin/date +%s)
+KLPNETCONS="/usr/local/bin/klp_netcons.sh"
+
+# Get the hardware type.
+. /etc/fbwhoami
+if [ -n "${MODEL_NAME}" ] ; then
+  HARDWARE=${MODEL_NAME}
+else
+  HARDWARE="UNKNOWN"
+fi
+
+# Attempt to load the KLP and get the status of that attempt
+KPATCHOUT=$(/usr/sbin/kpatch "$@" 2>&1)
+STATUS=${PIPESTATUS[0]}
+
+OUTESCAPED=$(echo "$KPATCHOUT" | jq -asR)
+
+HOTFIXES=""
+MODULESDIR="/var/lib/kpatch/${KVER}"
+if [ -d "${MODULESDIR}" ]
+then
+  HOTFIXES=$(modinfo -Fname "${MODULESDIR}"/*ko | sed 's/^.*_hotfix/hotfix/')
+fi
+
+MESSAGE=$(cat << EOF
+{
+  "command": "kpatch $@",
+  "exit_status": "$STATUS",
+  "hardware": "$HARDWARE",
+  "hostname": "$HOSTNAME",
+  "kernel": "$KVER",
+  "time": "$TIME",
+  "output": $OUTESCAPED,
+  "hotfixes": "$HOTFIXES"
+}
+EOF
+)
+
+JSON=$(jq -c -n "$MESSAGE")
+
+# Scuba submission should run in background detached from this process.
+# Because this starts very early during boot and network/scribed is not yet
+# available.
+( # () makes a group (manual: https://fburl.com/n84qccbx). This gives us 2 things we need
+  # here. A separate process to run and io redirection of the whole group.
+  trap '' HUP INT # ignore these signals. We will get them if the parent exits first
+  report_to_scuba "$JSON"
+) </dev/null >/dev/null 2>&1 & # throw out in/out/err (2>&1 must come last). & makes it run in the background
+
+if [ "${STATUS}" -ne "0" ]; then
+  echo kpatch failed with exit status "${STATUS}"
+#  exit "${STATUS}"
+# Pretend the KLP load was successful. If it was not we will
+# try again at the next chef run. KLP load success & failure
+# are monitored separately, and the kernel team is working on
+# reducing the failure rate to the point where we can pass
+# errors to chef again without breaking the fleet.
+fi
+
+# Update the netconsole cmdline dictionary after every successful
+# operation
+
+if [ -x "${KLPNETCONS}" ]
+then
+  "${KLPNETCONS}"
+fi
+
+exit 0
diff --git a/cookbooks/fb_kpatch/recipes/default.rb b/cookbooks/fb_kpatch/recipes/default.rb
index ba31c35a..2fe8097d 100644
--- a/cookbooks/fb_kpatch/recipes/default.rb
+++ b/cookbooks/fb_kpatch/recipes/default.rb
@@ -27,11 +27,6 @@
   action :upgrade
 end
 
-service 'kpatch' do
-  only_if { node['fb_kpatch']['enable'] }
-  action [:enable, :start]
-end
-
 service 'disable kpatch' do
   not_if { node['fb_kpatch']['enable'] }
   service_name 'kpatch'
@@ -47,6 +42,15 @@
     },
   })
 end
+
+# Script to log kpatch results to scribe
+cookbook_file '/usr/local/bin/fbkpatch' do
+  source 'fbkpatch'
+  owner node.root_user
+  group node.root_group
+  mode '0755'
+end
+
 fb_systemd_override 'fbkpatch' do
   unit_name 'kpatch.service'
   content({
@@ -57,3 +61,8 @@
     },
   })
 end
+
+service 'kpatch' do
+  only_if { node['fb_kpatch']['enable'] }
+  action [:enable, :start]
+end