fix for nvidia GPU load at 0% but reported as critical
We run a number of machines with nvidia GPU cards and notice that when the load is reported at 0% - which can happen when the machine is idle - it's flagged as critical in Observium. The fix is as follows:
In the ./includes/polling/unix-agent/nvidia.inc.php file change the utilization.gpu and utilization.memory discover_sensor calls from:
['limit_high' => 100, 'limit_low' => 0]
To:
['limit_high' => 100]
This is what the file looks like:
<?php /** * Observium * * This file is part of Observium. * * @package observium * @subpackage poller * @copyright (C) Adam Armstrong * */
// Poller include: turns the nvidia-smi CSV delivered by the unix-agent into
// temperature, power and load sensors, one set per GPU card.
//
// Reads:  $agent_data['nvidia']['smi'], $device (both supplied by the including scope).
// Writes: $agent_sensors[<class>]['nvidia-smi'][<index>] with the current reading.
global $agent_sensors;

// Run only when the agent delivered nvidia-smi data and parse_csv() returned a truthy result.
if (!safe_empty($agent_data['nvidia']['smi']) && $nvidia = parse_csv($agent_data['nvidia']['smi'])) {

    // Field values that stand for "no reading"; a field holding one of these is skipped.
    $invalid = [ '[Not Supported]', 'N/A', '[N/A]' ];

    print_cli_heading("nvidia-smi", 3);

    foreach ($nvidia as $card) {

        // Cards are shown 1-based in descriptions, while sensor indexes keep the raw card index.
        $descr_card = "Nvidia Card " . ((int)$card['index'] + 1) . ": " . $card['name'];
        print_cli_heading($descr_card, 4);

        // GPU temperature.
        // in_array() is called in strict mode throughout so that a numeric 0 can never
        // loosely compare equal to one of the placeholder strings.
        if (!in_array($card['temperature.gpu'], $invalid, true)) {
            $index = 'temperature.gpu.' . $card['index'];
            $descr = $descr_card;

            discover_sensor('temperature', $device, '', $index, 'nvidia-smi', $descr, 1, $card['temperature.gpu'], ['limit_high' => 100], 'agent');
            $agent_sensors['temperature']['nvidia-smi'][$index] = ['description' => $descr, 'current' => $card['temperature.gpu'], 'index' => $index];

            print_cli_data("temperature.gpu", $card['temperature.gpu'] . "C");
        }

        // Power draw (no limits are registered for this sensor).
        if (!in_array($card['power.draw [W]'], $invalid, true)) {
            $index = 'power.draw.' . $card['index'];
            $descr = $descr_card;

            discover_sensor('power', $device, '', $index, 'nvidia-smi', $descr, 1, $card['power.draw [W]'], [], 'agent');
            $agent_sensors['power']['nvidia-smi'][$index] = ['description' => $descr, 'current' => $card['power.draw [W]'], 'index' => $index];

            print_cli_data("power.draw", $card['power.draw [W]'] . "W");
        }

        // Fan speed, in percent.
        // No 'limit_low' on purpose: with a low limit of 0 a reading of 0% was flagged
        // as critical, and 0% is a legitimate value here just as it is for the load sensors.
        if (!in_array($card['fan.speed [%]'], $invalid, true)) {
            $index = 'fan.speed.' . $card['index'];
            $descr = $descr_card . " Fan Load";

            discover_sensor('load', $device, '', $index, 'nvidia-smi', $descr, 1, $card['fan.speed [%]'], ['limit_high' => 100], 'agent');
            $agent_sensors['load']['nvidia-smi'][$index] = ['description' => $descr, 'current' => $card['fan.speed [%]'], 'index' => $index];

            print_cli_data("fan.speed", $card['fan.speed [%]'] . "");
        }

        // GPU utilization, in percent.
        // No 'limit_low': an idle GPU reports 0%, which must not be treated as critical.
        if (!in_array($card['utilization.gpu [%]'], $invalid, true)) {
            $index = 'utilization.gpu.' . $card['index'];
            $descr = $descr_card . " GPU Load";

            discover_sensor('load', $device, '', $index, 'nvidia-smi', $descr, 1, $card['utilization.gpu [%]'], ['limit_high' => 100], 'agent');
            $agent_sensors['load']['nvidia-smi'][$index] = ['description' => $descr, 'current' => $card['utilization.gpu [%]'], 'index' => $index];

            print_cli_data("utilization.gpu", $card['utilization.gpu [%]'] . "");
        }

        // Memory utilization, in percent.
        // No 'limit_low', for the same reason as GPU utilization.
        if (!in_array($card['utilization.memory [%]'], $invalid, true)) {
            $index = 'utilization.memory.' . $card['index'];
            $descr = $descr_card . " Memory Load";

            discover_sensor('load', $device, '', $index, 'nvidia-smi', $descr, 1, $card['utilization.memory [%]'], ['limit_high' => 100], 'agent');
            $agent_sensors['load']['nvidia-smi'][$index] = ['description' => $descr, 'current' => $card['utilization.memory [%]'], 'index' => $index];

            print_cli_data("utilization.memory", $card['utilization.memory [%]'] . "");
        }
    }

    echo "\n";
}

// EOF
Thanks
Chris
Update - you can also remove the "'limit_low' => 0" entry from the fan.speed discover_sensor call.
[cid:image001.png@01DCCDAF.259BF160] Chris James Technical Lead, Vessel Production Systems 4 The Heights, Brooklands Weybridge, KT13 0NY Mobile: +44 7768 480 557 chris.james@tgs.com
From: Chris James Sent: 16 April 2026 14:39 To: observium@lists.observium.org Subject: fix for nvidia GPU load at 0% but reported as critical
We run a number of machines with nvidia GPU cards and notice that when the load is reported at 0% - which can happen when the machine is idle - it's flagged as critical in Observium. The fix is as follows:
In the ./includes/polling/unix-agent/nvidia.inc.php file change the utilization.gpu and utilization.memory discover_sensor calls from:
['limit_high' => 100, 'limit_low' => 0]
To:
['limit_high' => 100]
This is what the file looks like:
<?php /** * Observium * * This file is part of Observium. * * @package observium * @subpackage poller * @copyright (C) Adam Armstrong * */
// Poller include: turns the nvidia-smi CSV delivered by the unix-agent into
// temperature, power and load sensors, one set per GPU card.
//
// Reads:  $agent_data['nvidia']['smi'], $device (both supplied by the including scope).
// Writes: $agent_sensors[<class>]['nvidia-smi'][<index>] with the current reading.
global $agent_sensors;

// Run only when the agent delivered nvidia-smi data and parse_csv() returned a truthy result.
if (!safe_empty($agent_data['nvidia']['smi']) && $nvidia = parse_csv($agent_data['nvidia']['smi'])) {

    // Field values that stand for "no reading"; a field holding one of these is skipped.
    $invalid = [ '[Not Supported]', 'N/A', '[N/A]' ];

    print_cli_heading("nvidia-smi", 3);

    foreach ($nvidia as $card) {

        // Cards are shown 1-based in descriptions, while sensor indexes keep the raw card index.
        $descr_card = "Nvidia Card " . ((int)$card['index'] + 1) . ": " . $card['name'];
        print_cli_heading($descr_card, 4);

        // GPU temperature.
        // in_array() is called in strict mode throughout so that a numeric 0 can never
        // loosely compare equal to one of the placeholder strings.
        if (!in_array($card['temperature.gpu'], $invalid, true)) {
            $index = 'temperature.gpu.' . $card['index'];
            $descr = $descr_card;

            discover_sensor('temperature', $device, '', $index, 'nvidia-smi', $descr, 1, $card['temperature.gpu'], ['limit_high' => 100], 'agent');
            $agent_sensors['temperature']['nvidia-smi'][$index] = ['description' => $descr, 'current' => $card['temperature.gpu'], 'index' => $index];

            print_cli_data("temperature.gpu", $card['temperature.gpu'] . "C");
        }

        // Power draw (no limits are registered for this sensor).
        if (!in_array($card['power.draw [W]'], $invalid, true)) {
            $index = 'power.draw.' . $card['index'];
            $descr = $descr_card;

            discover_sensor('power', $device, '', $index, 'nvidia-smi', $descr, 1, $card['power.draw [W]'], [], 'agent');
            $agent_sensors['power']['nvidia-smi'][$index] = ['description' => $descr, 'current' => $card['power.draw [W]'], 'index' => $index];

            print_cli_data("power.draw", $card['power.draw [W]'] . "W");
        }

        // Fan speed, in percent.
        // No 'limit_low' on purpose: with a low limit of 0 a reading of 0% was flagged
        // as critical, and 0% is a legitimate value here just as it is for the load sensors.
        if (!in_array($card['fan.speed [%]'], $invalid, true)) {
            $index = 'fan.speed.' . $card['index'];
            $descr = $descr_card . " Fan Load";

            discover_sensor('load', $device, '', $index, 'nvidia-smi', $descr, 1, $card['fan.speed [%]'], ['limit_high' => 100], 'agent');
            $agent_sensors['load']['nvidia-smi'][$index] = ['description' => $descr, 'current' => $card['fan.speed [%]'], 'index' => $index];

            print_cli_data("fan.speed", $card['fan.speed [%]'] . "");
        }

        // GPU utilization, in percent.
        // No 'limit_low': an idle GPU reports 0%, which must not be treated as critical.
        if (!in_array($card['utilization.gpu [%]'], $invalid, true)) {
            $index = 'utilization.gpu.' . $card['index'];
            $descr = $descr_card . " GPU Load";

            discover_sensor('load', $device, '', $index, 'nvidia-smi', $descr, 1, $card['utilization.gpu [%]'], ['limit_high' => 100], 'agent');
            $agent_sensors['load']['nvidia-smi'][$index] = ['description' => $descr, 'current' => $card['utilization.gpu [%]'], 'index' => $index];

            print_cli_data("utilization.gpu", $card['utilization.gpu [%]'] . "");
        }

        // Memory utilization, in percent.
        // No 'limit_low', for the same reason as GPU utilization.
        if (!in_array($card['utilization.memory [%]'], $invalid, true)) {
            $index = 'utilization.memory.' . $card['index'];
            $descr = $descr_card . " Memory Load";

            discover_sensor('load', $device, '', $index, 'nvidia-smi', $descr, 1, $card['utilization.memory [%]'], ['limit_high' => 100], 'agent');
            $agent_sensors['load']['nvidia-smi'][$index] = ['description' => $descr, 'current' => $card['utilization.memory [%]'], 'index' => $index];

            print_cli_data("utilization.memory", $card['utilization.memory [%]'] . "");
        }
    }

    echo "\n";
}

// EOF
Thanks
Chris
participants (1)
-
Chris James