Skip to content

Most counters in Sysman are broken #434

@ProjectPhysX

Description

@ProjectPhysX

Hi all,

I'm developing a universally compatible CPU/GPU monitoring tool, and for Intel GPU support on Linux I'm testing Sysman.

The data I want to get:

  • GPU usage
  • VRAM bandwidth
  • memory use
  • temperature
  • fan speed
  • power draw
  • PCIe throughput

My system configuration is:

  • Intel Arc B580, Intel UHD Graphics 770
  • Ubuntu 24.04.4 LTS, kernel 6.14.0-37-generic
  • Intel Arc driver version 26.05.37020.3

Unfortunately the majority of counters do not work:

  • zes_engine_handle_t - zesDeviceEnumEngineGroups() returns 0 engines
  • zes_mem_state_t::free - on B580 always reports same value as zes_mem_state_t::size, on UHD 770 reports way too small value (reports ~half of memory occupied but only ~3GB RAM is actually used)
  • zes_mem_bandwidth_t::maxBandwidth / zes_mem_bandwidth_t::readCounter / zes_mem_bandwidth_t::writeCounter / zes_mem_bandwidth_t::timestamp - always report 0
  • zes_temp_properties_t::maxTemperature - always reports 0.0, for all 3 available zes_temp_handle_t returned by zesDeviceEnumTemperatureSensors()
  • zes_fan_handle_t - zesDeviceEnumFans() returns 0 available fans on Arc B580.
  • zes_power_sustained_limit_t::power - always returns 0

Only a few counters do work:

  • zes_device_properties_t::.core.name - returns correct GPU name
  • zes_device_properties_t::.core.coreClockRate - returns correct max GPU clock
  • zes_mem_state_t::size - correct GPU memory size is returned
  • zes_power_energy_counter_t::energy / zes_power_energy_counter_t::timestamp - the first 2 of the 4 zes_pwr_handle_t on Arc B580 report valid data, the other 2 report 0. How can I tell which of the energy counters is GPU chip / memory / board power draw?

Am I doing something wrong? Do I need to use a newer Linux kernel?
cc @pacampbe

Kind regards,
Moritz

My sample code:

#define MAX_GPUs 32u
// One monitoring record per GPU: raw "used"/"max" readings plus getters that
// turn each pair into a load percentage in [0, 100].
struct GPU {
	string name = "";
	char vendor = (char)0;
	uint usage = 0u; // in %
	uint memory_bandwidth_used=0u, memory_bandwidth_max=0u; // in MB/s
	uint memory_used=0u, memory_max=0u; // in MB
	uint temperature_used=0u, temperature_max=100u; // in 'C
	uint fan_used=0u, fan_max=5000u; // in RPM
	uint power_used=0u, power_max=250u; // in W
	uint clock_core_used=0u, clock_core_max=0u, clock_memory_used=0u, clock_memory_max=0u; // in MHz
	uint pcie_bandwidth_used=0u, pcie_bandwidth_max=0u; // in MB/s
	uint memory_bus_width = 0u; // in bit

	// Shared by all ratio getters: 100*current/limit clamped to [0, 100],
	// or max_uint when no limit is known (limit==0).
	static uint to_percent(const uint current, const uint limit) {
		if(limit==0u) return max_uint;
		return clamp(100u*current/limit, 0u, 100u);
	}

	// All getters below return a value in %.
	uint get_usage() {
		return clamp(usage, 0u, 100u);
	}
	uint get_memory_bandwidth() {
		return to_percent(memory_bandwidth_used, memory_bandwidth_max);
	}
	uint get_memory() {
		return to_percent(memory_used, memory_max);
	}
	uint get_temperature() {
		return to_percent(temperature_used, temperature_max);
	}
	uint get_fan() {
		return to_percent(fan_used, fan_max);
	}
	uint get_power() {
		return to_percent(power_used, power_max);
	}
	uint get_clock_core() {
		return to_percent(clock_core_used, clock_core_max);
	}
	uint get_clock_memory() {
		return to_percent(clock_memory_used, clock_memory_max);
	}
	uint get_pcie_bandwidth() {
		return to_percent(pcie_bandwidth_used, pcie_bandwidth_max);
	}
} gpus[MAX_GPUs];

#include "SYSMAN/include/zes_api.h" // https://github.com/oneapi-src/level-zero/blob/master/include/zes_api.h
// Index of the first gpus[] entry that gpu_initialize_intel() filled in.
uint sysman_gpu_start = 0u;
// Number of gpus[] entries that gpu_initialize_intel() filled in.
uint sysman_gpu_number = 0u;
// Sysman driver handles obtained from zesDriverGet(); reused by gpu_fetch_intel().
vector<ze_driver_handle_t> sysman_driver_handles;

// Previous counter samples, indexed like gpus[]. gpu_fetch_intel() subtracts
// them from the new sample to turn monotonic counters into rates.
uint64_t sysman_last_active[MAX_GPUs] = {}; // engine activeTime
uint64_t sysman_last_active_timestamp[MAX_GPUs] = {}; // timestamp of the engine sample
uint64_t sysman_last_readwrite[MAX_GPUs] = {}; // memory readCounter+writeCounter
uint64_t sysman_last_readwrite_timestamp[MAX_GPUs] = {}; // timestamp of the memory sample
uint64_t sysman_last_energy[MAX_GPUs] = {}; // power-domain energy counter
uint64_t sysman_last_energy_timestamp[MAX_GPUs] = {}; // timestamp of the energy sample
void gpu_initialize_intel() {
	println("init");
	if(zesInit(0)!=ZE_RESULT_SUCCESS) return;
	println("init successful");
	sysman_gpu_start = gpu_number;
	uint g = sysman_gpu_start;
	uint sysman_driver_handle_count = 0u;
	zesDriverGet(&sysman_driver_handle_count, nullptr);
	sysman_driver_handles.resize(sysman_driver_handle_count);
	zesDriverGet(&sysman_driver_handle_count, &sysman_driver_handles[0]);
	println("drivers: "+to_string(sysman_driver_handle_count));
	for(uint s=0u; s<(uint)sysman_driver_handles.size(); s++) {
		uint sysman_device_count = 0;
		zesDeviceGet(sysman_driver_handles[s], &sysman_device_count, nullptr);
		vector<zes_device_handle_t> sysman_device_handle(sysman_device_count);
		zesDeviceGet(sysman_driver_handles[s], &sysman_device_count, &sysman_device_handle[0]);
		println("devices: "+to_string(sysman_device_count));
		for(uint i=0; i<sysman_device_count; i++) {
			
			zes_device_properties_t sysman_device_properties = {};
			zesDeviceGetProperties(sysman_device_handle[i], &sysman_device_properties);
			gpus[g].name = string(sysman_device_properties.core.name);
			gpus[g].vendor = 'I';
			gpus[g].clock_core_max = sysman_device_properties.core.coreClockRate;
			println(gpus[g].name);

			gpus[g].fan_max = 5000u; // no data available
			g++;
		}
	}
	sysman_gpu_number = g-sysman_gpu_start;
	gpu_number += sysman_gpu_number;
}
void gpu_fetch_intel() {
	if(sysman_gpu_number==0u) return;
	uint g = sysman_gpu_start;
	for(uint s=0u; s<(uint)sysman_driver_handles.size(); s++) {
		uint sysman_device_count = 0;
		zesDeviceGet(sysman_driver_handles[s], &sysman_device_count, nullptr);
		vector<ze_device_handle_t> sysman_device_handle(sysman_device_count);
		zesDeviceGet(sysman_driver_handles[s], &sysman_device_count, &sysman_device_handle[0]);
		for(uint i=0; i<sysman_device_count; i++) {
			println(gpus[g].name);
			
			uint sysman_engine_count = 0u;
			zesDeviceEnumEngineGroups(sysman_device_handle[i], &sysman_engine_count, nullptr);
			zes_engine_handle_t* sysman_engine_handles = new zes_engine_handle_t[sysman_engine_count];
			zesDeviceEnumEngineGroups(sysman_device_handle[i], &sysman_engine_count, sysman_engine_handles);
			println("sysman_engine_count: "+to_string(sysman_engine_count));
			for(uint j=0u; j<sysman_engine_count; j++) {
				zes_engine_properties_t sysman_engine_properties = {};
				zesEngineGetProperties(sysman_engine_handles[j], &sysman_engine_properties);
				if(sysman_engine_properties.type==ZES_ENGINE_GROUP_ALL) {
					zes_engine_stats_t sysman_engine_stats = {};
					zesEngineGetActivity(sysman_engine_handles[j], &sysman_engine_stats);
					gpus[g].usage = (uint)((sysman_engine_stats.activeTime-sysman_last_active[g])/min(sysman_engine_stats.timestamp-sysman_last_active_timestamp[g], 1ull));
					sysman_engine_stats.timestamp;
					sysman_last_active[g] = sysman_engine_stats.activeTime;
					sysman_last_active_timestamp[g] = sysman_engine_stats.timestamp;
					println("sysman_engine_stats.activeTime: "+to_string(sysman_engine_stats.activeTime));
					println("sysman_engine_stats.timestamp: "+to_string(sysman_engine_stats.timestamp));
				}
			}

			uint sysman_mem_count = 0u;
			zesDeviceEnumMemoryModules(sysman_device_handle[i], &sysman_mem_count, nullptr);
			zes_mem_handle_t* sysman_mem_handles = new zes_mem_handle_t[sysman_mem_count];
			zesDeviceEnumMemoryModules(sysman_device_handle[i], &sysman_mem_count, sysman_mem_handles);
			println("sysman_mem_count: "+to_string(sysman_mem_count));
			for(uint j=0u; j<min(sysman_mem_count, 1u); j++) {
				zes_mem_state_t sysman_mem_state = {};
				zes_mem_bandwidth_t sysman_mem_bandwidth = {};
				zesMemoryGetState(sysman_mem_handles[j], &sysman_mem_state);
				zesMemoryGetBandwidth(sysman_mem_handles[j], &sysman_mem_bandwidth);
				gpus[g].memory_max = (uint)(sysman_mem_state.size/1000000ull);
				gpus[g].memory_used = (uint)((sysman_mem_state.size-sysman_mem_state.free)/1000000ull);
				gpus[g].memory_bandwidth_used = (uint)(((sysman_mem_bandwidth.readCounter+sysman_mem_bandwidth.writeCounter-sysman_last_readwrite[g]))/max(sysman_mem_bandwidth.timestamp-sysman_last_readwrite_timestamp[g], 1ull));
				gpus[g].memory_bandwidth_max = (uint)(sysman_mem_bandwidth.maxBandwidth/1000000ull);
				sysman_last_readwrite[g] = sysman_mem_bandwidth.readCounter+sysman_mem_bandwidth.writeCounter;
				sysman_last_readwrite_timestamp[g] = sysman_mem_bandwidth.timestamp;
				println("sysman_mem_state.size: "+to_string(sysman_mem_state.size));
				println("sysman_mem_state.free: "+to_string(sysman_mem_state.free));
				println("sysman_mem_bandwidth.maxBandwidth: "+to_string(sysman_mem_bandwidth.maxBandwidth));
				println("sysman_mem_bandwidth.readCounter: "+to_string(sysman_mem_bandwidth.readCounter));
				println("sysman_mem_bandwidth.writeCounter: "+to_string(sysman_mem_bandwidth.writeCounter));
				println("sysman_mem_bandwidth.timestamp: "+to_string(sysman_mem_bandwidth.timestamp));
			}
			delete[] sysman_mem_handles;
			
			uint sysman_temp_count = 0u;
			zesDeviceEnumTemperatureSensors(sysman_device_handle[i], &sysman_temp_count, nullptr);
			zes_temp_handle_t* sysman_temp_handles = new zes_temp_handle_t[sysman_temp_count];
			zesDeviceEnumTemperatureSensors(sysman_device_handle[i], &sysman_temp_count, sysman_temp_handles);
			println("sysman_temp_count: "+to_string(sysman_temp_count));
			for(uint j=0u; j<sysman_temp_count; j++) {
				zes_temp_properties_t sysman_temp_properties = {};
				zesTemperatureGetProperties(sysman_temp_handles[j], &sysman_temp_properties);
				if(sysman_temp_properties.type==ZES_TEMP_SENSORS_GPU) gpus[g].temperature_used = sysman_temp_properties.maxTemperature>0.0 ? to_uint(sysman_temp_properties.maxTemperature) : gpus[g].temperature_used;
				println("sysman_temp_properties.maxTemperature: "+to_string(sysman_temp_properties.maxTemperature));
				println("sysman_temp_properties.type: "+to_string(sysman_temp_properties.type));
			}
			delete[] sysman_temp_handles;
			
			uint sysman_fan_count = 0u;
			zesDeviceEnumFans(sysman_device_handle[i], &sysman_fan_count, nullptr);
			zes_fan_handle_t* sysman_fan_handles = new zes_fan_handle_t[sysman_fan_count];
			zesDeviceEnumFans(sysman_device_handle[i], &sysman_fan_count, sysman_fan_handles);
			if(sysman_fan_count==0u) gpus[g].fan_max = 0u;
			println("sysman_fan_count: "+to_string(sysman_fan_count));
			for(uint j=0u; j<sysman_fan_count; j++) {
				zes_fan_properties_t sysman_fan_properties = {};
				zesFanGetProperties(sysman_fan_handles[j], &sysman_fan_properties);
				gpus[g].fan_used = sysman_fan_properties.maxRPM>0 ? (uint)sysman_fan_properties.maxRPM : 0u;
			}
			delete[] sysman_fan_handles;
			
			uint sysman_pwr_count = 0u;
			zesDeviceEnumPowerDomains(sysman_device_handle[i], &sysman_pwr_count, nullptr);
			zes_pwr_handle_t* sysman_pwr_handle = new zes_pwr_handle_t[sysman_pwr_count];
			zesDeviceEnumPowerDomains(sysman_device_handle[i], &sysman_pwr_count, sysman_pwr_handle);
			if(sysman_pwr_count==0u) gpus[g].power_max = 0u;
			println("sysman_pwr_count: "+to_string(sysman_pwr_count));
			for(uint j=0u; j<min(sysman_pwr_count, 1u); j++) {
				zes_power_energy_counter_t sysman_power_energy_counter = {};
				zesPowerGetEnergyCounter(sysman_pwr_handle[j], &sysman_power_energy_counter);
				gpus[g].power_used = (uint)((sysman_power_energy_counter.energy-sysman_last_energy[g])/max(sysman_power_energy_counter.timestamp-sysman_last_energy_timestamp[g], 1ull));
				sysman_last_energy[g] = sysman_power_energy_counter.energy;
				sysman_last_energy_timestamp[g] = sysman_power_energy_counter.timestamp;
				zes_power_sustained_limit_t sysman_power_sustained_limit = {};
				zes_power_burst_limit_t sysman_power_burst_limit = {};
				zes_power_peak_limit_t sysman_power_peak_limit = {};
				zesPowerGetLimits(sysman_pwr_handle[j], &sysman_power_sustained_limit, &sysman_power_burst_limit, &sysman_power_peak_limit);
				gpus[g].power_max = sysman_power_sustained_limit.power/1000u;
				println("sysman_power_energy_counter.energy: "+to_string(sysman_power_energy_counter.energy));
				println("sysman_power_energy_counter.timestamp: "+to_string(sysman_power_energy_counter.timestamp));
				println("sysman_power_sustained_limit.power: "+to_string(sysman_power_sustained_limit.power));
			}
			delete[] sysman_pwr_handle;

			zes_device_properties_t sysman_device_properties = {};
			zesDeviceGetProperties(sysman_device_handle[i], &sysman_device_properties);
			gpus[g].clock_core_used = sysman_device_properties.core.coreClockRate;
			g++;
		}
	}
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions