-
Notifications
You must be signed in to change notification settings - Fork 124
Open
Description
Hi all,
I'm developing a universally compatible CPU/GPU monitoring tool, and for Intel GPU support on Linux I'm testing Sysman.
The data I want to get:
- GPU usage
- VRAM bandwidth
- memory use
- temperature
- fan speed
- power draw
- PCIe throughput
My system configuration is:
- Intel Arc B580, Intel UHD Graphics 770
- Ubuntu 24.04.4 LTS, kernel 6.14.0-37-generic
- Intel Arc driver version 26.05.37020.3
Unfortunately the majority of counters do not work:
- `zes_engine_handle_t` - `zesDeviceEnumEngineGroups()` returns 0 engines
- `zes_mem_state_t::free` - on B580 always reports the same value as `zes_mem_state_t::size`; on UHD 770 reports a value that is far too small (reports ~half of the memory as occupied, but only ~3GB RAM is actually used)
- `zes_mem_bandwidth_t::maxBandwidth` / `zes_mem_bandwidth_t::readCounter` / `zes_mem_bandwidth_t::writeCounter` / `zes_mem_bandwidth_t::timestamp` - always report 0
- `zes_temp_properties_t::maxTemperature` - always reports 0.0, for all 3 available `zes_temp_handle_t` returned by `zesDeviceEnumTemperatureSensors()`
- `zes_fan_handle_t` - `zesDeviceEnumFans()` returns 0 available fans on Arc B580
- `zes_power_sustained_limit_t::power` - always returns 0
Only a few counters do work:
- `zes_device_properties_t::core.name` - returns the correct GPU name
- `zes_device_properties_t::core.coreClockRate` - returns the correct max GPU clock
- `zes_mem_state_t::size` - the correct GPU memory size is returned
- `zes_power_energy_counter_t::energy` / `zes_power_energy_counter_t::timestamp` - the first 2 of the 4 `zes_pwr_handle_t` on Arc B580 report valid data, the other 2 report 0. How can I tell which of the energy counters is GPU chip / memory / board power draw?
Am I doing something wrong? Do I need to use a newer Linux kernel?
cc @pacampbe
Kind regards,
Moritz
My sample code:
#define MAX_GPUs 32u
// Per-GPU record of the most recently sampled values plus the reference maximum for each value.
// Every get_*() accessor returns the value as a percentage clamped to 0..100;
// all except get_usage() return max_uint when the corresponding maximum is 0.
struct GPU {
	string name = "";
	char vendor = '\0';
	uint usage = 0u; // in %
	uint memory_bandwidth_used = 0u; // in MB/s
	uint memory_bandwidth_max = 0u; // in MB/s
	uint memory_used = 0u; // in MB
	uint memory_max = 0u; // in MB
	uint temperature_used = 0u; // in 'C
	uint temperature_max = 100u; // in 'C
	uint fan_used = 0u; // in RPM
	uint fan_max = 5000u; // in RPM
	uint power_used = 0u; // in W
	uint power_max = 250u; // in W
	uint clock_core_used = 0u; // in MHz
	uint clock_core_max = 0u; // in MHz
	uint clock_memory_used = 0u; // in MHz
	uint clock_memory_max = 0u; // in MHz
	uint pcie_bandwidth_used = 0u; // in MB/s
	uint pcie_bandwidth_max = 0u; // in MB/s
	uint memory_bus_width = 0u; // in bit

	// GPU utilization, in %
	uint get_usage() {
		return clamp(usage, 0u, 100u);
	}
	// memory bandwidth relative to its maximum, in %
	uint get_memory_bandwidth() {
		if(memory_bandwidth_max==0u) return max_uint;
		return clamp(100u*memory_bandwidth_used/memory_bandwidth_max, 0u, 100u);
	}
	// memory occupation relative to memory size, in %
	uint get_memory() {
		if(memory_max==0u) return max_uint;
		return clamp(100u*memory_used/memory_max, 0u, 100u);
	}
	// temperature relative to its maximum, in %
	uint get_temperature() {
		if(temperature_max==0u) return max_uint;
		return clamp(100u*temperature_used/temperature_max, 0u, 100u);
	}
	// fan speed relative to its maximum, in %
	uint get_fan() {
		if(fan_max==0u) return max_uint;
		return clamp(100u*fan_used/fan_max, 0u, 100u);
	}
	// power draw relative to its maximum, in %
	uint get_power() {
		if(power_max==0u) return max_uint;
		return clamp(100u*power_used/power_max, 0u, 100u);
	}
	// core clock relative to its maximum, in %
	uint get_clock_core() {
		if(clock_core_max==0u) return max_uint;
		return clamp(100u*clock_core_used/clock_core_max, 0u, 100u);
	}
	// memory clock relative to its maximum, in %
	uint get_clock_memory() {
		if(clock_memory_max==0u) return max_uint;
		return clamp(100u*clock_memory_used/clock_memory_max, 0u, 100u);
	}
	// PCIe bandwidth relative to its maximum, in %
	uint get_pcie_bandwidth() {
		if(pcie_bandwidth_max==0u) return max_uint;
		return clamp(100u*pcie_bandwidth_used/pcie_bandwidth_max, 0u, 100u);
	}
} gpus[MAX_GPUs];
#include "SYSMAN/include/zes_api.h" // https://github.com/oneapi-src/level-zero/blob/master/include/zes_api.h
// Index in gpus[] of the first GPU registered by gpu_initialize_intel().
uint sysman_gpu_start = 0u;
// Number of GPUs registered by gpu_initialize_intel(); gpu_fetch_intel() does nothing while this is 0.
uint sysman_gpu_number = 0u;
// Sysman driver handles obtained in gpu_initialize_intel() and reused by gpu_fetch_intel().
vector<ze_driver_handle_t> sysman_driver_handles;
// Previous counter samples, indexed like gpus[]; gpu_fetch_intel() subtracts them from the new
// samples to turn the monotonically read counters into rates.
uint64_t sysman_last_active[MAX_GPUs] = {};
uint64_t sysman_last_active_timestamp[MAX_GPUs] = {};
uint64_t sysman_last_readwrite[MAX_GPUs] = {};
uint64_t sysman_last_readwrite_timestamp[MAX_GPUs] = {};
uint64_t sysman_last_energy[MAX_GPUs] = {};
uint64_t sysman_last_energy_timestamp[MAX_GPUs] = {};
void gpu_initialize_intel() {
println("init");
if(zesInit(0)!=ZE_RESULT_SUCCESS) return;
println("init successful");
sysman_gpu_start = gpu_number;
uint g = sysman_gpu_start;
uint sysman_driver_handle_count = 0u;
zesDriverGet(&sysman_driver_handle_count, nullptr);
sysman_driver_handles.resize(sysman_driver_handle_count);
zesDriverGet(&sysman_driver_handle_count, &sysman_driver_handles[0]);
println("drivers: "+to_string(sysman_driver_handle_count));
for(uint s=0u; s<(uint)sysman_driver_handles.size(); s++) {
uint sysman_device_count = 0;
zesDeviceGet(sysman_driver_handles[s], &sysman_device_count, nullptr);
vector<zes_device_handle_t> sysman_device_handle(sysman_device_count);
zesDeviceGet(sysman_driver_handles[s], &sysman_device_count, &sysman_device_handle[0]);
println("devices: "+to_string(sysman_device_count));
for(uint i=0; i<sysman_device_count; i++) {
zes_device_properties_t sysman_device_properties = {};
zesDeviceGetProperties(sysman_device_handle[i], &sysman_device_properties);
gpus[g].name = string(sysman_device_properties.core.name);
gpus[g].vendor = 'I';
gpus[g].clock_core_max = sysman_device_properties.core.coreClockRate;
println(gpus[g].name);
gpus[g].fan_max = 5000u; // no data available
g++;
}
}
sysman_gpu_number = g-sysman_gpu_start;
gpu_number += sysman_gpu_number;
}
void gpu_fetch_intel() {
if(sysman_gpu_number==0u) return;
uint g = sysman_gpu_start;
for(uint s=0u; s<(uint)sysman_driver_handles.size(); s++) {
uint sysman_device_count = 0;
zesDeviceGet(sysman_driver_handles[s], &sysman_device_count, nullptr);
vector<ze_device_handle_t> sysman_device_handle(sysman_device_count);
zesDeviceGet(sysman_driver_handles[s], &sysman_device_count, &sysman_device_handle[0]);
for(uint i=0; i<sysman_device_count; i++) {
println(gpus[g].name);
uint sysman_engine_count = 0u;
zesDeviceEnumEngineGroups(sysman_device_handle[i], &sysman_engine_count, nullptr);
zes_engine_handle_t* sysman_engine_handles = new zes_engine_handle_t[sysman_engine_count];
zesDeviceEnumEngineGroups(sysman_device_handle[i], &sysman_engine_count, sysman_engine_handles);
println("sysman_engine_count: "+to_string(sysman_engine_count));
for(uint j=0u; j<sysman_engine_count; j++) {
zes_engine_properties_t sysman_engine_properties = {};
zesEngineGetProperties(sysman_engine_handles[j], &sysman_engine_properties);
if(sysman_engine_properties.type==ZES_ENGINE_GROUP_ALL) {
zes_engine_stats_t sysman_engine_stats = {};
zesEngineGetActivity(sysman_engine_handles[j], &sysman_engine_stats);
gpus[g].usage = (uint)((sysman_engine_stats.activeTime-sysman_last_active[g])/min(sysman_engine_stats.timestamp-sysman_last_active_timestamp[g], 1ull));
sysman_engine_stats.timestamp;
sysman_last_active[g] = sysman_engine_stats.activeTime;
sysman_last_active_timestamp[g] = sysman_engine_stats.timestamp;
println("sysman_engine_stats.activeTime: "+to_string(sysman_engine_stats.activeTime));
println("sysman_engine_stats.timestamp: "+to_string(sysman_engine_stats.timestamp));
}
}
uint sysman_mem_count = 0u;
zesDeviceEnumMemoryModules(sysman_device_handle[i], &sysman_mem_count, nullptr);
zes_mem_handle_t* sysman_mem_handles = new zes_mem_handle_t[sysman_mem_count];
zesDeviceEnumMemoryModules(sysman_device_handle[i], &sysman_mem_count, sysman_mem_handles);
println("sysman_mem_count: "+to_string(sysman_mem_count));
for(uint j=0u; j<min(sysman_mem_count, 1u); j++) {
zes_mem_state_t sysman_mem_state = {};
zes_mem_bandwidth_t sysman_mem_bandwidth = {};
zesMemoryGetState(sysman_mem_handles[j], &sysman_mem_state);
zesMemoryGetBandwidth(sysman_mem_handles[j], &sysman_mem_bandwidth);
gpus[g].memory_max = (uint)(sysman_mem_state.size/1000000ull);
gpus[g].memory_used = (uint)((sysman_mem_state.size-sysman_mem_state.free)/1000000ull);
gpus[g].memory_bandwidth_used = (uint)(((sysman_mem_bandwidth.readCounter+sysman_mem_bandwidth.writeCounter-sysman_last_readwrite[g]))/max(sysman_mem_bandwidth.timestamp-sysman_last_readwrite_timestamp[g], 1ull));
gpus[g].memory_bandwidth_max = (uint)(sysman_mem_bandwidth.maxBandwidth/1000000ull);
sysman_last_readwrite[g] = sysman_mem_bandwidth.readCounter+sysman_mem_bandwidth.writeCounter;
sysman_last_readwrite_timestamp[g] = sysman_mem_bandwidth.timestamp;
println("sysman_mem_state.size: "+to_string(sysman_mem_state.size));
println("sysman_mem_state.free: "+to_string(sysman_mem_state.free));
println("sysman_mem_bandwidth.maxBandwidth: "+to_string(sysman_mem_bandwidth.maxBandwidth));
println("sysman_mem_bandwidth.readCounter: "+to_string(sysman_mem_bandwidth.readCounter));
println("sysman_mem_bandwidth.writeCounter: "+to_string(sysman_mem_bandwidth.writeCounter));
println("sysman_mem_bandwidth.timestamp: "+to_string(sysman_mem_bandwidth.timestamp));
}
delete[] sysman_mem_handles;
uint sysman_temp_count = 0u;
zesDeviceEnumTemperatureSensors(sysman_device_handle[i], &sysman_temp_count, nullptr);
zes_temp_handle_t* sysman_temp_handles = new zes_temp_handle_t[sysman_temp_count];
zesDeviceEnumTemperatureSensors(sysman_device_handle[i], &sysman_temp_count, sysman_temp_handles);
println("sysman_temp_count: "+to_string(sysman_temp_count));
for(uint j=0u; j<sysman_temp_count; j++) {
zes_temp_properties_t sysman_temp_properties = {};
zesTemperatureGetProperties(sysman_temp_handles[j], &sysman_temp_properties);
if(sysman_temp_properties.type==ZES_TEMP_SENSORS_GPU) gpus[g].temperature_used = sysman_temp_properties.maxTemperature>0.0 ? to_uint(sysman_temp_properties.maxTemperature) : gpus[g].temperature_used;
println("sysman_temp_properties.maxTemperature: "+to_string(sysman_temp_properties.maxTemperature));
println("sysman_temp_properties.type: "+to_string(sysman_temp_properties.type));
}
delete[] sysman_temp_handles;
uint sysman_fan_count = 0u;
zesDeviceEnumFans(sysman_device_handle[i], &sysman_fan_count, nullptr);
zes_fan_handle_t* sysman_fan_handles = new zes_fan_handle_t[sysman_fan_count];
zesDeviceEnumFans(sysman_device_handle[i], &sysman_fan_count, sysman_fan_handles);
if(sysman_fan_count==0u) gpus[g].fan_max = 0u;
println("sysman_fan_count: "+to_string(sysman_fan_count));
for(uint j=0u; j<sysman_fan_count; j++) {
zes_fan_properties_t sysman_fan_properties = {};
zesFanGetProperties(sysman_fan_handles[j], &sysman_fan_properties);
gpus[g].fan_used = sysman_fan_properties.maxRPM>0 ? (uint)sysman_fan_properties.maxRPM : 0u;
}
delete[] sysman_fan_handles;
uint sysman_pwr_count = 0u;
zesDeviceEnumPowerDomains(sysman_device_handle[i], &sysman_pwr_count, nullptr);
zes_pwr_handle_t* sysman_pwr_handle = new zes_pwr_handle_t[sysman_pwr_count];
zesDeviceEnumPowerDomains(sysman_device_handle[i], &sysman_pwr_count, sysman_pwr_handle);
if(sysman_pwr_count==0u) gpus[g].power_max = 0u;
println("sysman_pwr_count: "+to_string(sysman_pwr_count));
for(uint j=0u; j<min(sysman_pwr_count, 1u); j++) {
zes_power_energy_counter_t sysman_power_energy_counter = {};
zesPowerGetEnergyCounter(sysman_pwr_handle[j], &sysman_power_energy_counter);
gpus[g].power_used = (uint)((sysman_power_energy_counter.energy-sysman_last_energy[g])/max(sysman_power_energy_counter.timestamp-sysman_last_energy_timestamp[g], 1ull));
sysman_last_energy[g] = sysman_power_energy_counter.energy;
sysman_last_energy_timestamp[g] = sysman_power_energy_counter.timestamp;
zes_power_sustained_limit_t sysman_power_sustained_limit = {};
zes_power_burst_limit_t sysman_power_burst_limit = {};
zes_power_peak_limit_t sysman_power_peak_limit = {};
zesPowerGetLimits(sysman_pwr_handle[j], &sysman_power_sustained_limit, &sysman_power_burst_limit, &sysman_power_peak_limit);
gpus[g].power_max = sysman_power_sustained_limit.power/1000u;
println("sysman_power_energy_counter.energy: "+to_string(sysman_power_energy_counter.energy));
println("sysman_power_energy_counter.timestamp: "+to_string(sysman_power_energy_counter.timestamp));
println("sysman_power_sustained_limit.power: "+to_string(sysman_power_sustained_limit.power));
}
delete[] sysman_pwr_handle;
zes_device_properties_t sysman_device_properties = {};
zesDeviceGetProperties(sysman_device_handle[i], &sysman_device_properties);
gpus[g].clock_core_used = sysman_device_properties.core.coreClockRate;
g++;
}
}
}Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels