Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions libkineto/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,39 @@ if(KINETO_BUILD_TESTS)
add_subdirectory(test)
add_subdirectory("${LIBKINETO_THIRDPARTY_DIR}/googletest")
endif()

# Detect the installed MUSA toolkit version by running `musa_toolkits_version`
# and parsing the "version": "X.Y.Z" field from its output.
#
# Results:
#   REAL_MUSA_VERSION (CMake variable)      - X*10000 + Y*100 + Z, or 0 when the
#                                             tool fails or cannot be parsed.
#   REAL_MUSA_VERSION (compile definition)  - added only when parsing succeeds;
#                                             the C++ sources test it with
#                                             `defined(REAL_MUSA_VERSION)`.
execute_process(
  COMMAND musa_toolkits_version
  OUTPUT_VARIABLE MUSA_VERSION_OUTPUT
  RESULT_VARIABLE MUSA_VERSION_RESULT
  OUTPUT_STRIP_TRAILING_WHITESPACE
)

# Default for every failure path below; overwritten on a successful parse.
set(REAL_MUSA_VERSION 0)

if(MUSA_VERSION_RESULT EQUAL 0)
  message(STATUS "MUSA toolkits found")
  message(VERBOSE "Raw output: ${MUSA_VERSION_OUTPUT}")

  # Parse the MUSA toolkit version.
  if(MUSA_VERSION_OUTPUT MATCHES "\"version\"[ \t]*:[ \t]*\"[ \t]*([0-9]+)[ \t]*\\.[ \t]*([0-9]+)[ \t]*\\.[ \t]*([0-9]+)[ \t]*\"")
    set(MUSA_VERSION_MAJOR ${CMAKE_MATCH_1})
    set(MUSA_VERSION_MINOR ${CMAKE_MATCH_2})
    set(MUSA_VERSION_PATCH ${CMAKE_MATCH_3})

    # Encode as a single integer, e.g. 4.3.2 -> 4 * 10000 + 3 * 100 + 2 = 40302.
    math(EXPR REAL_MUSA_VERSION_INT
      "${MUSA_VERSION_MAJOR} * 10000 + ${MUSA_VERSION_MINOR} * 100 + ${MUSA_VERSION_PATCH}")

    set(REAL_MUSA_VERSION ${REAL_MUSA_VERSION_INT})

    # add_compile_definitions() supersedes the legacy add_definitions() and
    # takes the definition without the -D prefix.
    add_compile_definitions(REAL_MUSA_VERSION=${REAL_MUSA_VERSION_INT})

    message(STATUS "MUSA version: ${MUSA_VERSION_MAJOR}.${MUSA_VERSION_MINOR}.${MUSA_VERSION_PATCH} -> ${REAL_MUSA_VERSION_INT}")
  else()
    message(WARNING "Failed to parse MUSA version from output")
  endif()
else()
  message(WARNING "musa_toolkits_version command failed with result: ${MUSA_VERSION_RESULT}")
endif()
1 change: 1 addition & 0 deletions libkineto/include/ActivityType.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ enum class ActivityType {
OVERHEAD, // CUPTI induced overhead events sampled from its overhead API.
MTIA_RUNTIME, // host side MTIA runtime events
MTIA_CCP_EVENTS, // MTIA ondevice CCP events
MTIA_INSIGHT, // MTIA Insight Events
CUDA_SYNC, // synchronization events between runtime and kernels

// Optional Activity types
Expand Down
3 changes: 3 additions & 0 deletions libkineto/include/ClientInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ class ClientInterface {
virtual void prepare(bool, bool, bool, bool, bool) = 0;
virtual void start() = 0;
virtual void stop() = 0;
// Memory-profiling hooks. MuptiActivityProfiler::performMemoryLoop() invokes
// them on the registered client in this order: start_memory_profile(), then
// (after sleeping for the requested duration) export_memory_profile(path),
// then stop_memory_profile().
virtual void start_memory_profile() = 0;
virtual void stop_memory_profile() = 0;
// The argument is the output path; the caller passes the request's
// activitiesLogFile() value.
virtual void export_memory_profile(const std::string&) = 0;
};

} // namespace libkineto
12 changes: 12 additions & 0 deletions libkineto/include/Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,14 @@ class Config : public AbstractConfig {
return muptiDeviceBufferPoolLimit_;
}

// True when this config requests memory profiling. scheduleTrace() uses it to
// pick the memory-snapshot thread instead of the regular trace thread.
bool memoryProfilerEnabled() const {
  const bool enabled = memoryProfilerEnabled_;
  return enabled;
}

// Length of a memory-profiling session. The value is forwarded to
// performMemoryLoop(), which sleeps for that many milliseconds.
int profileMemoryDuration() const {
  const int duration = profileMemoryDuration_;
  return duration;
}

void updateActivityProfilerRequestReceivedTime();

void printActivityProfilerConfig(std::ostream& s) const override;
Expand Down Expand Up @@ -506,6 +514,10 @@ class Config : public AbstractConfig {

// MUPTI Timestamp Format
bool useTSCTimestamp_{true};

// Memory Profiler
// Whether the request asks for memory profiling (off by default). When set,
// scheduleTrace() starts the memoryProfilerLoop() thread rather than the
// profilerLoop() thread.
bool memoryProfilerEnabled_{false};
// Duration of a memory-profiling session; performMemoryLoop() sleeps for this
// many milliseconds between starting and exporting the profile. Default 1000.
int profileMemoryDuration_{1000};
};

constexpr char kUseDaemonEnvVar[] = "KINETO_USE_DAEMON";
Expand Down
1 change: 1 addition & 0 deletions libkineto/include/output_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class ActivityLogger {
// Convenience overload: forwards to the two-argument handleTraceStart() with
// an empty metadata map and an empty string.
void handleTraceStart() {
  using MetadataMap = std::unordered_map<std::string, std::string>;
  handleTraceStart(MetadataMap{}, "");
}
// Finalizes a memory-profiling trace. MuptiActivityProfiler::performMemoryLoop()
// calls this last, after the client has exported and stopped the memory
// profile, passing the output path and the config of the request.
virtual void finalizeMemoryTrace(const std::string&, const Config&) = 0;

virtual void finalizeTrace(
const Config& config,
Expand Down
1 change: 1 addition & 0 deletions libkineto/sample_programs/kineto_mupti_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ int main() {
libkineto::ActivityType::OVERHEAD,
libkineto::ActivityType::MTIA_RUNTIME,
libkineto::ActivityType::MTIA_CCP_EVENTS,
libkineto::ActivityType::MTIA_INSIGHT,
libkineto::ActivityType::CUDA_SYNC,
libkineto::ActivityType::GLOW_RUNTIME,
libkineto::ActivityType::CUDA_PROFILER_RANGE,
Expand Down
57 changes: 48 additions & 9 deletions libkineto/src/ActivityProfilerController.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,16 @@ ActivityProfilerController::ActivityProfilerController(
ActivityProfilerController::~ActivityProfilerController() {
configLoader_.removeHandler(
ConfigLoader::ConfigKind::ActivityProfiler, this);
if (profilerThread_) {
// signaling termination of the profiler loop
stopRunloop_ = true;
profilerThread_->join();
delete profilerThread_;
profilerThread_ = nullptr;
for (int thread_type = 0; thread_type < ThreadType::THREAD_MAX_COUNT;
thread_type++) {
std::thread* profilerThread = profilerThreads_[thread_type];
if (profilerThread) {
// signaling termination of the profiler loop
stopRunloop_ = true;
profilerThread->join();
delete profilerThread;
profilerThread = nullptr;
}
}

#if !USE_GOOGLE_LOG
Expand Down Expand Up @@ -234,6 +238,32 @@ void ActivityProfilerController::profilerLoop() {
VLOG(0) << "Exited activity profiling loop";
}

// Run loop of the memory-snapshot thread (started by scheduleTrace() when the
// request has memoryProfilerEnabled()).
//
// Each iteration takes asyncConfigLock_ and, if a pending request exists, the
// profiler is not active and the request asks for memory profiling, it:
//   1. creates the logger for the request,
//   2. copies out the log path, the profiling duration and a clone of the
//      config,
//   3. clears asyncRequestConfig_ so the request is consumed exactly once,
// and then, outside the lock, runs profiler_->performMemoryLoop().
//
// The loop exits once stopRunloop_ is set (done by the destructor before it
// joins the thread).
void ActivityProfilerController::memoryProfilerLoop() {
  // Back-off used when there is nothing to do, so the thread does not spin on
  // a CPU core while waiting for a request.
  constexpr auto kIdlePollInterval = std::chrono::milliseconds(50);

  while (!stopRunloop_) {
    std::string path;
    int profile_time = 0;
    std::unique_ptr<Config> config;

    {
      // asyncRequestConfig_ is only touched under the lock: it may be null
      // (no request pending, or already consumed), so it must not be
      // dereferenced before this check.
      std::lock_guard<std::mutex> lock(asyncConfigLock_);
      if (asyncRequestConfig_ && !profiler_->isActive() &&
          asyncRequestConfig_->memoryProfilerEnabled()) {
        logger_ = makeLogger(*asyncRequestConfig_);
        path = asyncRequestConfig_->activitiesLogFile();
        profile_time = asyncRequestConfig_->profileMemoryDuration();
        config = asyncRequestConfig_->clone();
        asyncRequestConfig_ = nullptr;
      }
    }

    if (!config) {
      // No memory-profiling request was claimed this round; wait before
      // polling again instead of busy-looping.
      std::this_thread::sleep_for(kIdlePollInterval);
      continue;
    }

    profiler_->performMemoryLoop(path, profile_time, logger_.get(), *config);
  }
}

void ActivityProfilerController::step() {
// Do not remove this copy to currentIter. Otherwise count is not guaranteed.
int64_t currentIter = ++iterationCount_;
Expand Down Expand Up @@ -293,9 +323,18 @@ void ActivityProfilerController::scheduleTrace(const Config& config) {
}

// start a profilerLoop() thread to handle request
if (!profilerThread_) {
profilerThread_ =
new std::thread(&ActivityProfilerController::profilerLoop, this);
if (config.memoryProfilerEnabled()) {
auto thread_type = ThreadType::MEMORY_SNAPSHOT;
if (!profilerThreads_[thread_type]) {
profilerThreads_[thread_type] = new std::thread(
&ActivityProfilerController::memoryProfilerLoop, this);
}
} else {
auto thread_type = ThreadType::KINETO;
if (!profilerThreads_[thread_type]) {
profilerThreads_[thread_type] =
new std::thread(&ActivityProfilerController::profilerLoop, this);
}
}
}

Expand Down
8 changes: 7 additions & 1 deletion libkineto/src/ActivityProfilerController.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
#include "LoggerCollector.h"

namespace KINETO_NAMESPACE {
// Identifies the background threads owned by ActivityProfilerController.
// Values are used directly as indices into its profilerThreads_ array, so the
// enum stays unscoped and starts at zero.
enum ThreadType {
  // Thread that runs ActivityProfilerController::profilerLoop().
  KINETO = 0,
  // Thread that runs ActivityProfilerController::memoryProfilerLoop().
  MEMORY_SNAPSHOT,
  // Number of thread kinds above; used for array sizing and loop bounds.
  // Must remain the last entry.
  THREAD_MAX_COUNT,
};

class Config;

Expand Down Expand Up @@ -108,6 +113,7 @@ class ActivityProfilerController : public ConfigLoader::ConfigHandler {
bool shouldActivateTimestampConfig(
const std::chrono::time_point<std::chrono::system_clock>& now);
void profilerLoop();
void memoryProfilerLoop();
void activateConfig(std::chrono::time_point<std::chrono::system_clock> now);

std::unique_ptr<Config> asyncRequestConfig_;
Expand All @@ -116,7 +122,7 @@ class ActivityProfilerController : public ConfigLoader::ConfigHandler {
std::unique_ptr<MuptiActivityProfiler> profiler_;
std::unique_ptr<ActivityLogger> logger_;
std::shared_ptr<LoggerCollector> loggerCollectorFactory_;
std::thread* profilerThread_{nullptr};
std::thread* profilerThreads_[ThreadType::THREAD_MAX_COUNT] = {nullptr};
std::atomic_bool stopRunloop_{false};
std::atomic<std::int64_t> iterationCount_{-1};
ConfigLoader& configLoader_;
Expand Down
1 change: 1 addition & 0 deletions libkineto/src/ActivityType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ static constexpr std::array<ActivityTypeName, activityTypeCount + 1> map{{
{"overhead", ActivityType::OVERHEAD},
{"mtia_runtime", ActivityType::MTIA_RUNTIME},
{"mtia_ccp_events", ActivityType::MTIA_CCP_EVENTS},
{"mtia_insight", ActivityType::MTIA_INSIGHT},
{"cuda_sync", ActivityType::CUDA_SYNC},
{"glow_runtime", ActivityType::GLOW_RUNTIME},
{"musa_profiler_range", ActivityType::CUDA_PROFILER_RANGE},
Expand Down
32 changes: 29 additions & 3 deletions libkineto/src/MuptiActivity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ inline const std::string GpuActivity<MUpti_ActivityKernel6>::metadataJson() cons

inline std::string memcpyName(uint8_t kind, uint8_t src, uint8_t dst) {
return fmt::format(
"Memcpy {} ({} -> {})",
"Memcpy1 {} ({} -> {})",
memcpyKindString((MUpti_ActivityMemcpyKind)kind),
memoryKindString((MUpti_ActivityMemoryKind)src),
memoryKindString((MUpti_ActivityMemoryKind)dst));
Expand All @@ -145,15 +145,15 @@ inline std::string memcpyName(uint8_t kind, uint8_t src, uint8_t dst) {

inline std::string memoryAtomicName(uint8_t kind, uint8_t src, uint8_t dst) {
return fmt::format(
"Memcpy {} ({} -> {})",
"Memcpy2 {} ({} -> {})",
memoryAtomicKindString((MUpti_ActivityMemoryAtomicKind)kind),
memoryKindString((MUpti_ActivityMemoryKind)src),
memoryKindString((MUpti_ActivityMemoryKind)dst));
}

inline std::string memoryAtomicValueName(uint8_t kind, uint8_t dst) {
return fmt::format(
"Memcpy {} ({} -> {})",
"Memcpy3 {} ({})",
memoryAtomicValueKindString((MUpti_ActivityMemoryAtomicValueKind)kind),
memoryKindString((MUpti_ActivityMemoryKind)dst));
}
Expand Down Expand Up @@ -215,6 +215,32 @@ inline const std::string GpuActivity<MUpti_ActivityMemcpy2>::metadataJson() cons
// clang-format on
}

#if defined(REAL_MUSA_VERSION) && (REAL_MUSA_VERSION >= 40306)
// Memory-transfer records are reported under the GPU_MEMCPY activity type.
template<>
inline ActivityType GpuActivity<MUpti_ActivityMemoryTransfer>::type() const {
  const ActivityType kind = ActivityType::GPU_MEMCPY;
  return kind;
}

// Every memory-transfer record gets the same fixed display name.
template<>
inline const std::string GpuActivity<MUpti_ActivityMemoryTransfer>::name() const {
  return std::string("MemTransfer(d2d)");
}

// Builds the JSON metadata fragment for a memory-transfer record: device,
// context, stream and correlation ids, the byte count, and the bandwidth
// derived from the byte count and the record's (end - start) interval.
template<>
inline const std::string GpuActivity<MUpti_ActivityMemoryTransfer>::metadataJson() const {
  const MUpti_ActivityMemoryTransfer& transfer = raw();
  const auto elapsed = transfer.end - transfer.start;
  // clang-format off
  return fmt::format(R"JSON(
"device": {}, "context": {},
"stream": {}, "correlation": {},
"bytes": {}, "memory bandwidth (GB/s)": {})JSON",
      transfer.deviceId, transfer.contextId,
      transfer.streamId, transfer.correlationId,
      transfer.bytes, bandwidth(transfer.bytes, elapsed));
  // clang-format on
}
#endif

template<>
inline ActivityType GpuActivity<MUpti_ActivityMemoryAtomic>::type() const {
return ActivityType::GPU_MEMCPY;
Expand Down
10 changes: 10 additions & 0 deletions libkineto/src/MuptiActivityApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,11 @@ void MuptiActivityApi::enableMuptiActivities(
for (const auto& activity : selected_activities) {
if (activity == ActivityType::GPU_MEMCPY) {
MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_MEMCPY));
MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC));
MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC_VALUE));
#if defined(REAL_MUSA_VERSION) && (REAL_MUSA_VERSION >= 40306)
MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_MEMORY_TRANSFER));
#endif
}
if (activity == ActivityType::GPU_MEMSET) {
MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_MEMSET));
Expand Down Expand Up @@ -357,6 +362,11 @@ void MuptiActivityApi::disableMuptiActivities(
for (const auto& activity : selected_activities) {
if (activity == ActivityType::GPU_MEMCPY) {
MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_MEMCPY));
MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC));
MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC_VALUE));
#if defined(REAL_MUSA_VERSION) && (REAL_MUSA_VERSION >= 40306)
MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_MEMORY_TRANSFER));
#endif
}
if (activity == ActivityType::GPU_MEMSET) {
MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_MEMSET));
Expand Down
28 changes: 28 additions & 0 deletions libkineto/src/MuptiActivityProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -819,13 +819,22 @@ void MuptiActivityProfiler::handleMuptiActivity(
reinterpret_cast<const MUpti_ActivityMemcpy2*>(record), logger);
break;
case MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC:
LOG(INFO) << "handle memory-atomic";
handleGpuActivity(
reinterpret_cast<const MUpti_ActivityMemoryAtomic*>(record), logger);
break;
case MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC_VALUE:
LOG(INFO) << "handle memory-atomic-value";
handleGpuActivity(
reinterpret_cast<const MUpti_ActivityMemoryAtomicValue*>(record), logger);
break;
#if defined(REAL_MUSA_VERSION) && (REAL_MUSA_VERSION >= 40306)
case MUPTI_ACTIVITY_KIND_MEMORY_TRANSFER:
LOG(INFO) << "handle memory-transfer";
handleGpuActivity(
reinterpret_cast<const MUpti_ActivityMemoryTransfer*>(record), logger);
break;
#endif
case MUPTI_ACTIVITY_KIND_MEMSET:
handleGpuActivity(
reinterpret_cast<const MUpti_ActivityMemset*>(record), logger);
Expand Down Expand Up @@ -1233,6 +1242,25 @@ const time_point<system_clock> MuptiActivityProfiler::performRunLoopStep(
return new_wakeup_time;
}

const void MuptiActivityProfiler::performMemoryLoop(
const string& path,
uint32_t profile_time,
ActivityLogger* logger,
Config& config) {
currentRunloopState_ = RunloopState::CollectMemorySnapshot;
if (libkineto::api().client()) {
libkineto::api().client()->start_memory_profile();
LOG(INFO) << "Running memory profiling for " << profile_time << " ms";
std::this_thread::sleep_for(std::chrono::milliseconds(profile_time));
LOG(INFO) << "Exporting memory profiling results to " << path;
libkineto::api().client()->export_memory_profile(path);
libkineto::api().client()->stop_memory_profile();
LOG(INFO) << "Finalizing trace";
logger->finalizeMemoryTrace(path, config);
}
currentRunloopState_ = RunloopState::WaitForRequest;
}

void MuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& logger) {
LOG(INFO) << "CPU Traces Recorded:";
{
Expand Down
12 changes: 11 additions & 1 deletion libkineto/src/MuptiActivityProfiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ class MuptiActivityProfiler {
bool isActive() const {
return currentRunloopState_ != RunloopState::WaitForRequest;
}
bool isCollectingMemorySnapshot() const {
return currentRunloopState_ == RunloopState::CollectMemorySnapshot;
}

// Invoke at a regular interval to perform profiling activities.
// When not active, an interval of 1-5 seconds is probably fine,
Expand All @@ -129,6 +132,12 @@ class MuptiActivityProfiler {
const std::chrono::time_point<std::chrono::system_clock>& nextWakeupTime,
int64_t currentIter = -1);

// Runs one memory-profiling session via the registered client: starts the
// client's memory profile, sleeps for profile_time milliseconds, has the
// client export the profile to path and stop, then calls
// logger->finalizeMemoryTrace(path, config). Does nothing except toggle the
// run-loop state when no client is registered. Blocks for the whole session.
const void performMemoryLoop(
const std::string& path,
uint32_t profile_time,
ActivityLogger* logger,
Config& config);

// Used for async requests
void setLogger(ActivityLogger* logger) {
logger_ = logger;
Expand Down Expand Up @@ -424,7 +433,8 @@ class MuptiActivityProfiler {
WaitForRequest,
Warmup,
CollectTrace,
ProcessTrace
ProcessTrace,
CollectMemorySnapshot,
};

// All recorded trace spans, both CPU and GPU
Expand Down
Loading
Loading