diff --git a/libkineto/CMakeLists.txt b/libkineto/CMakeLists.txt index 6862f96..9ff36b5 100644 --- a/libkineto/CMakeLists.txt +++ b/libkineto/CMakeLists.txt @@ -222,3 +222,39 @@ if(KINETO_BUILD_TESTS) add_subdirectory(test) add_subdirectory("${LIBKINETO_THIRDPARTY_DIR}/googletest") endif() + +execute_process( + COMMAND musa_toolkits_version + OUTPUT_VARIABLE MUSA_VERSION_OUTPUT + RESULT_VARIABLE MUSA_VERSION_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +if(MUSA_VERSION_RESULT EQUAL 0) + message(STATUS "MUSA toolkits found") + message(VERBOSE "Raw output: ${MUSA_VERSION_OUTPUT}") + + # Parse the MUSA toolkit version ("version": "X.Y.Z") from the command output + if(MUSA_VERSION_OUTPUT MATCHES "\"version\"[ \t]*:[ \t]*\"[ \t]*([0-9]+)[ \t]*\\.[ \t]*([0-9]+)[ \t]*\\.[ \t]*([0-9]+)[ \t]*\"") + set(MUSA_VERSION_MAJOR ${CMAKE_MATCH_1}) + set(MUSA_VERSION_MINOR ${CMAKE_MATCH_2}) + set(MUSA_VERSION_PATCH ${CMAKE_MATCH_3}) + + # Pack into one integer, e.g. 4.3.2 -> 4 * 10000 + 3 * 100 + 2 = 40302 + math(EXPR REAL_MUSA_VERSION_INT + "${MUSA_VERSION_MAJOR} * 10000 + ${MUSA_VERSION_MINOR} * 100 + ${MUSA_VERSION_PATCH}") + + set(REAL_MUSA_VERSION ${REAL_MUSA_VERSION_INT}) + + add_definitions(-DREAL_MUSA_VERSION=${REAL_MUSA_VERSION_INT}) + + message(STATUS "MUSA version: ${MUSA_VERSION_MAJOR}.${MUSA_VERSION_MINOR}.${MUSA_VERSION_PATCH} -> ${REAL_MUSA_VERSION_INT}") + + else() + message(WARNING "Failed to parse MUSA version from output") + set(REAL_MUSA_VERSION 0) + endif() +else() + message(WARNING "musa_toolkits_version command failed with result: ${MUSA_VERSION_RESULT}") + set(REAL_MUSA_VERSION 0) +endif() \ No newline at end of file diff --git a/libkineto/include/ActivityType.h b/libkineto/include/ActivityType.h index ae629af..b4124a2 100644 --- a/libkineto/include/ActivityType.h +++ b/libkineto/include/ActivityType.h @@ -32,6 +32,7 @@ enum class ActivityType { OVERHEAD, // CUPTI induced overhead events sampled from its overhead API. 
MTIA_RUNTIME, // host side MTIA runtime events MTIA_CCP_EVENTS, // MTIA ondevice CCP events + MTIA_INSIGHT, // MTIA Insight Events CUDA_SYNC, // synchronization events between runtime and kernels // Optional Activity types diff --git a/libkineto/include/ClientInterface.h b/libkineto/include/ClientInterface.h index 950c3be..296c659 100644 --- a/libkineto/include/ClientInterface.h +++ b/libkineto/include/ClientInterface.h @@ -17,6 +17,9 @@ class ClientInterface { virtual void prepare(bool, bool, bool, bool, bool) = 0; virtual void start() = 0; virtual void stop() = 0; + virtual void start_memory_profile() = 0; + virtual void stop_memory_profile() = 0; + virtual void export_memory_profile(const std::string&) = 0; }; } // namespace libkineto diff --git a/libkineto/include/Config.h b/libkineto/include/Config.h index 671eddf..831f32a 100644 --- a/libkineto/include/Config.h +++ b/libkineto/include/Config.h @@ -354,6 +354,14 @@ class Config : public AbstractConfig { return muptiDeviceBufferPoolLimit_; } + bool memoryProfilerEnabled() const { + return memoryProfilerEnabled_; + } + + int profileMemoryDuration() const { + return profileMemoryDuration_; + } + void updateActivityProfilerRequestReceivedTime(); void printActivityProfilerConfig(std::ostream& s) const override; @@ -506,6 +514,10 @@ class Config : public AbstractConfig { // MUPTI Timestamp Format bool useTSCTimestamp_{true}; + + // Memory profiler: enable flag and profiling duration in milliseconds + bool memoryProfilerEnabled_{false}; + int profileMemoryDuration_{1000}; }; constexpr char kUseDaemonEnvVar[] = "KINETO_USE_DAEMON"; diff --git a/libkineto/include/output_base.h b/libkineto/include/output_base.h index 303d3e9..788fcde 100644 --- a/libkineto/include/output_base.h +++ b/libkineto/include/output_base.h @@ -62,6 +62,7 @@ class ActivityLogger { void handleTraceStart() { handleTraceStart(std::unordered_map(), ""); } + virtual void finalizeMemoryTrace(const std::string&, const Config&) = 0; virtual void finalizeTrace( const Config& config, diff --git 
a/libkineto/sample_programs/kineto_mupti_profiler.cpp b/libkineto/sample_programs/kineto_mupti_profiler.cpp index c5d9f49..cbb7974 100644 --- a/libkineto/sample_programs/kineto_mupti_profiler.cpp +++ b/libkineto/sample_programs/kineto_mupti_profiler.cpp @@ -41,6 +41,7 @@ int main() { libkineto::ActivityType::OVERHEAD, libkineto::ActivityType::MTIA_RUNTIME, libkineto::ActivityType::MTIA_CCP_EVENTS, + libkineto::ActivityType::MTIA_INSIGHT, libkineto::ActivityType::CUDA_SYNC, libkineto::ActivityType::GLOW_RUNTIME, libkineto::ActivityType::CUDA_PROFILER_RANGE, diff --git a/libkineto/src/ActivityProfilerController.cpp b/libkineto/src/ActivityProfilerController.cpp index ec56e05..923ea7d 100644 --- a/libkineto/src/ActivityProfilerController.cpp +++ b/libkineto/src/ActivityProfilerController.cpp @@ -74,12 +74,16 @@ ActivityProfilerController::ActivityProfilerController( ActivityProfilerController::~ActivityProfilerController() { configLoader_.removeHandler( ConfigLoader::ConfigKind::ActivityProfiler, this); - if (profilerThread_) { - // signaling termination of the profiler loop - stopRunloop_ = true; - profilerThread_->join(); - delete profilerThread_; - profilerThread_ = nullptr; + for (int thread_type = 0; thread_type < ThreadType::THREAD_MAX_COUNT; + thread_type++) { + std::thread* profilerThread = profilerThreads_[thread_type]; + if (profilerThread) { + // signaling termination of the profiler loop + stopRunloop_ = true; + profilerThread->join(); + delete profilerThread; + profilerThreads_[thread_type] = nullptr; + } } #if !USE_GOOGLE_LOG @@ -234,6 +238,35 @@ void ActivityProfilerController::profilerLoop() { VLOG(0) << "Exited activity profiling loop"; } +// Run loop of the memory-snapshot thread: picks up an async request that has +// the memory profiler enabled and hands it to performMemoryLoop(). +void ActivityProfilerController::memoryProfilerLoop() { + while (!stopRunloop_) { 
+ std::string path; + int profile_time = 0; + std::unique_ptr<Config> config; + { + // asyncRequestConfig_ is guarded by asyncConfigLock_: only touch it here. + std::lock_guard lock(asyncConfigLock_); + if (asyncRequestConfig_ && !profiler_->isActive() && + asyncRequestConfig_->memoryProfilerEnabled()) { + logger_ = makeLogger(*asyncRequestConfig_); + path = asyncRequestConfig_->activitiesLogFile(); + profile_time = asyncRequestConfig_->profileMemoryDuration(); + config = asyncRequestConfig_->clone(); + asyncRequestConfig_ = nullptr; + } + } + if (!config) { + // No request pending: back off instead of spinning a core. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + continue; + } + + profiler_->performMemoryLoop(path, profile_time, logger_.get(), *config); + } +} + void ActivityProfilerController::step() { // Do not remove this copy to currentIter. Otherwise count is not guaranteed. int64_t currentIter = ++iterationCount_; @@ -293,9 +326,18 @@ void ActivityProfilerController::scheduleTrace(const Config& config) { } // start a profilerLoop() thread to handle request - if (!profilerThread_) { - profilerThread_ = - new std::thread(&ActivityProfilerController::profilerLoop, this); + if (config.memoryProfilerEnabled()) { + auto thread_type = ThreadType::MEMORY_SNAPSHOT; + if (!profilerThreads_[thread_type]) { + profilerThreads_[thread_type] = new std::thread( + &ActivityProfilerController::memoryProfilerLoop, this); + } + } else { + auto thread_type = ThreadType::KINETO; + if (!profilerThreads_[thread_type]) { + profilerThreads_[thread_type] = + new std::thread(&ActivityProfilerController::profilerLoop, this); + } } } diff --git a/libkineto/src/ActivityProfilerController.h b/libkineto/src/ActivityProfilerController.h index fd22e70..d030e3e 100644 --- a/libkineto/src/ActivityProfilerController.h +++ b/libkineto/src/ActivityProfilerController.h @@ -27,6 +27,11 @@ #include "LoggerCollector.h" namespace KINETO_NAMESPACE { +enum ThreadType { + KINETO = 0, + MEMORY_SNAPSHOT, + THREAD_MAX_COUNT // Number of enum entries (used for array sizing) +}; class Config; @@ -108,6 +113,7 @@ class ActivityProfilerController : public ConfigLoader::ConfigHandler { bool 
shouldActivateTimestampConfig( const std::chrono::time_point& now); void profilerLoop(); + void memoryProfilerLoop(); void activateConfig(std::chrono::time_point now); std::unique_ptr asyncRequestConfig_; @@ -116,7 +122,7 @@ class ActivityProfilerController : public ConfigLoader::ConfigHandler { std::unique_ptr profiler_; std::unique_ptr logger_; std::shared_ptr loggerCollectorFactory_; - std::thread* profilerThread_{nullptr}; + std::thread* profilerThreads_[ThreadType::THREAD_MAX_COUNT] = {nullptr}; std::atomic_bool stopRunloop_{false}; std::atomic iterationCount_{-1}; ConfigLoader& configLoader_; diff --git a/libkineto/src/ActivityType.cpp b/libkineto/src/ActivityType.cpp index 066429a..20c6f04 100644 --- a/libkineto/src/ActivityType.cpp +++ b/libkineto/src/ActivityType.cpp @@ -32,6 +32,7 @@ static constexpr std::array map{{ {"overhead", ActivityType::OVERHEAD}, {"mtia_runtime", ActivityType::MTIA_RUNTIME}, {"mtia_ccp_events", ActivityType::MTIA_CCP_EVENTS}, + {"mtia_insight", ActivityType::MTIA_INSIGHT}, {"cuda_sync", ActivityType::CUDA_SYNC}, {"glow_runtime", ActivityType::GLOW_RUNTIME}, {"musa_profiler_range", ActivityType::CUDA_PROFILER_RANGE}, diff --git a/libkineto/src/MuptiActivity.cpp b/libkineto/src/MuptiActivity.cpp index 785c74e..0ed68c6 100644 --- a/libkineto/src/MuptiActivity.cpp +++ b/libkineto/src/MuptiActivity.cpp @@ -145,7 +145,7 @@ inline std::string memcpyName(uint8_t kind, uint8_t src, uint8_t dst) { inline std::string memoryAtomicName(uint8_t kind, uint8_t src, uint8_t dst) { return fmt::format( - "Memcpy {} ({} -> {})", + "MemoryAtomic {} ({} -> {})", 
memoryAtomicKindString((MUpti_ActivityMemoryAtomicKind)kind), memoryKindString((MUpti_ActivityMemoryKind)src), memoryKindString((MUpti_ActivityMemoryKind)dst)); @@ -153,7 +153,7 @@ inline std::string memoryAtomicName(uint8_t kind, uint8_t src, uint8_t dst) { inline std::string memoryAtomicValueName(uint8_t kind, uint8_t dst) { return fmt::format( - "Memcpy {} ({} -> {})", + "MemoryAtomicValue {} ({})", memoryAtomicValueKindString((MUpti_ActivityMemoryAtomicValueKind)kind), memoryKindString((MUpti_ActivityMemoryKind)dst)); } @@ -215,6 +215,33 @@ inline const std::string GpuActivity::metadataJson() cons // clang-format on } +// MUpti_ActivityMemoryTransfer records: compiled only when REAL_MUSA_VERSION >= 40306 (toolkit 4.3.6). +#if defined(REAL_MUSA_VERSION) && (REAL_MUSA_VERSION >= 40306) +template<> +inline ActivityType GpuActivity::type() const { + return ActivityType::GPU_MEMCPY; +} + +template<> +inline const std::string GpuActivity::name() const { + return "MemTransfer(d2d)"; +} + +template<> +inline const std::string GpuActivity::metadataJson() const { + const MUpti_ActivityMemoryTransfer& memcpy = raw(); + // clang-format off + return fmt::format(R"JSON( + "device": {}, "context": {}, + "stream": {}, "correlation": {}, + "bytes": {}, "memory bandwidth (GB/s)": {})JSON", + memcpy.deviceId, memcpy.contextId, + memcpy.streamId, memcpy.correlationId, + memcpy.bytes, bandwidth(memcpy.bytes, memcpy.end - memcpy.start)); + // clang-format on +} +#endif + template<> inline ActivityType GpuActivity::type() const { return ActivityType::GPU_MEMCPY; diff --git a/libkineto/src/MuptiActivityApi.cpp b/libkineto/src/MuptiActivityApi.cpp index abdc3dc..629af9b 100644 --- a/libkineto/src/MuptiActivityApi.cpp +++ b/libkineto/src/MuptiActivityApi.cpp @@ -319,6 +319,11 @@ void MuptiActivityApi::enableMuptiActivities( for (const auto& activity : selected_activities) { if (activity == ActivityType::GPU_MEMCPY) { MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_MEMCPY)); + MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC)); + 
MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC_VALUE)); +#if defined(REAL_MUSA_VERSION) && (REAL_MUSA_VERSION >= 40306) + MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_MEMORY_TRANSFER)); +#endif } if (activity == ActivityType::GPU_MEMSET) { MUPTI_CALL(muptiActivityEnable(MUPTI_ACTIVITY_KIND_MEMSET)); @@ -357,6 +362,11 @@ void MuptiActivityApi::disableMuptiActivities( for (const auto& activity : selected_activities) { if (activity == ActivityType::GPU_MEMCPY) { MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_MEMCPY)); + MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC)); + MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC_VALUE)); +#if defined(REAL_MUSA_VERSION) && (REAL_MUSA_VERSION >= 40306) + MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_MEMORY_TRANSFER)); +#endif } if (activity == ActivityType::GPU_MEMSET) { MUPTI_CALL(muptiActivityDisable(MUPTI_ACTIVITY_KIND_MEMSET)); diff --git a/libkineto/src/MuptiActivityProfiler.cpp b/libkineto/src/MuptiActivityProfiler.cpp index c0530db..c2624d6 100644 --- a/libkineto/src/MuptiActivityProfiler.cpp +++ b/libkineto/src/MuptiActivityProfiler.cpp @@ -819,13 +819,22 @@ void MuptiActivityProfiler::handleMuptiActivity( reinterpret_cast(record), logger); break; case MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC: + VLOG(2) << "handle memory-atomic"; handleGpuActivity( reinterpret_cast(record), logger); break; case MUPTI_ACTIVITY_KIND_MEMORY_ATOMIC_VALUE: + VLOG(2) << "handle memory-atomic-value"; handleGpuActivity( reinterpret_cast(record), logger); break; +#if defined(REAL_MUSA_VERSION) && (REAL_MUSA_VERSION >= 40306) + case MUPTI_ACTIVITY_KIND_MEMORY_TRANSFER: + VLOG(2) << "handle memory-transfer"; + handleGpuActivity( + reinterpret_cast(record), logger); + break; +#endif case MUPTI_ACTIVITY_KIND_MEMSET: handleGpuActivity( reinterpret_cast(record), logger); @@ -1233,6 +1242,30 @@ const time_point MuptiActivityProfiler::performRunLoopStep( return new_wakeup_time; } +// Runs one memory-profiling session through the registered client: start, +// wait profile_time milliseconds, export to path, stop, then let the logger +// finalize the result. Only logs a warning when no client is registered. 
+void MuptiActivityProfiler::performMemoryLoop( + const string& path, + uint32_t profile_time, + ActivityLogger* logger, + Config& config) { + currentRunloopState_ = RunloopState::CollectMemorySnapshot; + if (libkineto::api().client()) { + libkineto::api().client()->start_memory_profile(); + LOG(INFO) << "Running memory profiling for " << profile_time << " ms"; + std::this_thread::sleep_for(std::chrono::milliseconds(profile_time)); + LOG(INFO) << "Exporting memory profiling results to " << path; + libkineto::api().client()->export_memory_profile(path); + libkineto::api().client()->stop_memory_profile(); + LOG(INFO) << "Finalizing trace"; + logger->finalizeMemoryTrace(path, config); + } else { + LOG(WARNING) << "Memory profiling requested but no client is registered"; + } + currentRunloopState_ = RunloopState::WaitForRequest; +} + void MuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& logger) { LOG(INFO) << "CPU Traces Recorded:"; { diff --git a/libkineto/src/MuptiActivityProfiler.h b/libkineto/src/MuptiActivityProfiler.h index ad0d3f1..af62a35 100644 --- a/libkineto/src/MuptiActivityProfiler.h +++ b/libkineto/src/MuptiActivityProfiler.h @@ -118,6 +118,9 @@ class MuptiActivityProfiler { bool isActive() const { return currentRunloopState_ != RunloopState::WaitForRequest; } + bool isCollectingMemorySnapshot() const { + return currentRunloopState_ == RunloopState::CollectMemorySnapshot; + } // Invoke at a regular interval to perform profiling activities. 
// When not active, an interval of 1-5 seconds is probably fine, @@ -129,6 +132,12 @@ class MuptiActivityProfiler { const std::chrono::time_point& nextWakeupTime, int64_t currentIter = -1); + void performMemoryLoop( + const std::string& path, + uint32_t profile_time, + ActivityLogger* logger, + Config& config); + // Used for async requests void setLogger(ActivityLogger* logger) { logger_ = logger; @@ -424,7 +433,8 @@ class MuptiActivityProfiler { WaitForRequest, Warmup, CollectTrace, - ProcessTrace + ProcessTrace, + CollectMemorySnapshot, }; // All recorded trace spans, both CPU and GPU diff --git a/libkineto/src/MuptiCallbackApi.cpp b/libkineto/src/MuptiCallbackApi.cpp index 83856dc..9d9d9f5 100644 --- a/libkineto/src/MuptiCallbackApi.cpp +++ b/libkineto/src/MuptiCallbackApi.cpp @@ -65,7 +65,10 @@ static void callback_switchboard( // below statement is likey going to call a mutex // on the singleton access - // MuptiCallbackApi::singleton()->__callback_switchboard(domain, cbid, cbInfo); +//#if defined(REAL_MUSA_VERSION) && (REAL_MUSA_VERSION >= 40303) +// MuptiCallbackApi::singleton()->__callback_switchboard( +// domain, cbid, cbInfo); +//#endif } @@ -94,8 +97,9 @@ void MuptiCallbackApi::__callback_switchboard( LOG(INFO) << " Calling muptiFinalize in exit callsite"; // Teardown MUPTI calling muptiFinalize() MUPTI_CALL(muptiUnsubscribe(subscriber_)); - // TODO: MUPTI muptiFinalize is not yet implemented - // MUPTI_CALL(muptiFinalize()); +#if defined(REAL_MUSA_VERSION) && (REAL_MUSA_VERSION >= 40303) + MUPTI_CALL(muptiFinalize()); +#endif initSuccess_ = false; subscriber_ = nullptr; MuptiActivityApi::singleton().teardownMupti_ = 0; diff --git a/libkineto/src/output_json.cpp b/libkineto/src/output_json.cpp index 71da3f0..69a6690 100644 --- a/libkineto/src/output_json.cpp +++ b/libkineto/src/output_json.cpp @@ -148,6 +148,10 @@ void ChromeTraceLogger::openTraceFile() { } } +void ChromeTraceLogger::finalizeMemoryTrace(const std::string&, const Config&) { + 
LOG(INFO) << "finalizeMemoryTrace not implemented for ChromeTraceLogger"; +} + ChromeTraceLogger::ChromeTraceLogger(const std::string& traceFileName) { fileName_ = traceFileName.empty() ? defaultFileName() : traceFileName; traceOf_.clear(std::ios_base::badbit); diff --git a/libkineto/src/output_json.h b/libkineto/src/output_json.h index 14a85ad..b72d439 100644 --- a/libkineto/src/output_json.h +++ b/libkineto/src/output_json.h @@ -81,6 +81,8 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { std::unordered_map>& metadata) override; + void finalizeMemoryTrace(const std::string&, const Config&) override; + std::string traceFileName() const { return fileName_; } diff --git a/libkineto/src/output_membuf.h b/libkineto/src/output_membuf.h index 394bbc0..004ac9a 100644 --- a/libkineto/src/output_membuf.h +++ b/libkineto/src/output_membuf.h @@ -18,6 +18,7 @@ #include "ActivityBuffers.h" #include "Config.h" #include "GenericTraceActivity.h" +#include "Logger.h" #include "output_base.h" namespace KINETO_NAMESPACE { @@ -77,6 +78,10 @@ class MemoryTraceLogger : public ActivityLogger { buffers_ = std::move(buffers); endTime_ = endTime; } + + void finalizeMemoryTrace(const std::string&, const Config&) override { + LOG(INFO) << "finalizeMemoryTrace not implemented for MemoryTraceLogger"; + } const std::vector* traceActivities() { return &activities_;