update tracy from 11.0 to 13.1 and fix build with tracy enabled

This commit is contained in:
Sven Balzer
2026-05-01 18:24:04 +02:00
parent 7fa5294e02
commit 2adf75973a
304 changed files with 20579 additions and 170182 deletions
+98 -24
View File
@@ -179,7 +179,7 @@ private:
uint8_t m_val[6];
};
struct Int48Sort { bool operator()( const Int48& lhs, const Int48& rhs ) { return lhs.Val() < rhs.Val(); }; };
struct Int48Sort { bool operator()( const Int48& lhs, const Int48& rhs ) const { return lhs.Val() < rhs.Val(); }; };
struct SourceLocationBase
@@ -264,7 +264,7 @@ struct SampleData
enum { SampleDataSize = sizeof( SampleData ) };
struct SampleDataSort { bool operator()( const SampleData& lhs, const SampleData& rhs ) { return lhs.time.Val() < rhs.time.Val(); }; };
struct SampleDataSort { bool operator()( const SampleData& lhs, const SampleData& rhs ) const { return lhs.time.Val() < rhs.time.Val(); }; };
struct SampleDataRange
@@ -412,6 +412,7 @@ struct GpuEvent
uint64_t _gpuStart_child1;
uint64_t _gpuEnd_child2;
Int24 callstack;
uint16_t query_id;
};
enum { GpuEventSize = sizeof( GpuEvent ) };
@@ -530,36 +531,93 @@ struct CrashEvent
enum { CrashEventSize = sizeof( CrashEvent ) };
/**
* Represents a context switch.
* Start is when the thread wakes up (if known).
* End is when the context switch to another thread (or idle) happens.
*/
struct ContextSwitchData
{
enum : int8_t { Fiber = 99 };
enum : int8_t { NoState = 100 };
enum : int8_t { Wakeup = -2 };
enum CSReason : int8_t {
Wakeup = -2,
Fiber = 99,
NoState = 100,
tracy_force_inline int64_t Start() const { return int64_t( _start_cpu ) >> 16; }
tracy_force_inline void SetStart( int64_t start ) { assert( start < (int64_t)( 1ull << 47 ) ); memcpy( ((char*)&_start_cpu)+2, &start, 4 ); memcpy( ((char*)&_start_cpu)+6, ((char*)&start)+4, 2 ); }
tracy_force_inline int64_t End() const { return int64_t( _end_reason_state ) >> 16; }
tracy_force_inline void SetEnd( int64_t end ) { assert( end < (int64_t)( 1ull << 47 ) ); memcpy( ((char*)&_end_reason_state)+2, &end, 4 ); memcpy( ((char*)&_end_reason_state)+6, ((char*)&end)+4, 2 ); }
tracy_force_inline bool IsEndValid() const { return ( _end_reason_state >> 63 ) == 0; }
tracy_force_inline uint8_t Cpu() const { return uint8_t( _start_cpu & 0xFF ); }
tracy_force_inline void SetCpu( uint8_t cpu ) { memcpy( &_start_cpu, &cpu, 1 ); }
tracy_force_inline int8_t Reason() const { return int8_t( (_end_reason_state >> 8) & 0xFF ); }
tracy_force_inline void SetReason( int8_t reason ) { memcpy( ((char*)&_end_reason_state)+1, &reason, 1 ); }
tracy_force_inline int8_t State() const { return int8_t( _end_reason_state & 0xFF ); }
tracy_force_inline void SetState( int8_t state ) { memcpy( &_end_reason_state, &state, 1 ); }
// See KWAIT_REASON in the WDK's wdm.h
Win32_Executive = 0 ,
Win32_FreePage = 1 ,
Win32_PageIn = 2 ,
Win32_PoolAllocation = 3 ,
Win32_DelayExecution = 4 ,
Win32_Suspended = 5 ,
Win32_UserRequest = 6 ,
Win32_WrExecutive = 7 ,
Win32_WrFreePage = 8 ,
Win32_WrPageIn = 9 ,
Win32_WrPoolAllocation = 10,
Win32_WrDelayExecution = 11,
Win32_WrSuspended = 12,
Win32_WrUserRequest = 13,
Win32_WrEventPair = 14,
Win32_WrQueue = 15,
Win32_WrLpcReceive = 16,
Win32_WrLpcReply = 17,
Win32_WrVirtualMemory = 18,
Win32_WrPageOut = 19,
Win32_WrRendezvous = 20,
Win32_WrKeyedEvent = 21,
Win32_WrTerminated = 22,
Win32_WrProcessInSwap = 23,
Win32_WrCpuRateControl = 24,
Win32_WrCalloutStack = 25,
Win32_WrKernel = 26,
Win32_WrResource = 27,
Win32_WrPushLock = 28,
Win32_WrMutex = 29,
Win32_WrQuantumEnd = 30,
Win32_WrDispatchInt = 31,
Win32_WrPreempted = 32,
Win32_WrYieldExecution = 33,
Win32_WrFastMutex = 34,
Win32_WrGuardedMutex = 35,
Win32_WrRundown = 36,
Win32_WrAlertByThreadId = 37,
Win32_WrDeferredPreempt = 38,
Win32_WrPhysicalFault = 39,
Win32_WrIoRing = 40,
Win32_WrMdlCache = 41,
Win32_WrRcu = 42,
Win32_MaximumWaitReason,
};
tracy_force_inline int64_t Start() const { return _start.Val(); }
tracy_force_inline void SetStart( int64_t start ) { assert( start < (int64_t)( 1ull << 47 ) ); _start.SetVal(start); }
tracy_force_inline int64_t End() const { return _end.Val(); }
tracy_force_inline void SetEnd( int64_t end ) { assert( end < (int64_t)( 1ull << 47 ) ); _end = end; }
tracy_force_inline bool IsEndValid() const { return _end.IsNonNegative(); }
tracy_force_inline uint8_t Cpu() const { return _cpu; }
tracy_force_inline void SetCpu( uint8_t cpu ) { _cpu = cpu; }
tracy_force_inline uint8_t WakeupCpu() const { return _wakeupcpu; }
tracy_force_inline void SetWakeupCpu( uint8_t wakeupcpu) { _wakeupcpu = wakeupcpu; }
tracy_force_inline CSReason Reason() const { return CSReason( _reason ); }
tracy_force_inline void SetReason( int8_t reason ) { _reason = reason; }
tracy_force_inline int8_t State() const { return _state; }
tracy_force_inline void SetState( int8_t state ) { _state = state; }
tracy_force_inline int64_t WakeupVal() const { return _wakeup.Val(); }
tracy_force_inline void SetWakeup( int64_t wakeup ) { assert( wakeup < (int64_t)( 1ull << 47 ) ); _wakeup.SetVal( wakeup ); }
tracy_force_inline uint16_t Thread() const { return _thread; }
tracy_force_inline void SetThread( uint16_t thread ) { _thread = thread; }
tracy_force_inline void SetStartCpu( int64_t start, uint8_t cpu ) { assert( start < (int64_t)( 1ull << 47 ) ); _start_cpu = ( uint64_t( start ) << 16 ) | cpu; }
tracy_force_inline void SetEndReasonState( int64_t end, int8_t reason, int8_t state ) { assert( end < (int64_t)( 1ull << 47 ) ); _end_reason_state = ( uint64_t( end ) << 16 ) | ( uint64_t( reason ) << 8 ) | uint8_t( state ); }
uint64_t _start_cpu;
uint64_t _end_reason_state;
Int48 _start;
uint8_t _cpu;
uint8_t _wakeupcpu;
Int48 _end;
int8_t _reason;
int8_t _state;
Int48 _wakeup;
uint16_t _thread;
uint16_t _thread; // currently unused? Could store next thread or priorities here.
};
enum { ContextSwitchDataSize = sizeof( ContextSwitchData ) };
@@ -717,6 +775,8 @@ struct GpuCtxData
uint32_t overflowMul;
StringIdx name;
unordered_flat_map<uint64_t, GpuCtxThreadData> threadData;
unordered_flat_map<int64_t, StringIdx> noteNames;
unordered_flat_map<uint16_t, unordered_flat_map<int64_t, double>> notes;
short_ptr<GpuEvent> query[64*1024];
};
@@ -742,7 +802,7 @@ enum class PlotValueFormatting : uint8_t
struct PlotData
{
struct PlotItemSort { bool operator()( const PlotItem& lhs, const PlotItem& rhs ) { return lhs.time.Val() < rhs.time.Val(); }; };
struct PlotItemSort { bool operator()( const PlotItem& lhs, const PlotItem& rhs ) const { return lhs.time.Val() < rhs.time.Val(); }; };
uint64_t name;
double min;
@@ -809,6 +869,10 @@ struct SourceLocationComparator
struct ContextSwitch
{
Vector<ContextSwitchData> v;
struct {
int64_t time = 0;
uint8_t cpu = -1;
} pendingWakeUp;
int64_t runningTime = 0;
};
@@ -845,6 +909,16 @@ struct SymbolStats
enum { SymbolStatsSize = sizeof( SymbolStats ) };
// One node of a flame graph: a source location plus its nested children,
// forming a tree. NOTE(review): field semantics below are inferred from the
// names only — confirm against the flame-graph builder/renderer.
struct FlameGraphItem
{
    int64_t srcloc;                       // source location id — presumably indexes the Worker's source location tables; TODO confirm
    int64_t time;                         // presumably accumulated time attributed to this node; TODO confirm
    StringIdx name;                       // display-name index into the string table
    int64_t begin;                        // presumably start offset/timestamp used for layout; TODO confirm
    std::vector<FlameGraphItem> children; // nested child nodes (callees)
};
}
#endif
+1 -1
View File
@@ -3,9 +3,9 @@
#include <algorithm>
#include <stddef.h>
#include <zstd.h>
#include "../public/common/tracy_lz4.hpp"
#include "../zstd/zstd.h"
namespace tracy
{
+5 -6
View File
@@ -9,16 +9,16 @@
#include <stdio.h>
#include <string.h>
#include <string>
#include <sys/stat.h>
#include <thread>
#include <utility>
#include <vector>
#include <sys/stat.h>
#include <zstd.h>
#ifdef _MSC_VER
# define stat64 _stat64
#endif
#if defined __APPLE__ || defined __FreeBSD__
#if defined __APPLE__ || defined __FreeBSD__ || (defined __linux__ && !defined __GLIBC__)
# define stat64 stat
#endif
@@ -28,7 +28,6 @@
#include "../public/common/TracyYield.hpp"
#include "../public/common/tracy_lz4.hpp"
#include "../public/common/TracyForceInline.hpp"
#include "../zstd/zstd.h"
namespace tracy
{
@@ -489,9 +488,9 @@ private:
uptr->thread = std::thread( [ptr = uptr.get()] { Worker( ptr ); } );
m_streams.emplace_back( std::move( uptr ) );
m_dataOffset += sz;
}
}
GetNextDataBlock();
GetNextDataBlock();
}
tracy_force_inline uint32_t ReadBlockSize()
+1 -1
View File
@@ -14,13 +14,13 @@
#include <thread>
#include <utility>
#include <vector>
#include <zstd.h>
#include "TracyFileHeader.hpp"
#include "TracyFileMeta.hpp"
#include "../public/common/tracy_lz4.hpp"
#include "../public/common/tracy_lz4hc.hpp"
#include "../public/common/TracyForceInline.hpp"
#include "../zstd/zstd.h"
namespace tracy
{
+18 -12
View File
@@ -155,6 +155,22 @@ static inline void PrintSecondsFrac( char*& buf, uint64_t v )
}
}
// Absolute value of a signed 64-bit integer, returned as unsigned so that the
// magnitude of INT64_MIN (which does not fit in int64_t) is still representable.
// Negating after the cast to uint64_t is well-defined modular arithmetic,
// whereas `-x` on the signed value would be UB for INT64_MIN.
// See https://github.com/wolfpld/tracy/pull/1040 and
// https://graphics.stanford.edu/~seander/bithacks.html#IntegerAbs
uint64_t _int64_abs( int64_t x )
{
    const auto u = (uint64_t)x;
    return x < 0 ? 0 - u : u;
}
const char* TimeToString( int64_t _ns )
{
enum { Pool = 8 };
@@ -164,16 +180,11 @@ const char* TimeToString( int64_t _ns )
char* bufstart = buf;
bufsel = ( bufsel + 1 ) % Pool;
uint64_t ns;
uint64_t ns = _int64_abs(_ns);
if( _ns < 0 )
{
*buf = '-';
buf++;
ns = -_ns;
}
else
{
ns = _ns;
}
if( ns < 1000 )
@@ -245,16 +256,11 @@ const char* TimeToStringExact( int64_t _ns )
char* bufstart = buf;
bufsel = ( bufsel + 1 ) % Pool;
uint64_t ns;
uint64_t ns = _int64_abs(_ns);
if( _ns < 0 )
{
*buf = '-';
buf++;
ns = -_ns;
}
else
{
ns = _ns;
}
const char* numStart = buf;
+4 -13
View File
@@ -1,19 +1,10 @@
#ifndef __TRACYSORT_HPP__
#define __TRACYSORT_HPP__
#ifndef NO_PARALLEL_SORT
# if !defined __APPLE__ && !defined __EMSCRIPTEN__ && ( ( defined _MSC_VER && _MSVC_LANG >= 201703L ) || __cplusplus >= 201703L )
# if __has_include(<execution>)
# include <algorithm>
# include <execution>
# else
# define NO_PARALLEL_SORT
# endif
# else
# define NO_PARALLEL_SORT
# endif
#ifdef __EMSCRIPTEN__
# include "tracy_pdqsort.h"
#else
# include <ppqsort.h>
#endif
#include "tracy_pdqsort.h"
#endif
+3 -3
View File
@@ -102,10 +102,10 @@ public:
const auto se = sb + sortedEnd;
const auto sl = se - 1;
const auto ue = v.end();
#ifdef NO_PARALLEL_SORT
pdqsort_branchless( se, ue, comp );
#ifdef __EMSCRIPTEN__
pdqsort_branchless( sb, se, comp );
#else
std::sort( std::execution::par_unseq, se, ue, comp );
ppqsort::sort( ppqsort::execution::par, sb, se, comp );
#endif
const auto ss = std::lower_bound( sb, se, *se, comp );
const auto uu = std::lower_bound( se, ue, *sl, comp );
@@ -1,4 +1,4 @@
#include "../zstd/zstd.h"
#include <zstd.h>
#include "TracyEvent.hpp"
#include "TracyTextureCompression.hpp"
+359 -112
View File
@@ -24,7 +24,7 @@
#include <capstone.h>
#define ZDICT_STATIC_LINKING_ONLY
#include "../zstd/zdict.h"
#include <zdict.h>
#include "../public/common/TracyProtocol.hpp"
#include "../public/common/TracySystem.hpp"
@@ -37,16 +37,11 @@
#include "TracySort.hpp"
#include "TracyTaskDispatch.hpp"
#include "TracyWorker.hpp"
#include "tracy_pdqsort.h"
namespace tracy
{
// Splits a 64-bit packed file/line value: the low 32 bits are the line number
// (written to `line`), and the high 32 bits — the file id — are returned.
static tracy_force_inline uint32_t UnpackFileLine( uint64_t packed, uint32_t& line )
{
    line = packed & 0xFFFFFFFF;
    return packed >> 32;
}
static bool SourceFileValid( const char* fn, uint64_t olderThan )
{
struct stat buf;
@@ -268,14 +263,6 @@ Worker::Worker( const char* addr, uint16_t port, int64_t memoryLimit )
, m_buffer( new char[TargetFrameSize*3 + 1] )
, m_bufferOffset( 0 )
, m_inconsistentSamples( false )
, m_pendingStrings( 0 )
, m_pendingThreads( 0 )
, m_pendingFibers( 0 )
, m_pendingExternalNames( 0 )
, m_pendingSourceLocation( 0 )
, m_pendingCallstackFrames( 0 )
, m_pendingCallstackSubframes( 0 )
, m_pendingSymbolCode( 0 )
, m_memoryLimit( memoryLimit )
, m_callstackFrameStaging( nullptr )
, m_traceVersion( CurrentVersion )
@@ -306,7 +293,6 @@ Worker::Worker( const char* addr, uint16_t port, int64_t memoryLimit )
Worker::Worker( const char* name, const char* program, const std::vector<ImportEventTimeline>& timeline, const std::vector<ImportEventMessages>& messages, const std::vector<ImportEventPlots>& plots, const std::unordered_map<uint64_t, std::string>& threadNames )
: m_hasData( true )
, m_delay( 0 )
, m_resolution( 0 )
, m_captureName( name )
, m_captureProgram( program )
@@ -581,8 +567,10 @@ Worker::Worker( FileRead& f, EventType::Type eventMask, bool bgTasks, bool allow
{
throw LegacyVersion( fileVer );
}
f.Read( m_delay );
if( fileVer < FileVersion( 0, 12, 3 ) )
{
f.Skip( 8 ); // m_delay
}
}
else
{
@@ -643,22 +631,50 @@ Worker::Worker( FileRead& f, EventType::Type eventMask, bool bgTasks, bool allow
uint32_t packageId;
uint64_t psz;
f.Read2( packageId, psz );
auto& package = *m_data.cpuTopology.emplace( packageId, unordered_flat_map<uint32_t, std::vector<uint32_t>> {} ).first;
auto& package = *m_data.cpuTopology.emplace( packageId, unordered_flat_map<uint32_t, unordered_flat_map<uint32_t, std::vector<uint32_t>>> {} ).first;
package.second.reserve( psz );
for( uint64_t j=0; j<psz; j++ )
{
uint32_t coreId;
uint64_t csz;
f.Read2( coreId, csz );
auto& core = *package.second.emplace( coreId, std::vector<uint32_t> {} ).first;
core.second.reserve( csz );
for( uint64_t k=0; k<csz; k++ )
if( fileVer >= FileVersion( 0, 11, 2 ) )
{
uint32_t thread;
f.Read( thread );
core.second.emplace_back( thread );
uint32_t dieId;
uint64_t dsz;
f.Read2( dieId, dsz );
auto& die = *package.second.emplace( dieId, unordered_flat_map<uint32_t, std::vector<uint32_t>> {} ).first;
die.second.reserve( dsz );
for( uint64_t k=0; k<dsz; k++ )
{
uint32_t coreId;
uint64_t csz;
f.Read2( coreId, csz );
auto& core = *die.second.emplace( coreId, std::vector<uint32_t> {} ).first;
core.second.reserve( csz );
for( uint64_t l=0; l<csz; l++ )
{
uint32_t thread;
f.Read( thread );
core.second.emplace_back( thread );
m_data.cpuTopologyMap.emplace( thread, CpuThreadTopology { packageId, coreId } );
m_data.cpuTopologyMap.emplace( thread, CpuThreadTopology { packageId, dieId, coreId } );
}
}
}
else
{
auto& die = *package.second.emplace( 0, unordered_flat_map<uint32_t, std::vector<uint32_t>> {} ).first;
uint32_t coreId;
uint64_t csz;
f.Read2( coreId, csz );
auto& core = *die.second.emplace( coreId, std::vector<uint32_t> {} ).first;
core.second.reserve( csz );
for( uint64_t k=0; k<csz; k++ )
{
uint32_t thread;
f.Read( thread );
core.second.emplace_back( thread );
m_data.cpuTopologyMap.emplace( thread, CpuThreadTopology { packageId, 0, coreId } );
}
}
}
}
@@ -717,7 +733,7 @@ Worker::Worker( FileRead& f, EventType::Type eventMask, bool bgTasks, bool allow
{
m_data.stringData.reserve_exact( sz, m_slab );
}
for( uint64_t i=0; i<sz; i++ )
{
uint64_t ptr, ssz;
@@ -1081,6 +1097,18 @@ Worker::Worker( FileRead& f, EventType::Type eventMask, bool bgTasks, bool allow
auto ctx = m_slab.AllocInit<GpuCtxData>();
uint8_t calibration;
f.Read7( ctx->thread, calibration, ctx->count, ctx->period, ctx->type, ctx->name, ctx->overflow );
uint64_t notesz;
if( fileVer >= FileVersion( 0, 12, 4 ) )
{
f.Read( notesz );
for( uint64_t i = 0; i < notesz; i++ )
{
decltype( ctx->noteNames )::key_type key;
decltype( ctx->noteNames )::mapped_type value;
f.Read2( key, value );
ctx->noteNames[key] = value;
}
}
ctx->hasCalibration = calibration;
ctx->hasPeriod = ctx->period != 1.f;
m_data.gpuCnt += ctx->count;
@@ -1095,9 +1123,32 @@ Worker::Worker( FileRead& f, EventType::Type eventMask, bool bgTasks, bool allow
int64_t refTime = 0;
int64_t refGpuTime = 0;
auto td = ctx->threadData.emplace( tid, GpuCtxThreadData {} ).first;
ReadTimeline( f, td->second.timeline, tsz, refTime, refGpuTime, childIdx );
ReadTimeline( f, td->second.timeline, tsz, refTime, refGpuTime, childIdx, fileVer >= FileVersion( 0, 12, 4 ) );
}
}
if( fileVer >= FileVersion( 0, 12, 4 ) )
{
f.Read( notesz );
ctx->notes.reserve( notesz );
for( uint64_t i = 0; i < notesz; i++ )
{
uint16_t query_id;
f.Read( query_id );
auto& notes = ctx->notes[query_id];
uint64_t note_count;
f.Read( note_count );
notes.reserve( note_count );
for( uint64_t i = 0; i < note_count; i++ )
{
int64_t id;
double value;
f.Read2( id, value );
notes[id] = value;
}
}
}
m_data.gpuData[i] = ctx;
}
@@ -1397,6 +1448,7 @@ Worker::Worker( FileRead& f, EventType::Type eventMask, bool bgTasks, bool allow
s_loadProgress.subTotal.store( 0, std::memory_order_relaxed );
s_loadProgress.progress.store( LoadProgress::ContextSwitches, std::memory_order_relaxed );
const bool ctxSwitchesHaveWakeupCpu = fileVer >= FileVersion( 0, 11, 3 );
if( eventMask & EventType::ContextSwitches )
{
f.Read( sz );
@@ -1415,16 +1467,28 @@ Worker::Worker( FileRead& f, EventType::Type eventMask, bool bgTasks, bool allow
for( uint64_t j=0; j<csz; j++ )
{
int64_t deltaWakeup, deltaStart, diff, thread;
uint8_t cpu;
uint8_t cpu, wakeupcpu;
int8_t reason, state;
f.Read7( deltaWakeup, deltaStart, diff, cpu, reason, state, thread );
if ( ctxSwitchesHaveWakeupCpu )
{
f.Read(wakeupcpu);
}
else
{
wakeupcpu = cpu;
}
refTime += deltaWakeup;
ptr->SetWakeup( refTime );
ptr->SetWakeupCpu( wakeupcpu );
refTime += deltaStart;
ptr->SetStartCpu( refTime, cpu );
ptr->SetStart( refTime );
ptr->SetCpu( cpu );
if( diff > 0 ) runningTime += diff;
refTime += diff;
ptr->SetEndReasonState( refTime, reason, state );
ptr->SetEnd( refTime );
ptr->SetReason( reason );
ptr->SetState( state );
ptr->SetThread( CompressThread( thread ) );
ptr++;
}
@@ -1442,7 +1506,7 @@ Worker::Worker( FileRead& f, EventType::Type eventMask, bool bgTasks, bool allow
f.Skip( sizeof( uint64_t ) );
uint64_t csz;
f.Read( csz );
f.Skip( csz * ( sizeof( int64_t ) * 4 + sizeof( int8_t ) * 3 ) );
f.Skip( csz * ( sizeof( int64_t ) * 4 + sizeof( int8_t ) * ( 3 + int( ctxSwitchesHaveWakeupCpu ) ) ) );
}
}
@@ -1530,12 +1594,13 @@ Worker::Worker( FileRead& f, EventType::Type eventMask, bool bgTasks, bool allow
m_data.symbolLoc[symIdx++] = SymbolLocation { symAddr, size.Val() };
}
}
#ifdef NO_PARALLEL_SORT
#ifdef __EMSCRIPTEN__
pdqsort_branchless( m_data.symbolLoc.begin(), m_data.symbolLoc.end(), [] ( const auto& l, const auto& r ) { return l.addr < r.addr; } );
pdqsort_branchless( m_data.symbolLocInline.begin(), m_data.symbolLocInline.end() );
#else
std::sort( std::execution::par_unseq, m_data.symbolLoc.begin(), m_data.symbolLoc.end(), [] ( const auto& l, const auto& r ) { return l.addr < r.addr; } );
std::sort( std::execution::par_unseq, m_data.symbolLocInline.begin(), m_data.symbolLocInline.end() );
ppqsort::sort( ppqsort::execution::par, m_data.symbolLoc.begin(), m_data.symbolLoc.end(), [] ( const auto& l, const auto& r ) { return l.addr < r.addr; } );
ppqsort::sort( ppqsort::execution::par, m_data.symbolLocInline.begin(), m_data.symbolLocInline.end() );
#endif
f.Read( sz );
@@ -2441,12 +2506,17 @@ const SourceLocation& Worker::GetSourceLocation( int16_t srcloc ) const
{
return *m_data.sourceLocationPayload[-srcloc-1];
}
else
else if( srcloc != std::numeric_limits<int16_t>::max() )
{
const auto it = m_data.sourceLocation.find( m_data.sourceLocationExpand[srcloc] );
assert( it != m_data.sourceLocation.end() );
return it->second;
}
else
{
static const SourceLocation emptySourceLoc = {};
return emptySourceLoc;
}
}
std::pair<const char*, const char*> Worker::GetExternalName( uint64_t id ) const
@@ -2715,7 +2785,6 @@ void Worker::Exec()
m_data.framesBase->frames.push_back( FrameEvent{ 0, -1, -1 } );
m_data.framesBase->frames.push_back( FrameEvent{ initEnd, -1, -1 } );
m_data.lastTime = initEnd;
m_delay = TscPeriod( welcome.delay );
m_resolution = TscPeriod( welcome.resolution );
m_pid = welcome.pid;
m_samplingPeriod = welcome.samplingPeriod;
@@ -2723,7 +2792,7 @@ void Worker::Exec()
m_captureProgram = welcome.programName;
m_captureTime = welcome.epoch;
m_executableTime = welcome.exectime;
m_ignoreMemFreeFaults = ( welcome.flags & WelcomeFlag::OnDemand ) || ( welcome.flags & WelcomeFlag::IsApple );
m_ignoreMemFreeFaults = ( welcome.flags & WelcomeFlag::OnDemand ) || ( welcome.flags & WelcomeFlag::IgnoreMemFaults );
m_ignoreFrameEndFaults = welcome.flags & WelcomeFlag::OnDemand;
m_data.cpuArch = (CpuArchitecture)welcome.cpuArch;
m_codeTransfer = welcome.flags & WelcomeFlag::CodeTransfer;
@@ -2790,7 +2859,15 @@ void Worker::Exec()
const char* end = ptr + netbuf.size;
{
std::lock_guard<std::mutex> lock( m_data.lock );
std::unique_lock<std::mutex> lk( m_data.lock );
if( m_data.mainThreadWantsLock )
{
// Hand over the lock to the main thread to avoid starving it.
// Wait for a millisecond maximum to avoid the opposite
// problem where main thread would never let us execute
m_data.lockCv.wait_for( lk, std::chrono::milliseconds( 1 ) );
}
while( ptr < end )
{
auto ev = (const QueueItem*)ptr;
@@ -3155,6 +3232,16 @@ void Worker::QueryDataTransfer( const void* ptr, size_t size )
}
}
// Request symbol information for a callstack frame address from the client.
// Deduplicates: each packed address is queried at most once; the map entry is
// created immediately (with a null payload) so repeat calls become no-ops
// before the reply arrives.
void Worker::QueryCallstackFrame( uint64_t addr )
{
    const auto key = PackPointer( addr );
    if( !m_data.callstackFrameMap.contains( key ) )
    {
        m_pendingCallstackFrames++;
        m_data.callstackFrameMap.emplace( key, nullptr );
        Query( ServerQueryCallstackFrame, addr );
    }
}
bool Worker::DispatchProcess( const QueueItem& ev, const char*& ptr )
{
if( ev.hdr.idx >= (int)QueueType::StringData )
@@ -3310,6 +3397,9 @@ int16_t Worker::ShrinkSourceLocationReal( uint64_t srcloc )
int16_t Worker::NewShrinkedSourceLocation( uint64_t srcloc )
{
assert( m_data.sourceLocationExpand.size() < std::numeric_limits<int16_t>::max() );
if( ( m_data.sourceLocationExpand.size() + 1 ) == std::numeric_limits<int16_t>::max() )
return std::numeric_limits<int16_t>::max();
const auto sz = int16_t( m_data.sourceLocationExpand.size() );
m_data.sourceLocationExpand.push_back( srcloc );
#ifndef TRACY_NO_STATISTICS
@@ -3822,7 +3912,7 @@ void Worker::AddSymbolCode( uint64_t ptr, const char* data, size_t sz )
rval = cs_open( CS_ARCH_ARM, CS_MODE_ARM, &handle );
break;
case CpuArchArm64:
rval = cs_open( CS_ARCH_ARM64, CS_MODE_ARM, &handle );
rval = cs_open( CS_ARCH_AARCH64, CS_MODE_ARM, &handle );
break;
default:
assert( false );
@@ -3838,11 +3928,7 @@ void Worker::AddSymbolCode( uint64_t ptr, const char* data, size_t sz )
{
const auto& op = insn[i];
const auto addr = op.address;
if( m_data.callstackFrameMap.find( PackPointer( addr ) ) == m_data.callstackFrameMap.end() )
{
m_pendingCallstackFrames++;
Query( ServerQueryCallstackFrame, addr );
}
QueryCallstackFrame( addr );
uint64_t callAddr = 0;
const auto& detail = *op.detail;
@@ -3866,9 +3952,9 @@ void Worker::AddSymbolCode( uint64_t ptr, const char* data, size_t sz )
}
break;
case CpuArchArm64:
if( detail.arm64.op_count == 1 && detail.arm64.operands[0].type == ARM64_OP_IMM )
if( detail.aarch64.op_count == 1 && detail.aarch64.operands[0].type == AARCH64_OP_IMM )
{
callAddr = (uint64_t)detail.arm64.operands[0].imm;
callAddr = (uint64_t)detail.aarch64.operands[0].imm;
}
break;
default:
@@ -3878,11 +3964,7 @@ void Worker::AddSymbolCode( uint64_t ptr, const char* data, size_t sz )
if( callAddr != 0 ) break;
}
}
if( callAddr != 0 && m_data.callstackFrameMap.find( PackPointer( callAddr ) ) == m_data.callstackFrameMap.end() )
{
m_pendingCallstackFrames++;
Query( ServerQueryCallstackFrame, callAddr );
}
if( callAddr != 0 ) QueryCallstackFrame( callAddr );
}
cs_free( insn, cnt );
}
@@ -3947,12 +4029,7 @@ void Worker::AddCallstackPayload( const char* _data, size_t _sz )
for( auto& frame : *arr )
{
auto fit = m_data.callstackFrameMap.find( frame );
if( fit == m_data.callstackFrameMap.end() )
{
m_pendingCallstackFrames++;
Query( ServerQueryCallstackFrame, GetCanonicalPointer( frame ) );
}
QueryCallstackFrame( GetCanonicalPointer( frame ) );
}
}
else
@@ -4040,12 +4117,7 @@ void Worker::AddCallstackAllocPayload( const char* data )
for( auto& frame : *arr )
{
auto fit = m_data.callstackFrameMap.find( frame );
if( fit == m_data.callstackFrameMap.end() )
{
m_pendingCallstackFrames++;
Query( ServerQueryCallstackFrame, GetCanonicalPointer( frame ) );
}
QueryCallstackFrame( GetCanonicalPointer( frame ) );
}
}
else
@@ -4133,10 +4205,10 @@ void Worker::DoPostponedSymbols()
{
if( m_data.newSymbolsIndex >= 0 )
{
#ifdef NO_PARALLEL_SORT
#ifdef __EMSCRIPTEN__
pdqsort_branchless( m_data.symbolLoc.begin() + m_data.newSymbolsIndex, m_data.symbolLoc.end(), [] ( const auto& l, const auto& r ) { return l.addr < r.addr; } );
#else
std::sort( std::execution::par_unseq, m_data.symbolLoc.begin() + m_data.newSymbolsIndex, m_data.symbolLoc.end(), [] ( const auto& l, const auto& r ) { return l.addr < r.addr; } );
ppqsort::sort( ppqsort::execution::par, m_data.symbolLoc.begin() + m_data.newSymbolsIndex, m_data.symbolLoc.end(), [] ( const auto& l, const auto& r ) { return l.addr < r.addr; } );
#endif
const auto ms = std::lower_bound( m_data.symbolLoc.begin(), m_data.symbolLoc.begin() + m_data.newSymbolsIndex, m_data.symbolLoc[m_data.newSymbolsIndex], [] ( const auto& l, const auto& r ) { return l.addr < r.addr; } );
std::inplace_merge( ms, m_data.symbolLoc.begin() + m_data.newSymbolsIndex, m_data.symbolLoc.end(), [] ( const auto& l, const auto& r ) { return l.addr < r.addr; } );
@@ -4148,10 +4220,10 @@ void Worker::DoPostponedInlineSymbols()
{
if( m_data.newInlineSymbolsIndex >= 0 )
{
#ifdef NO_PARALLEL_SORT
#ifdef __EMSCRIPTEN__
pdqsort_branchless( m_data.symbolLocInline.begin() + m_data.newInlineSymbolsIndex, m_data.symbolLocInline.end() );
#else
std::sort( std::execution::par_unseq, m_data.symbolLocInline.begin() + m_data.newInlineSymbolsIndex, m_data.symbolLocInline.end() );
ppqsort::sort( ppqsort::execution::par, m_data.symbolLocInline.begin() + m_data.newInlineSymbolsIndex, m_data.symbolLocInline.end() );
#endif
const auto ms = std::lower_bound( m_data.symbolLocInline.begin(), m_data.symbolLocInline.begin() + m_data.newInlineSymbolsIndex, m_data.symbolLocInline[m_data.newInlineSymbolsIndex] );
std::inplace_merge( ms, m_data.symbolLocInline.begin() + m_data.newInlineSymbolsIndex, m_data.symbolLocInline.end() );
@@ -4577,6 +4649,12 @@ bool Worker::Process( const QueueItem& ev )
case QueueType::GpuContextName:
ProcessGpuContextName( ev.gpuContextName );
break;
case QueueType::GpuAnnotationName:
ProcessGpuAnnotationName( ev.gpuAnnotationName );
break;
case QueueType::GpuZoneAnnotation:
ProcessGpuZoneAnnotation( ev.zoneAnnotation );
break;
case QueueType::MemAlloc:
ProcessMemAlloc( ev.memAlloc );
break;
@@ -4601,6 +4679,12 @@ bool Worker::Process( const QueueItem& ev )
case QueueType::MemFreeCallstackNamed:
ProcessMemFreeCallstackNamed( ev.memFree );
break;
case QueueType::MemDiscard:
ProcessMemDiscard( ev.memDiscard );
break;
case QueueType::MemDiscardCallstack:
ProcessMemDiscardCallstack( ev.memDiscard );
break;
case QueueType::CallstackSerial:
ProcessCallstackSerial();
break;
@@ -5069,7 +5153,7 @@ void Worker::ProcessFrameMarkStart( const QueueFrameMark& ev )
void Worker::ProcessFrameMarkEnd( const QueueFrameMark& ev )
{
auto fd = m_data.frames.Retrieve( ev.name, [this] ( uint64_t name ) -> FrameData* {
auto fd = m_data.frames.Retrieve( ev.name, [] ( uint64_t name ) -> FrameData* {
return nullptr;
}, [this] ( uint64_t name ) {
Query( ServerQueryFrameName, name );
@@ -5278,8 +5362,8 @@ void Worker::ProcessZoneColor( const QueueZoneColor& ev )
void Worker::ProcessZoneValue( const QueueZoneValue& ev )
{
char tmp[32];
const auto tsz = sprintf( tmp, "%" PRIu64, ev.value );
char tmp[64];
const auto tsz = sprintf( tmp, "%" PRIu64 " [0x%" PRIx64 "]", ev.value, ev.value );
auto td = RetrieveThread( m_threadCtx );
if( !td )
@@ -5707,6 +5791,7 @@ void Worker::ProcessGpuZoneBeginImplCommon( GpuEvent* zone, const QueueGpuZoneBe
zone->SetGpuEnd( -1 );
zone->callstack.SetVal( 0 );
zone->SetChild( -1 );
zone->query_id = ev.queryId;
uint64_t ztid;
if( ctx->thread == 0 )
@@ -5930,7 +6015,7 @@ void Worker::ProcessGpuCalibration( const QueueGpuCalibration& ev )
ctx->calibratedGpuTime = gpuTime;
ctx->calibratedCpuTime = TscTime( ev.cpuTime );
}
void Worker::ProcessGpuTimeSync( const QueueGpuTimeSync& ev )
{
auto ctx = m_gpuCtxMap[ev.context];
@@ -5962,6 +6047,26 @@ void Worker::ProcessGpuContextName( const QueueGpuContextName& ev )
ctx->name = StringIdx( idx );
}
// Store the name (string payload of this event) for a GPU annotation id on
// the given GPU context, so later per-zone annotation values can be labeled.
void Worker::ProcessGpuAnnotationName( const QueueGpuAnnotationName& ev )
{
    auto ctx = m_gpuCtxMap[ev.context];
    assert( ctx );
    ctx->noteNames[ev.noteId] = StringIdx( GetSingleStringIdx() );
}
// Record one annotation value for a GPU zone. Values are grouped per query id
// in ctx->notes, keyed by annotation (note) id within each group.
void Worker::ProcessGpuZoneAnnotation( const QueueGpuZoneAnnotation& ev )
{
    auto ctx = m_gpuCtxMap[ev.context];
    assert( ctx );
    auto it = ctx->notes.find( ev.queryId );
    if( it == ctx->notes.end() )
    {
        // First annotation seen for this query: create the per-query map and
        // pre-size it for the number of known annotation names.
        it = ctx->notes.emplace( ev.queryId, decltype( ctx->notes )::mapped_type {} ).first;
        it->second.reserve( ctx->noteNames.size() );
    }
    it->second[ev.noteId] = ev.value;
}
MemEvent* Worker::ProcessMemAllocImpl( MemData& memdata, const QueueMemAlloc& ev )
{
if( memdata.active.find( ev.ptr ) != memdata.active.end() )
@@ -6130,6 +6235,65 @@ void Worker::ProcessMemFreeCallstackNamed( const QueueMemFree& ev )
m_serialNextCallstack = 0;
}
void Worker::ProcessMemDiscard( const QueueMemDiscard& ev )
{
assert( m_memNamePayload == 0 );
auto it = m_data.memNameMap.find( ev.name );
if( it == m_data.memNameMap.end() ) return;
const auto refTime = RefTime( m_refTimeSerial, ev.time );
auto& memdata = *it->second;
const auto time = TscTime( refTime );
if( m_data.lastTime < time ) m_data.lastTime = time;
NoticeThread( ev.thread );
const auto thread = CompressThread( ev.thread );
for( auto& v : memdata.active )
{
memdata.frees.push_back( v.second );
auto& mem = memdata.data[v.second];
mem.SetTimeThreadFree( time, thread );
memdata.usage -= mem.Size();
MemAllocChanged( memdata, time );
}
memdata.active.clear();
assert( memdata.usage == 0 );
}
void Worker::ProcessMemDiscardCallstack( const QueueMemDiscard& ev )
{
assert( m_serialNextCallstack != 0 );
auto cs = m_serialNextCallstack;
m_serialNextCallstack = 0;
assert( m_memNamePayload == 0 );
auto it = m_data.memNameMap.find( ev.name );
if( it == m_data.memNameMap.end() ) return;
const auto refTime = RefTime( m_refTimeSerial, ev.time );
auto& memdata = *it->second;
const auto time = TscTime( refTime );
if( m_data.lastTime < time ) m_data.lastTime = time;
NoticeThread( ev.thread );
const auto thread = CompressThread( ev.thread );
for( auto& v : memdata.active )
{
memdata.frees.push_back( v.second );
auto& mem = memdata.data[v.second];
mem.SetTimeThreadFree( time, thread );
mem.csFree.SetVal( cs );
memdata.usage -= mem.Size();
MemAllocChanged( memdata, time );
}
memdata.active.clear();
assert( memdata.usage == 0 );
}
void Worker::ProcessCallstackSerial()
{
assert( m_pendingCallstackId != 0 );
@@ -6414,7 +6578,7 @@ void Worker::ProcessCallstackFrameSize( const QueueCallstackFrameSize& ev )
// Frames may be duplicated due to recursion
auto fmit = m_data.callstackFrameMap.find( PackPointer( ev.ptr ) );
if( fmit == m_data.callstackFrameMap.end() )
if( !fmit->second )
{
m_callstackFrameStaging = m_slab.Alloc<CallstackFrameData>();
m_callstackFrameStaging->size = ev.size;
@@ -6527,8 +6691,10 @@ void Worker::ProcessCallstackFrame( const QueueCallstackFrame& ev, bool querySym
if( --m_pendingCallstackSubframes == 0 )
{
assert( m_data.callstackFrameMap.find( frameId ) == m_data.callstackFrameMap.end() );
m_data.callstackFrameMap.emplace( frameId, m_callstackFrameStaging );
auto fit = m_data.callstackFrameMap.find( frameId );
assert( fit != m_data.callstackFrameMap.end() );
assert( !fit->second );
fit->second = m_callstackFrameStaging;
m_data.codeSymbolMap.emplace( m_callstackFrameStagingPtr, m_callstackFrameStaging->data[0].symAddr );
m_callstackFrameStaging = nullptr;
}
@@ -6683,9 +6849,12 @@ void Worker::ProcessContextSwitch( const QueueContextSwitch& ev )
auto& item = data.back();
assert( item.Start() <= time );
assert( item.End() == -1 );
//TODO: It may happen that events are being dropped (for example due to breaking in the debugger, or we are simply too slow to handle the events)
// We should handle this properly in some way, but it is unclear how. We can't even really detect it properly here other than when cpu doesn't match.
// Something could be displayed onscreen when gaps are detected at the event ringbuffer level?
item.SetEnd( time );
item.SetReason( ev.reason );
item.SetState( ev.state );
item.SetReason( ev.oldThreadWaitReason );
item.SetState( ev.oldThreadState );
const auto dt = time - item.Start();
it->second->runningTime += dt;
@@ -6733,6 +6902,25 @@ void Worker::ProcessContextSwitch( const QueueContextSwitch& ev )
}
item = &data.push_next();
item->SetWakeup( time );
item->SetWakeupCpu( ev.cpu );
if ( it->second->pendingWakeUp.time != 0 )
{
auto wakeupTime = it->second->pendingWakeUp.time;
if ( data.size() > 1 )
{
// Sometimes the OS tells us it scheduled a thread that was still alive but on the
// verge of being switched out. We thus end up with `wakeup < switchout`.
// So instead, compare with the previous wakeup.
const auto previousWakeup = data[data.size() - 2].WakeupVal();
if ( previousWakeup <= wakeupTime && wakeupTime <= time )
{
item->SetWakeup( wakeupTime );
item->SetWakeupCpu( it->second->pendingWakeUp.cpu );
it->second->pendingWakeUp.time = 0;
}
}
}
}
item->SetStart( time );
item->SetEnd( -1 );
@@ -6772,15 +6960,32 @@ void Worker::ProcessThreadWakeup( const QueueThreadWakeup& ev )
it = m_data.ctxSwitch.emplace( ev.thread, ctx ).first;
}
auto& data = it->second->v;
if( !data.empty() && !data.back().IsEndValid() ) return; // wakeup of a running thread
auto& item = data.push_next();
item.SetWakeup( time );
item.SetStart( time );
item.SetEnd( -1 );
item.SetCpu( 0 );
item.SetReason( ContextSwitchData::Wakeup );
item.SetState( -1 );
item.SetThread( 0 );
if( !data.empty() && !data.back().IsEndValid() )
{
// We received the wakeup before thread switches out. This can actually happen!
// So instead of dropping the information, keep the last one around so that we
// may fetch it once the thread actually switches out.
// We rely on the fact we won't get another one in the meantime.
auto& item = data.back();
it->second->pendingWakeUp.time = time;
it->second->pendingWakeUp.cpu = ev.cpu;
return;
}
else
{
auto& item = data.push_next();
item.SetWakeupCpu( ev.cpu );
item.SetWakeup( time );
item.SetStart( time );
item.SetEnd( -1 );
item.SetCpu( 0 );
item.SetReason( ContextSwitchData::Wakeup );
item.SetState( -1 );
item.SetThread( 0 );
//TODO: Adjust reason + adjust count instead of thread which is unused?
// Adjust Reason 1 => Unwait
// Adjust Reason 2 => Boost
}
}
void Worker::ProcessTidToPid( const QueueTidToPid& ev )
@@ -6859,13 +7064,17 @@ void Worker::ProcessSourceCodeNotAvailable( const QueueSourceCodeNotAvailable& e
void Worker::ProcessCpuTopology( const QueueCpuTopology& ev )
{
auto package = m_data.cpuTopology.find( ev.package );
if( package == m_data.cpuTopology.end() ) package = m_data.cpuTopology.emplace( ev.package, unordered_flat_map<uint32_t, std::vector<uint32_t>> {} ).first;
auto core = package->second.find( ev.core );
if( core == package->second.end() ) core = package->second.emplace( ev.core, std::vector<uint32_t> {} ).first;
if( package == m_data.cpuTopology.end() ) package = m_data.cpuTopology.emplace( ev.package, unordered_flat_map<uint32_t, unordered_flat_map<uint32_t, std::vector<uint32_t>>> {} ).first;
auto die = package->second.find( ev.die );
if( die == package->second.end() ) die = package->second.emplace( ev.die, unordered_flat_map<uint32_t, std::vector<uint32_t>> {} ).first;
auto core = die->second.find( ev.core );
if( core == die->second.end() ) core = die->second.emplace( ev.core, std::vector<uint32_t> {} ).first;
core->second.emplace_back( ev.thread );
assert( m_data.cpuTopologyMap.find( ev.thread ) == m_data.cpuTopologyMap.end() );
m_data.cpuTopologyMap.emplace( ev.thread, CpuThreadTopology { ev.package, ev.core } );
m_data.cpuTopologyMap.emplace( ev.thread, CpuThreadTopology { ev.package, ev.die, ev.core } );
}
void Worker::ProcessMemNamePayload( const QueueMemNamePayload& ev )
@@ -6922,9 +7131,13 @@ void Worker::ProcessFiberEnter( const QueueFiberEnter& ev )
}
auto& data = cit->second->v;
auto& item = data.push_next();
item.SetStartCpu( t, 0 );
item.SetStart( t );
item.SetCpu( 0 );
item.SetWakeup( t );
item.SetEndReasonState( -1, ContextSwitchData::Fiber, -1 );
item.SetWakeupCpu( 0 );
item.SetEnd( -1 );
item.SetReason( ContextSwitchData::Fiber );
item.SetState( -1 );
item.SetThread( CompressThread( ev.thread ) );
}
@@ -6991,10 +7204,10 @@ void Worker::CreateMemAllocPlot( MemData& memdata )
void Worker::ReconstructMemAllocPlot( MemData& mem )
{
#ifdef NO_PARALLEL_SORT
#ifdef __EMSCRIPTEN__
pdqsort_branchless( mem.frees.begin(), mem.frees.end(), [&mem] ( const auto& lhs, const auto& rhs ) { return mem.data[lhs].TimeFree() < mem.data[rhs].TimeFree(); } );
#else
std::sort( std::execution::par_unseq, mem.frees.begin(), mem.frees.end(), [&mem] ( const auto& lhs, const auto& rhs ) { return mem.data[lhs].TimeFree() < mem.data[rhs].TimeFree(); } );
ppqsort::sort( ppqsort::execution::par, mem.frees.begin(), mem.frees.end(), [&mem] ( const auto& lhs, const auto& rhs ) { return mem.data[lhs].TimeFree() < mem.data[rhs].TimeFree(); } );
#endif
const auto psz = mem.data.size() + mem.frees.size() + 1;
@@ -7440,14 +7653,14 @@ int64_t Worker::ReadTimelineHaveSize( FileRead& f, ZoneEvent* zone, int64_t refT
}
}
void Worker::ReadTimeline( FileRead& f, GpuEvent* zone, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx )
void Worker::ReadTimeline( FileRead& f, GpuEvent* zone, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx, bool hasQueryId )
{
uint64_t sz;
f.Read( sz );
ReadTimelineHaveSize( f, zone, refTime, refGpuTime, childIdx, sz );
ReadTimelineHaveSize( f, zone, refTime, refGpuTime, childIdx, sz, hasQueryId );
}
void Worker::ReadTimelineHaveSize( FileRead& f, GpuEvent* zone, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx, uint64_t sz )
void Worker::ReadTimelineHaveSize( FileRead& f, GpuEvent* zone, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx, uint64_t sz, bool hasQueryId )
{
if( sz == 0 )
{
@@ -7458,7 +7671,7 @@ void Worker::ReadTimelineHaveSize( FileRead& f, GpuEvent* zone, int64_t& refTime
const auto idx = childIdx;
childIdx++;
zone->SetChild( idx );
ReadTimeline( f, m_data.gpuChildren[idx], sz, refTime, refGpuTime, childIdx );
ReadTimeline( f, m_data.gpuChildren[idx], sz, refTime, refGpuTime, childIdx, hasQueryId );
}
}
@@ -7600,7 +7813,7 @@ int64_t Worker::ReadTimeline( FileRead& f, Vector<short_ptr<ZoneEvent>>& _vec, u
return refTime;
}
void Worker::ReadTimeline( FileRead& f, Vector<short_ptr<GpuEvent>>& _vec, uint64_t size, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx )
void Worker::ReadTimeline( FileRead& f, Vector<short_ptr<GpuEvent>>& _vec, uint64_t size, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx, bool hasQueryId )
{
assert( size != 0 );
const auto lp = s_loadProgress.subProgress.load( std::memory_order_relaxed );
@@ -7624,13 +7837,14 @@ void Worker::ReadTimeline( FileRead& f, Vector<short_ptr<GpuEvent>>& _vec, uint6
zone->SetCpuStart( refTime );
zone->SetGpuStart( refGpuTime );
ReadTimelineHaveSize( f, zone, refTime, refGpuTime, childIdx, childSz );
ReadTimelineHaveSize( f, zone, refTime, refGpuTime, childIdx, childSz, hasQueryId );
f.Read2( tcpu, tgpu );
refTime += tcpu;
refGpuTime += tgpu;
zone->SetCpuEnd( refTime );
zone->SetGpuEnd( refGpuTime );
if( hasQueryId ) f.Read( zone->query_id );
}
while( ++zone != end );
}
@@ -7663,7 +7877,6 @@ void Worker::Write( FileWrite& f, bool fiDict )
f.Write( FileHeader, sizeof( FileHeader ) );
f.Write( &m_delay, sizeof( m_delay ) );
f.Write( &m_resolution, sizeof( m_resolution ) );
f.Write( &m_timerMul, sizeof( m_timerMul ) );
f.Write( &m_data.lastTime, sizeof( m_data.lastTime ) );
@@ -7699,14 +7912,20 @@ void Worker::Write( FileWrite& f, bool fiDict )
sz = package.second.size();
f.Write( &package.first, sizeof( package.first ) );
f.Write( &sz, sizeof( sz ) );
for( auto& core : package.second )
for( auto& die : package.second )
{
sz = core.second.size();
f.Write( &core.first, sizeof( core.first ) );
sz = die.second.size();
f.Write( &die.first, sizeof( die.first ) );
f.Write( &sz, sizeof( sz ) );
for( auto& thread : core.second )
for( auto& core : die.second )
{
f.Write( &thread, sizeof( thread ) );
sz = core.second.size();
f.Write( &core.first, sizeof( core.first ) );
f.Write( &sz, sizeof( sz ) );
for( auto& thread : core.second )
{
f.Write( &thread, sizeof( thread ) );
}
}
}
}
@@ -7930,10 +8149,10 @@ void Worker::Write( FileWrite& f, bool fiDict )
}
if( m_inconsistentSamples )
{
#ifdef NO_PARALLEL_SORT
#ifdef __EMSCRIPTEN__
pdqsort_branchless( thread->samples.begin(), thread->samples.end(), [] ( const auto& lhs, const auto& rhs ) { return lhs.time.Val() < rhs.time.Val(); } );
#else
std::sort( std::execution::par_unseq, thread->samples.begin(), thread->samples.end(), [] ( const auto& lhs, const auto& rhs ) { return lhs.time.Val() < rhs.time.Val(); } );
ppqsort::sort( ppqsort::execution::par, thread->samples.begin(), thread->samples.end(), [] ( const auto& lhs, const auto& rhs ) { return lhs.time.Val() < rhs.time.Val(); } );
#endif
}
sz = thread->samples.size();
@@ -7963,6 +8182,13 @@ void Worker::Write( FileWrite& f, bool fiDict )
f.Write( &ctx->type, sizeof( ctx->type ) );
f.Write( &ctx->name, sizeof( ctx->name ) );
f.Write( &ctx->overflow, sizeof( ctx->overflow ) );
sz = ctx->noteNames.size();
f.Write( &sz, sizeof( sz ) );
for( auto& p : ctx->noteNames )
{
f.Write( &p.first, sizeof( p.first ) );
f.Write( &p.second, sizeof( p.second ) );
}
sz = ctx->threadData.size();
f.Write( &sz, sizeof( sz ) );
for( auto& td : ctx->threadData )
@@ -7973,6 +8199,20 @@ void Worker::Write( FileWrite& f, bool fiDict )
f.Write( &tid, sizeof( tid ) );
WriteTimeline( f, td.second.timeline, refTime, refGpuTime );
}
sz = ctx->notes.size();
f.Write( &sz, sizeof( sz ) );
for( auto& notes : ctx->notes )
{
f.Write( &notes.first, sizeof( notes.first ) );
sz = notes.second.size();
f.Write( &sz, sizeof( sz ) );
for( auto& note : notes.second )
{
f.Write( &note.first, sizeof( note.first ) );
f.Write( &note.second, sizeof( note.second ) );
}
}
}
sz = m_data.plots.Data().size();
@@ -8057,15 +8297,19 @@ void Worker::Write( FileWrite& f, bool fiDict )
f.Write( cs->data(), sizeof( CallstackFrameId ) * csz );
}
sz = m_data.callstackFrameMap.size();
sz = m_data.callstackFrameMap.size() - m_pendingCallstackFrames;
f.Write( &sz, sizeof( sz ) );
uint64_t check = 0;
for( auto& frame : m_data.callstackFrameMap )
{
if( !frame.second ) continue;
f.Write( &frame.first, sizeof( CallstackFrameId ) );
f.Write( &frame.second->size, sizeof( frame.second->size ) );
f.Write( &frame.second->imageName, sizeof( frame.second->imageName ) );
f.Write( frame.second->data, sizeof( CallstackFrame ) * frame.second->size );
check++;
}
assert( check == sz );
sz = m_data.appInfo.size();
f.Write( &sz, sizeof( sz ) );
@@ -8171,6 +8415,7 @@ void Worker::Write( FileWrite& f, bool fiDict )
WriteTimeOffset( f, refTime, cs.Start() );
WriteTimeOffset( f, refTime, cs.End() );
uint8_t cpu = cs.Cpu();
uint8_t wakeupcpu = cs.WakeupCpu();
int8_t reason = cs.Reason();
int8_t state = cs.State();
uint64_t thread = DecompressThread( cs.Thread() );
@@ -8178,6 +8423,7 @@ void Worker::Write( FileWrite& f, bool fiDict )
f.Write( &reason, sizeof( reason ) );
f.Write( &state, sizeof( state ) );
f.Write( &thread, sizeof( thread ) );
f.Write( &wakeupcpu, sizeof( wakeupcpu ) );
}
}
@@ -8347,6 +8593,7 @@ void Worker::WriteTimelineImpl( FileWrite& f, const V& vec, int64_t& refTime, in
WriteTimeOffset( f, refTime, v.CpuEnd() );
WriteTimeOffset( f, refGpuTime, v.GpuEnd() );
f.Write( &v.query_id, sizeof( v.query_id ) );
}
}
+50 -19
View File
@@ -126,7 +126,7 @@ public:
struct ZoneThreadData
{
tracy_force_inline ZoneEvent* Zone() const { return (ZoneEvent*)( _zone_thread >> 16 ); }
tracy_force_inline void SetZone( ZoneEvent* zone ) { assert( ( uint64_t( zone ) & 0xFFFF000000000000 ) == 0 ); memcpy( ((char*)&_zone_thread)+2, &zone, 4 ); memcpy( ((char*)&_zone_thread)+6, ((char*)&zone)+4, 2 ); }
tracy_force_inline void SetZone( ZoneEvent* zone ) { auto z64 = (uint64_t)zone; assert( ( z64 & 0xFFFF000000000000 ) == 0 ); memcpy( ((char*)&_zone_thread)+2, &z64, 4 ); memcpy( ((char*)&_zone_thread)+6, ((char*)&z64)+4, 2 ); }
tracy_force_inline uint16_t Thread() const { return uint16_t( _zone_thread & 0xFFFF ); }
tracy_force_inline void SetThread( uint16_t thread ) { memcpy( &_zone_thread, &thread, 2 ); }
@@ -137,7 +137,7 @@ public:
struct GpuZoneThreadData
{
tracy_force_inline GpuEvent* Zone() const { return (GpuEvent*)( _zone_thread >> 16 ); }
tracy_force_inline void SetZone( GpuEvent* zone ) { assert( ( uint64_t( zone ) & 0xFFFF000000000000 ) == 0 ); memcpy( ((char*)&_zone_thread)+2, &zone, 4 ); memcpy( ((char*)&_zone_thread)+6, ((char*)&zone)+4, 2 ); }
tracy_force_inline void SetZone( GpuEvent* zone ) { auto z64 = (uint64_t)zone; assert( ( z64 & 0xFFFF000000000000 ) == 0 ); memcpy( ((char*)&_zone_thread)+2, &z64, 4 ); memcpy( ((char*)&_zone_thread)+6, ((char*)&z64)+4, 2 ); }
tracy_force_inline uint16_t Thread() const { return uint16_t( _zone_thread & 0xFFFF ); }
tracy_force_inline void SetThread( uint16_t thread ) { memcpy( &_zone_thread, &thread, 2 ); }
@@ -148,6 +148,7 @@ public:
// Coordinates of a logical CPU thread within the hardware topology
// (package -> die -> core), filled in by ProcessCpuTopology.
struct CpuThreadTopology
{
uint32_t package;  // physical package (socket) index
uint32_t die;      // die index within the package
uint32_t core;     // core index within the die
};
@@ -197,7 +198,7 @@ public:
private:
struct SourceLocationZones
{
// Orders zone records by their zone's start timestamp. The call operator is
// const-qualified so the comparator works with const containers/algorithms.
struct ZtdSort { bool operator()( const ZoneThreadData& lhs, const ZoneThreadData& rhs ) const { return lhs.Zone()->Start() < rhs.Zone()->Start(); } };
SortedVector<ZoneThreadData, ZtdSort> zones;
int64_t min = std::numeric_limits<int64_t>::max();
@@ -216,7 +217,7 @@ private:
struct GpuSourceLocationZones
{
// Orders GPU zone records by their zone's GPU start timestamp. The call
// operator is const-qualified so it works with const containers/algorithms.
struct GpuZtdSort { bool operator()( const GpuZoneThreadData& lhs, const GpuZoneThreadData& rhs ) const { return lhs.Zone()->GpuStart() < rhs.Zone()->GpuStart(); } };
SortedVector<GpuZoneThreadData, GpuZtdSort> zones;
int64_t min = std::numeric_limits<int64_t>::max();
@@ -276,6 +277,8 @@ private:
struct DataBlock
{
std::atomic<bool> mainThreadWantsLock = false;
std::condition_variable lockCv;
std::mutex lock;
StringDiscovery<FrameData*> frames;
FrameData* framesBase;
@@ -392,7 +395,7 @@ private:
bool ctxUsageReady = false;
#endif
unordered_flat_map<uint32_t, unordered_flat_map<uint32_t, std::vector<uint32_t>>> cpuTopology;
unordered_flat_map<uint32_t, unordered_flat_map<uint32_t, unordered_flat_map<uint32_t, std::vector<uint32_t>>>> cpuTopology;
unordered_flat_map<uint32_t, CpuThreadTopology> cpuTopologyMap;
unordered_flat_map<uint64_t, MemoryBlock> symbolCode;
@@ -464,7 +467,6 @@ public:
uint64_t GetCaptureTime() const { return m_captureTime; }
uint64_t GetExecutableTime() const { return m_executableTime; }
const std::string& GetHostInfo() const { return m_hostInfo; }
int64_t GetDelay() const { return m_delay; }
int64_t GetResolution() const { return m_resolution; }
uint64_t GetPid() const { return m_pid; };
CpuArchitecture GetCpuArch() const { return m_data.cpuArch; }
@@ -472,6 +474,31 @@ public:
const char* GetCpuManufacturer() const { return m_data.cpuManufacturer; }
std::mutex& GetDataLock() { return m_data.lock; }
// This guard helps prevent main thread starvation by coordinating lock acquisition between
// the main thread and worker threads. It uses an atomic flag (mainThreadWantsLock) to signal
// the main thread's intent to acquire the lock, and a condition variable to notify workers when
// the main thread is done. This prioritization reduces contention and ensures the main thread
// can acquire the lock promptly, especially during critical phases like initialization.
// RAII guard for the main thread's acquisition of the data lock. Raising
// mainThreadWantsLock before blocking on the mutex lets worker threads back
// off; the destructor clears the flag, releases the lock, and wakes one
// waiting worker via lockCv.
struct MainThreadDataLockGuard
{
// Signals intent first, then blocks until the data lock is obtained.
MainThreadDataLockGuard( DataBlock& data )
: m_data( data )
{
m_data.mainThreadWantsLock = true;
m_data.lock.lock();
}
// Clears the intent flag, releases the lock, and notifies a waiting worker.
~MainThreadDataLockGuard()
{
m_data.mainThreadWantsLock = false;
m_data.lock.unlock();
m_data.lockCv.notify_one();
}
// Non-copyable: the guard owns the lock for its lifetime; a copy would
// unlock twice. Guaranteed elision (C++17) still allows returning by value.
MainThreadDataLockGuard( const MainThreadDataLockGuard& ) = delete;
MainThreadDataLockGuard& operator=( const MainThreadDataLockGuard& ) = delete;
private:
DataBlock& m_data;
};
MainThreadDataLockGuard ObtainLockForMainThread() { return { m_data }; }
size_t GetFrameCount( const FrameData& fd ) const { return fd.frames.size(); }
size_t GetFullFrameCount( const FrameData& fd ) const;
bool AreFramesUsed() const;
@@ -492,7 +519,7 @@ public:
uint64_t GetCallstackParentPayloadCount() const { return m_data.parentCallstackPayload.size(); }
uint64_t GetCallstackParentFrameCount() const { return m_callstackParentNextIdx; }
#endif
// Number of fully resolved callstack frames; map entries still awaiting a
// query response (pending, null-valued) are excluded from the count.
uint64_t GetCallstackFrameCount() const { return m_data.callstackFrameMap.size() - m_pendingCallstackFrames; }
uint64_t GetCallstackSampleCount() const { return m_data.samplesCnt; }
uint64_t GetSymbolsCount() const { return m_data.symbolMap.size(); }
uint64_t GetSymbolCodeCount() const { return m_data.symbolCode.size(); }
@@ -688,6 +715,7 @@ private:
void QueryTerminate();
void QuerySourceFile( const char* fn, const char* image );
void QueryDataTransfer( const void* ptr, size_t size );
void QueryCallstackFrame( uint64_t addr );
tracy_force_inline bool DispatchProcess( const QueueItem& ev, const char*& ptr );
tracy_force_inline bool Process( const QueueItem& ev );
@@ -740,14 +768,18 @@ private:
tracy_force_inline void ProcessGpuCalibration( const QueueGpuCalibration& ev );
tracy_force_inline void ProcessGpuTimeSync( const QueueGpuTimeSync& ev );
tracy_force_inline void ProcessGpuContextName( const QueueGpuContextName& ev );
tracy_force_inline void ProcessGpuAnnotationName( const QueueGpuAnnotationName& ev );
tracy_force_inline void ProcessGpuZoneAnnotation( const QueueGpuZoneAnnotation& ev );
tracy_force_inline MemEvent* ProcessMemAlloc( const QueueMemAlloc& ev );
tracy_force_inline MemEvent* ProcessMemAllocNamed( const QueueMemAlloc& ev );
tracy_force_inline MemEvent* ProcessMemFree( const QueueMemFree& ev );
tracy_force_inline MemEvent* ProcessMemFreeNamed( const QueueMemFree& ev );
tracy_force_inline void ProcessMemDiscard( const QueueMemDiscard& ev );
tracy_force_inline void ProcessMemAllocCallstack( const QueueMemAlloc& ev );
tracy_force_inline void ProcessMemAllocCallstackNamed( const QueueMemAlloc& ev );
tracy_force_inline void ProcessMemFreeCallstack( const QueueMemFree& ev );
tracy_force_inline void ProcessMemFreeCallstackNamed( const QueueMemFree& ev );
tracy_force_inline void ProcessMemDiscardCallstack( const QueueMemDiscard& ev );
tracy_force_inline void ProcessCallstackSerial();
tracy_force_inline void ProcessCallstack();
tracy_force_inline void ProcessCallstackSample( const QueueCallstackSample& ev );
@@ -932,8 +964,8 @@ private:
tracy_force_inline int64_t ReadTimeline( FileRead& f, ZoneEvent* zone, int64_t refTime, int32_t& childIdx );
tracy_force_inline int64_t ReadTimelineHaveSize( FileRead& f, ZoneEvent* zone, int64_t refTime, int32_t& childIdx, uint32_t sz );
tracy_force_inline void ReadTimeline( FileRead& f, GpuEvent* zone, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx );
tracy_force_inline void ReadTimelineHaveSize( FileRead& f, GpuEvent* zone, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx, uint64_t sz );
tracy_force_inline void ReadTimeline( FileRead& f, GpuEvent* zone, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx, bool hasQueryId );
tracy_force_inline void ReadTimelineHaveSize( FileRead& f, GpuEvent* zone, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx, uint64_t sz, bool hasQueryId );
#ifndef TRACY_NO_STATISTICS
tracy_force_inline void ReconstructZoneStatistics( uint8_t* countMap, ZoneEvent& zone, uint16_t thread );
@@ -953,7 +985,7 @@ private:
void UpdateMbps( int64_t td );
int64_t ReadTimeline( FileRead& f, Vector<short_ptr<ZoneEvent>>& vec, uint32_t size, int64_t refTime, int32_t& childIdx );
void ReadTimeline( FileRead& f, Vector<short_ptr<GpuEvent>>& vec, uint64_t size, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx );
void ReadTimeline( FileRead& f, Vector<short_ptr<GpuEvent>>& vec, uint64_t size, int64_t& refTime, int64_t& refGpuTime, int32_t& childIdx, bool hasQueryId );
tracy_force_inline void WriteTimeline( FileWrite& f, const Vector<short_ptr<ZoneEvent>>& vec, int64_t& refTime );
tracy_force_inline void WriteTimeline( FileWrite& f, const Vector<short_ptr<GpuEvent>>& vec, int64_t& refTime, int64_t& refGpuTime );
@@ -979,7 +1011,6 @@ private:
std::atomic<bool> m_backgroundDone { true };
std::thread m_threadBackground;
int64_t m_delay;
int64_t m_resolution;
double m_timerMul;
std::string m_captureName;
@@ -1018,14 +1049,14 @@ private:
StringLocation m_pendingSingleString = {};
StringLocation m_pendingSecondString = {};
uint32_t m_pendingStrings;
uint32_t m_pendingThreads;
uint32_t m_pendingFibers;
uint32_t m_pendingExternalNames;
uint32_t m_pendingSourceLocation;
uint32_t m_pendingCallstackFrames;
uint8_t m_pendingCallstackSubframes;
uint32_t m_pendingSymbolCode;
uint32_t m_pendingStrings = 0;
uint32_t m_pendingThreads = 0;
uint32_t m_pendingFibers = 0;
uint32_t m_pendingExternalNames = 0;
uint32_t m_pendingSourceLocation = 0;
uint32_t m_pendingCallstackFrames = 0;
uint8_t m_pendingCallstackSubframes = 0;
uint32_t m_pendingSymbolCode = 0;
CallstackFrameData* m_callstackFrameStaging;
uint64_t m_callstackFrameStagingPtr;