diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index 3ed7211..d2a1f24 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -1,10 +1,11 @@ # Dev-окружение для cuframes — содержит CUDA toolkit (с nvcc), build tools, # линкеры/анализаторы. GPU прокидывается через `--gpus all` на runtime. # -# Base: nvidia/cuda devel-image c CUDA 12.4 + cuDNN 9 на Ubuntu 24.04. +# Base: nvidia/cuda devel-image c CUDA 13.0 + cuDNN на Ubuntu 24.04. # devel-вариант (не runtime) — нужен для компиляции CUDA-кода (nvcc, headers). +# CUDA 13.x — текущая stable линейка с поддержкой sm_120 (Blackwell, RTX 5090). -FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu24.04 +FROM nvidia/cuda:13.0.3-cudnn-devel-ubuntu24.04 # Не запрашивать tzdata interactive при apt ENV DEBIAN_FRONTEND=noninteractive diff --git a/docs/measurements/spike-v2/A-4k30-stream-3840x2160-30fps-consumer.log b/docs/measurements/spike-v2/A-4k30-stream-3840x2160-30fps-consumer.log new file mode 100644 index 0000000..592197e --- /dev/null +++ b/docs/measurements/spike-v2/A-4k30-stream-3840x2160-30fps-consumer.log @@ -0,0 +1,17 @@ +[consumer] key=A-4k30 count=600 +[consumer] connected, 3840x2160 sync=stream + +=== cuframes spike-v2 summary === +scenario: A (stream sync) +frames received: 600 +duration: 19.9625 s +effective fps: 30.0563 +skipped (caught up): 0 +TORN FRAMES: 0 ← ✓ clean + +latency consumer-receive-to-kernel-done (microseconds): + mean: 164 us + p50: 145 us + p95: 206 us + p99: 606 us + max: 4412 us diff --git a/docs/measurements/spike-v2/A-4k30-stream-3840x2160-30fps-producer.log b/docs/measurements/spike-v2/A-4k30-stream-3840x2160-30fps-producer.log new file mode 100644 index 0000000..e69de29 diff --git a/docs/measurements/spike-v2/A-fhd60-stream-1920x1080-60fps-consumer.log b/docs/measurements/spike-v2/A-fhd60-stream-1920x1080-60fps-consumer.log new file mode 100644 index 0000000..2e39f75 --- /dev/null +++ b/docs/measurements/spike-v2/A-fhd60-stream-1920x1080-60fps-consumer.log @@ -0,0 +1,17 @@ +[consumer] key=A-fhd60 count=1500 +[consumer] connected, 1920x1080 sync=stream + +=== cuframes spike-v2 summary === +scenario: A (stream sync) +frames received: 1500 +duration: 24.9799 s +effective fps: 60.0482 +skipped (caught up): 0 +TORN FRAMES: 0 ← ✓ clean + +latency consumer-receive-to-kernel-done (microseconds): + mean: 140 us + p50: 122 us + p95: 187 us + p99: 267 us + max: 14701 us diff --git a/docs/measurements/spike-v2/A-fhd60-stream-1920x1080-60fps-producer.log b/docs/measurements/spike-v2/A-fhd60-stream-1920x1080-60fps-producer.log new file mode 100644 index 0000000..e69de29 diff --git a/docs/measurements/spike-v2/B-4k30-event-3840x2160-30fps-consumer.log b/docs/measurements/spike-v2/B-4k30-event-3840x2160-30fps-consumer.log new file mode 100644 index 0000000..c9cdbdc --- /dev/null +++ b/docs/measurements/spike-v2/B-4k30-event-3840x2160-30fps-consumer.log @@ -0,0 +1,18 @@ +[consumer] key=B-4k30 count=600 +[consumer] connected, 3840x2160 sync=event +[consumer] opened producer's cuda event + +=== cuframes spike-v2 summary === +scenario: B (event sync) +frames received: 600 +duration: 19.9633 s +effective fps: 30.0552 +skipped (caught up): 0 +TORN FRAMES: 0 ← ✓ clean + +latency consumer-receive-to-kernel-done (microseconds): + mean: 184 us + p50: 171 us + p95: 199 us + p99: 437 us + max: 3739 us diff --git a/docs/measurements/spike-v2/B-4k30-event-3840x2160-30fps-producer.log b/docs/measurements/spike-v2/B-4k30-event-3840x2160-30fps-producer.log new file mode 100644 index 0000000..e69de29 diff --git a/docs/measurements/spike-v2/B-fhd60-event-1920x1080-60fps-consumer.log b/docs/measurements/spike-v2/B-fhd60-event-1920x1080-60fps-consumer.log new file mode 100644 index 0000000..018353b --- /dev/null +++ b/docs/measurements/spike-v2/B-fhd60-event-1920x1080-60fps-consumer.log @@ -0,0 +1,18 @@ +[consumer] key=B-fhd60 count=1500 +[consumer] connected, 1920x1080 sync=event +[consumer] opened producer's cuda event + +=== cuframes spike-v2 summary === +scenario: B (event sync) +frames received: 1500 +duration: 24.9784 s +effective fps: 60.0518 +skipped (caught up): 0 +TORN FRAMES: 0 ← ✓ clean + +latency consumer-receive-to-kernel-done (microseconds): + mean: 163 us + p50: 149 us + p95: 187 us + p99: 344 us + max: 5229 us diff --git a/docs/measurements/spike-v2/B-fhd60-event-1920x1080-60fps-producer.log b/docs/measurements/spike-v2/B-fhd60-event-1920x1080-60fps-producer.log new file mode 100644 index 0000000..e69de29 diff --git a/tools/spike-v2/CMakeLists.txt b/tools/spike-v2/CMakeLists.txt new file mode 100644 index 0000000..78d0e01 --- /dev/null +++ b/tools/spike-v2/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 3.20) +project(cuframes_spike_v2 LANGUAGES CXX CUDA) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD 17) + +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES "120") # sm_120 для RTX 5090; добавьте 86/89/90 при необходимости +endif() + +add_executable(spike2_producer producer.cu) +add_executable(spike2_consumer consumer.cu) + +foreach(target spike2_producer spike2_consumer) + target_compile_options(${target} PRIVATE + $<$:-Wall -Wextra -O2 -g> + $<$:-O2 -g -lineinfo> + ) + target_link_libraries(${target} PRIVATE cuda) +endforeach() diff --git a/tools/spike-v2/README.md b/tools/spike-v2/README.md new file mode 100644 index 0000000..5b8404a --- /dev/null +++ b/tools/spike-v2/README.md @@ -0,0 +1,59 @@ +# Phase 0 spike-v2 — kernel-on-consumer-stream torn-frame test + +Архитектор (review 2026-05-15) указал на **R1 / R2** в `docs/architecture.md`: +оригинальный PoC (`tools/spike/`) использовал `cudaMemcpy` на consumer-стороне, +который имеет implicit cross-context barriers. Это маскирует фундаментальную +проблему: **`cudaStreamSynchronize` на producer не достаточен** для гарантии +visibility GPU-writes для consumer'а на **отдельном stream'е**. + +NVIDIA Programming Guide §3.2.8 явно говорит: «Synchronization between producer +and consumer processes is required and the responsibility of the application» — +то есть нужен `cudaIpcEventHandle_t` (cross-process CUDA event), не stream sync. + +## Что проверяем + +Два сценария: + +### Сценарий A: stream sync only (текущий PoC v1 дизайн) + +Producer: +- Fill_Y_kernel записывает `seq + row` в каждый пиксель строки `row` Y-plane +- `cudaStreamSynchronize(producer_stream)` +- Publish seq через atomic в SHM + +Consumer: +- Read seq atomic ACQUIRE +- **Verify_kernel на consumer_stream'е** проверяет что Y-plane имеет ожидаемое + значение во **всех** строках. Если есть строка с другим значением → torn. +- `cudaDeviceSynchronize` чтобы kernel завершился, проверить result. + +Ожидаем: torn frames на high-fps (60+) на FullHD или 4K. + +### Сценарий B: event handle sync (предлагаемый fix) + +Producer: +- `cudaEventCreateWithFlags(cudaEventInterprocess | cudaEventDisableTiming)` +- `cudaIpcGetEventHandle` → handle в SHM +- На каждый publish: `cudaEventRecord(event, producer_stream)` (вместо sync) +- Publish seq + +Consumer: +- На subscribe: `cudaIpcOpenEventHandle` → local event +- На каждый next: `cudaStreamWaitEvent(consumer_stream, event, 0)` — GPU-side wait +- Verify_kernel на том же consumer_stream'е +- Result должен быть без torn frames даже на high resolution + high fps + +## Acceptance + +- A показал torn frames > 0 → R1 confirmed → нужны events +- B показал torn frames == 0 → R2 fix validated +- Latency B vs A — изменения не должно быть значимого (events = GPU-side wait, no CPU sync) + +## Запуск (внутри cuframes-dev container) + +```bash +cd /workspace +cmake -B build-v2 -S tools/spike-v2 -G Ninja +cmake --build build-v2 +./tools/spike-v2/bench.sh +``` diff --git a/tools/spike-v2/bench.sh b/tools/spike-v2/bench.sh new file mode 100755 index 0000000..a62b7b9 --- /dev/null +++ b/tools/spike-v2/bench.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# Phase 0 spike-v2 — validate R1 (sync semantics). +# Runs both scenarios (A: stream sync, B: event sync) на 60 fps × FullHD. +# Если architecture's R1 правилен — Scenario A покажет torn frames > 0. + +set -euo pipefail +cd "$(dirname "$0")/../.." + +if [ ! -x build-v2/spike2_producer ]; then + echo "==> build first: cmake -B build-v2 -S tools/spike-v2 -G Ninja && cmake --build build-v2" + exit 1 +fi + +mkdir -p docs/measurements/spike-v2 + +run_scenario() { + local label="$1" + local sync="$2" + local fps="$3" + local res="$4" + local width="${res%x*}" + local height="${res#*x}" + local count="$5" + + echo + echo "=== Scenario $label: sync=$sync, ${res}@${fps}fps, n=$count ===" + + rm -f /dev/shm/cuframes-v2-* 2>/dev/null || true + + ./build-v2/spike2_producer \ + --key "$label" --width "$width" --height "$height" \ + --fps "$fps" --sync "$sync" --duration 60 \ + > "docs/measurements/spike-v2/${label}-${sync}-${res}-${fps}fps-producer.log" 2>&1 & + local prod_pid=$! + sleep 1 + + ./build-v2/spike2_consumer --key "$label" --count "$count" \ + 2>&1 | tee "docs/measurements/spike-v2/${label}-${sync}-${res}-${fps}fps-consumer.log" \ + | tail -25 || true + + kill "$prod_pid" 2>/dev/null || true + wait "$prod_pid" 2>/dev/null || true +} + +run_scenario "A-fhd60" "stream" 60 "1920x1080" 1500 +run_scenario "B-fhd60" "event" 60 "1920x1080" 1500 +run_scenario "A-4k30" "stream" 30 "3840x2160" 600 +run_scenario "B-4k30" "event" 30 "3840x2160" 600 + +echo +echo "=== Results stored in docs/measurements/spike-v2/ ===" +echo "=== Compare torn_frames count between A and B scenarios ===" diff --git a/tools/spike-v2/common.h b/tools/spike-v2/common.h new file mode 100644 index 0000000..8b42f00 --- /dev/null +++ b/tools/spike-v2/common.h @@ -0,0 +1,67 @@ +// Phase 0 spike-v2 — общие типы. Расширенный относительно spike v1: добавлена +// поддержка cuda IPC event handle (для scenarios B) и pattern-fill per-row +// (для verify внутри кадра). + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace cuframes_spike_v2 { + +constexpr int RING_SIZE = 2; + +// Pattern: pixel [row][col] = (seq * 31 + row * 7) & 0xFF +// Использует разные значения по строкам — позволяет verify обнаружить если часть +// кадра ещё имеет старый seq. +__host__ __device__ inline uint8_t pattern_value(uint64_t seq, int row) { + return static_cast((seq * 31u + row * 7u) & 0xFF); +} + +struct FrameMeta { + int32_t width; + int32_t height; + int32_t pitch_y; +}; + +struct SlotDescriptor { + cudaIpcMemHandle_t mem_handle; + uint64_t producer_seq; + int64_t pts_ns; +}; + +struct SharedHeader { + uint32_t magic; + uint32_t version; + int32_t use_events; // 1 = sync mode B (events), 0 = sync mode A + cudaIpcEventHandle_t event_handle; // valid only if use_events == 1 + FrameMeta meta; + SlotDescriptor slots[RING_SIZE]; + uint64_t global_seq; + // Diagnostics + uint64_t torn_frame_count; // consumer записывает; producer читает для лога +}; + +constexpr uint32_t MAGIC = 0xCC7C2D02u; +constexpr uint32_t VERSION = 2; + +#define CHECK_CUDA(call) do { \ + cudaError_t _err = (call); \ + if (_err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(_err)); \ + std::exit(1); \ + } \ +} while (0) + +static inline int64_t now_ns() { + timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return static_cast(ts.tv_sec) * 1000000000LL + ts.tv_nsec; +} + +} // namespace cuframes_spike_v2 diff --git a/tools/spike-v2/consumer.cu b/tools/spike-v2/consumer.cu new file mode 100644 index 0000000..b159982 --- /dev/null +++ b/tools/spike-v2/consumer.cu @@ -0,0 +1,215 @@ +// spike-v2 consumer. +// +// КЛЮЧЕВОЕ отличие от v1: verify работает через CUDA kernel на ОТДЕЛЬНОМ +// (consumer'ском) stream'е, а не через cudaMemcpy. Это правильно эмулирует +// real-world consumer (PyTorch, OpenCV CUDA) и проверяет sync semantics. +// +// Scenario A (--sync=stream): должны быть torn frames на high-fps +// Scenario B (--sync=event): torn frames должны быть 0 + +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace cuframes_spike_v2; + +// Verify kernel: проверяет что каждая строка имеет pattern(seq, row). +// Записывает кол-во несовпадающих байтов в out_bad_count. +__global__ void verify_pattern(const uint8_t* y, int width, int height, + int pitch_y, uint64_t expected_seq, + int* out_bad_count) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int row = blockIdx.y * blockDim.y + threadIdx.y; + if (x < width && row < height) { + uint8_t expect = pattern_value(expected_seq, row); + uint8_t actual = y[row * pitch_y + x]; + if (actual != expect) { + atomicAdd(out_bad_count, 1); + } + } +} + +struct Args { + std::string key = "A"; + int count = 500; +}; + +static Args parse_args(int argc, char** argv) { + Args a; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + auto next = [&] { return std::string(argv[++i]); }; + if (arg == "--key") a.key = next(); + else if (arg == "--count") a.count = std::stoi(next()); + } + return a; +} + +int main(int argc, char** argv) { + Args args = parse_args(argc, argv); + std::cout << "[consumer] key=" << args.key << " count=" << args.count << "\n"; + + CHECK_CUDA(cudaSetDevice(0)); + + // Open SHM + std::string shm_path = "/dev/shm/cuframes-v2-" + args.key; + int fd = -1; + for (int retry = 0; retry < 50; ++retry) { + fd = open(shm_path.c_str(), O_RDWR); + if (fd >= 0) break; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + if (fd < 0) { + std::cerr << "[consumer] no producer at " << shm_path << "\n"; + return 1; + } + auto* shared = static_cast( + mmap(nullptr, sizeof(SharedHeader), PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0)); + if (shared == MAP_FAILED) { perror("[consumer] mmap"); return 1; } + close(fd); + + if (shared->magic != MAGIC || shared->version != VERSION) { + std::cerr << "[consumer] protocol mismatch magic=" << std::hex + << shared->magic << " ver=" << shared->version << "\n"; + return 1; + } + + bool use_events = shared->use_events != 0; + std::cout << "[consumer] connected, " << shared->meta.width + << "x" << shared->meta.height + << " sync=" << (use_events ? "event" : "stream") << "\n"; + + // Map producer's memory + void* slot_ptrs[RING_SIZE]; + for (int i = 0; i < RING_SIZE; ++i) { + CHECK_CUDA(cudaIpcOpenMemHandle(&slot_ptrs[i], + shared->slots[i].mem_handle, + cudaIpcMemLazyEnablePeerAccess)); + } + + // Map producer's event если sync=event + cudaEvent_t producer_event = nullptr; + if (use_events) { + CHECK_CUDA(cudaIpcOpenEventHandle(&producer_event, + shared->event_handle)); + std::cout << "[consumer] opened producer's cuda event\n"; + } + + // Consumer's own stream — это и есть ключевой момент. + cudaStream_t consumer_stream; + CHECK_CUDA(cudaStreamCreate(&consumer_stream)); + + // Counter в device memory для verify_pattern kernel + int* d_bad_count; + CHECK_CUDA(cudaMalloc(&d_bad_count, sizeof(int))); + + const int width = shared->meta.width; + const int height = shared->meta.height; + const int pitch_y = shared->meta.pitch_y; + + dim3 block(32, 8); + dim3 grid((width + block.x - 1) / block.x, + (height + block.y - 1) / block.y); + + std::vector latencies_ns; + latencies_ns.reserve(args.count); + + uint64_t last_seen = UINT64_MAX; + int frames_received = 0; + int torn_frames = 0; + int skipped_frames = 0; + auto t_start = std::chrono::steady_clock::now(); + + while (frames_received < args.count) { + uint64_t seq = __atomic_load_n(&shared->global_seq, __ATOMIC_ACQUIRE); + if (seq == last_seen || + (last_seen == UINT64_MAX && seq == 0 && + __atomic_load_n(&shared->slots[0].producer_seq, __ATOMIC_ACQUIRE) == 0)) { + std::this_thread::sleep_for(std::chrono::microseconds(50)); + continue; + } + if (last_seen != UINT64_MAX && seq > last_seen + 1) { + skipped_frames += (seq - last_seen - 1); + } + last_seen = seq; + + const int slot_idx = seq % RING_SIZE; + const int64_t pts_ns = __atomic_load_n(&shared->slots[slot_idx].pts_ns, + __ATOMIC_ACQUIRE); + + // *** КЛЮЧЕВОЕ *** + // Если используем events — wait на producer's event перед kernel'ом. + // Если stream-only sync — НЕ ждём (consumer не знает producer's stream). + if (use_events) { + CHECK_CUDA(cudaStreamWaitEvent(consumer_stream, producer_event, 0)); + } + + // Reset bad_count + verify kernel на consumer_stream + CHECK_CUDA(cudaMemsetAsync(d_bad_count, 0, sizeof(int), consumer_stream)); + verify_pattern<<>>( + static_cast(slot_ptrs[slot_idx]), + width, height, pitch_y, seq, d_bad_count); + + // Получить result (это синхронизирует consumer_stream) + int bad_count = 0; + CHECK_CUDA(cudaMemcpyAsync(&bad_count, d_bad_count, sizeof(int), + cudaMemcpyDeviceToHost, consumer_stream)); + CHECK_CUDA(cudaStreamSynchronize(consumer_stream)); + + const int64_t recv_ns = now_ns(); + const int64_t latency_ns = recv_ns - pts_ns; + + if (bad_count > 0) { + torn_frames++; + // Записать в SHM чтобы producer тоже видел + __atomic_fetch_add(&shared->torn_frame_count, 1, __ATOMIC_RELEASE); + } + + latencies_ns.push_back(latency_ns); + frames_received++; + } + + auto t_end = std::chrono::steady_clock::now(); + double duration_sec = std::chrono::duration(t_end - t_start).count(); + + for (int i = 0; i < RING_SIZE; ++i) { + CHECK_CUDA(cudaIpcCloseMemHandle(slot_ptrs[i])); + } + if (producer_event) cudaEventDestroy(producer_event); + cudaFree(d_bad_count); + cudaStreamDestroy(consumer_stream); + munmap(shared, sizeof(SharedHeader)); + + std::sort(latencies_ns.begin(), latencies_ns.end()); + auto pct = [&](double p) -> int64_t { + return latencies_ns[static_cast(latencies_ns.size() * p)]; + }; + int64_t sum = 0; + for (auto v : latencies_ns) sum += v; + + std::cout << "\n=== cuframes spike-v2 summary ===\n"; + std::cout << "scenario: " << (use_events ? "B (event sync)" : "A (stream sync)") << "\n"; + std::cout << "frames received: " << frames_received << "\n"; + std::cout << "duration: " << duration_sec << " s\n"; + std::cout << "effective fps: " << frames_received / duration_sec << "\n"; + std::cout << "skipped (caught up): " << skipped_frames << "\n"; + std::cout << "TORN FRAMES: " << torn_frames + << " ← " << (torn_frames == 0 ? "✓ clean" : "✗ DATA RACE!") << "\n"; + std::cout << "\nlatency consumer-receive-to-kernel-done (microseconds):\n"; + std::cout << " mean: " << sum / 1000 / latencies_ns.size() << " us\n"; + std::cout << " p50: " << pct(0.50) / 1000 << " us\n"; + std::cout << " p95: " << pct(0.95) / 1000 << " us\n"; + std::cout << " p99: " << pct(0.99) / 1000 << " us\n"; + std::cout << " max: " << latencies_ns.back() / 1000 << " us\n"; + + return torn_frames == 0 ? 0 : 2; +} diff --git a/tools/spike-v2/producer.cu b/tools/spike-v2/producer.cu new file mode 100644 index 0000000..6b2b79f --- /dev/null +++ b/tools/spike-v2/producer.cu @@ -0,0 +1,176 @@ +// spike-v2 producer. +// +// В отличие от spike v1: pattern зависит от (seq, row) — каждая строка пишется +// своим значением. Если у consumer'а проявятся torn frames, они будут видны +// как «строки с разным seq» в одном кадре. +// +// Опция --sync=stream → cudaStreamSynchronize (Scenario A, текущий дизайн) +// Опция --sync=event → cudaIpcEvent (Scenario B, предлагаемый fix) +// +// Usage: +// ./producer --key A --width 1920 --height 1080 --fps 60 --sync stream +// ./producer --key B --width 1920 --height 1080 --fps 60 --sync event + +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace cuframes_spike_v2; + +// Kernel: каждая строка получает значение pattern_value(seq, row) +__global__ void fill_pattern(uint8_t* y, int width, int height, int pitch_y, + uint64_t seq) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int row = blockIdx.y * blockDim.y + threadIdx.y; + if (x < width && row < height) { + uint8_t v = pattern_value(seq, row); + y[row * pitch_y + x] = v; + } +} + +struct Args { + std::string key = "A"; + int width = 1920; + int height = 1080; + int fps = 60; + int duration_sec = 0; + std::string sync = "stream"; // "stream" | "event" +}; + +static Args parse_args(int argc, char** argv) { + Args a; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + auto next = [&] { return std::string(argv[++i]); }; + if (arg == "--key") a.key = next(); + else if (arg == "--width") a.width = std::stoi(next()); + else if (arg == "--height") a.height = std::stoi(next()); + else if (arg == "--fps") a.fps = std::stoi(next()); + else if (arg == "--duration") a.duration_sec = std::stoi(next()); + else if (arg == "--sync") a.sync = next(); + } + return a; +} + +int main(int argc, char** argv) { + Args args = parse_args(argc, argv); + bool use_events = (args.sync == "event"); + + std::cout << "[producer] key=" << args.key + << " " << args.width << "x" << args.height + << " @ " << args.fps << " fps" + << " sync=" << args.sync << "\n"; + + CHECK_CUDA(cudaSetDevice(0)); + + const int pitch_y = args.width; + const size_t y_bytes = static_cast(pitch_y) * args.height; + + void* slot_ptrs[RING_SIZE]; + cudaIpcMemHandle_t slot_handles[RING_SIZE]; + for (int i = 0; i < RING_SIZE; ++i) { + CHECK_CUDA(cudaMalloc(&slot_ptrs[i], y_bytes)); + CHECK_CUDA(cudaIpcGetMemHandle(&slot_handles[i], slot_ptrs[i])); + } + + // Create event for cross-process sync (Scenario B) + cudaEvent_t event = nullptr; + cudaIpcEventHandle_t event_handle = {}; + if (use_events) { + CHECK_CUDA(cudaEventCreateWithFlags(&event, + cudaEventDisableTiming | cudaEventInterprocess)); + CHECK_CUDA(cudaIpcGetEventHandle(&event_handle, event)); + std::cout << "[producer] cuda event for IPC sync created\n"; + } + + // POSIX shm + std::string shm_path = "/dev/shm/cuframes-v2-" + args.key; + int fd = open(shm_path.c_str(), O_CREAT | O_RDWR, 0666); + if (fd < 0 || ftruncate(fd, sizeof(SharedHeader)) < 0) { + perror("[producer] shm"); + return 1; + } + auto* shared = static_cast( + mmap(nullptr, sizeof(SharedHeader), PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0)); + if (shared == MAP_FAILED) { + perror("[producer] mmap"); + return 1; + } + std::memset(shared, 0, sizeof(SharedHeader)); + shared->magic = MAGIC; + shared->version = VERSION; + shared->use_events = use_events ? 1 : 0; + if (use_events) shared->event_handle = event_handle; + shared->meta = {args.width, args.height, pitch_y}; + for (int i = 0; i < RING_SIZE; ++i) { + shared->slots[i].mem_handle = slot_handles[i]; + } + __atomic_thread_fence(__ATOMIC_RELEASE); + std::cout << "[producer] shm ready at " << shm_path << "\n"; + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + const auto frame_interval = std::chrono::nanoseconds(1'000'000'000LL / args.fps); + auto next_frame = std::chrono::steady_clock::now(); + + dim3 block(32, 8); + dim3 grid((args.width + block.x - 1) / block.x, + (args.height + block.y - 1) / block.y); + + uint64_t seq = 0; + const int64_t end_ns = args.duration_sec > 0 + ? now_ns() + args.duration_sec * 1'000'000'000LL : 0; + + while (true) { + if (end_ns && now_ns() > end_ns) break; + + const int slot_idx = seq % RING_SIZE; + + // Заполнить slot pattern'ом + fill_pattern<<>>( + static_cast(slot_ptrs[slot_idx]), + args.width, args.height, pitch_y, seq); + + // Sync: либо stream sync (Scenario A), либо event record (Scenario B) + if (use_events) { + CHECK_CUDA(cudaEventRecord(event, stream)); + // НЕ делаем cudaStreamSynchronize — consumer сам wait'нет event + } else { + CHECK_CUDA(cudaStreamSynchronize(stream)); + } + + // Publish (атомарный seq bump после sync/record) + __atomic_store_n(&shared->slots[slot_idx].producer_seq, seq, __ATOMIC_RELEASE); + __atomic_store_n(&shared->slots[slot_idx].pts_ns, now_ns(), __ATOMIC_RELEASE); + __atomic_store_n(&shared->global_seq, seq, __ATOMIC_RELEASE); + + if (seq % 300 == 0 && seq > 0) { + uint64_t torn = __atomic_load_n(&shared->torn_frame_count, __ATOMIC_ACQUIRE); + std::cout << "[producer] seq=" << seq << " torn_frames_so_far=" << torn << "\n"; + } + + seq++; + next_frame += frame_interval; + std::this_thread::sleep_until(next_frame); + } + + uint64_t torn_final = __atomic_load_n(&shared->torn_frame_count, __ATOMIC_ACQUIRE); + std::cout << "[producer] FINAL: published=" << seq + << " torn_frames=" << torn_final << "\n"; + + for (int i = 0; i < RING_SIZE; ++i) CHECK_CUDA(cudaFree(slot_ptrs[i])); + if (event) cudaEventDestroy(event); + munmap(shared, sizeof(SharedHeader)); + close(fd); + unlink(shm_path.c_str()); + return 0; +}