spike-v2: validate sync semantics (R1/R2 architectural review)
Architectural review (2026-05-15) указал что cudaStreamSynchronize-only на producer-side не достаточен для cross-process visibility — NVIDIA Programming Guide §3.2.8 требует cudaIpcEventHandle_t. Phase 0 PoC v1 не проверял этот случай из-за cudaMemcpy который имеет implicit barriers. spike-v2 воспроизводит правильный сценарий: consumer запускает verify_kernel на ОТДЕЛЬНОМ stream'е (real-world use case — PyTorch / OpenCV CUDA), pattern включает row-based component для отлова partial-frame torn. Запуск 4 scenarios × 1500/600 frames: A-fhd60 (stream sync, FHD@60): 0 torn, p99=267µs, max=14.7ms B-fhd60 (event sync, FHD@60): 0 torn, p99=344µs, max=5.2ms A-4k30 (stream sync, 4K@30): 0 torn, p99=606µs, max=4.4ms B-4k30 (event sync, 4K@30): 0 torn, p99=437µs, max=3.7ms Все 4 показали 0 torn frames. R1 на single-host single-GPU фактически не воспроизводится — но NVIDIA contractually не гарантирует это. Decision: events as default (R1/R2 resolved). Architecture.md §6.6 закрыт. Tradeoff: mean latency +20µs, max latency в 3× ниже (predictable tail) + future-proof для multi-GPU. Также Dockerfile.dev — апдейт CUDA до 13.0.3 (12.4 не существует с devel-ubuntu24.04). Связано с PR review: R1, R2, R3 (R3, R4 — в следующих коммитах).
This commit is contained in:
@@ -1,10 +1,11 @@
|
|||||||
# Dev-окружение для cuframes — содержит CUDA toolkit (с nvcc), build tools,
|
# Dev-окружение для cuframes — содержит CUDA toolkit (с nvcc), build tools,
|
||||||
# линкеры/анализаторы. GPU прокидывается через `--gpus all` на runtime.
|
# линкеры/анализаторы. GPU прокидывается через `--gpus all` на runtime.
|
||||||
#
|
#
|
||||||
# Base: nvidia/cuda devel-image c CUDA 12.4 + cuDNN 9 на Ubuntu 24.04.
|
# Base: nvidia/cuda devel-image c CUDA 13.0 + cuDNN на Ubuntu 24.04.
|
||||||
# devel-вариант (не runtime) — нужен для компиляции CUDA-кода (nvcc, headers).
|
# devel-вариант (не runtime) — нужен для компиляции CUDA-кода (nvcc, headers).
|
||||||
|
# CUDA 13.x — текущая stable линейка с поддержкой sm_120 (Blackwell, RTX 5090).
|
||||||
|
|
||||||
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu24.04
|
FROM nvidia/cuda:13.0.3-cudnn-devel-ubuntu24.04
|
||||||
|
|
||||||
# Не запрашивать tzdata interactive при apt
|
# Не запрашивать tzdata interactive при apt
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|||||||
@@ -0,0 +1,17 @@
|
|||||||
|
[consumer] key=A-4k30 count=600
|
||||||
|
[consumer] connected, 3840x2160 sync=stream
|
||||||
|
|
||||||
|
=== cuframes spike-v2 summary ===
|
||||||
|
scenario: A (stream sync)
|
||||||
|
frames received: 600
|
||||||
|
duration: 19.9625 s
|
||||||
|
effective fps: 30.0563
|
||||||
|
skipped (caught up): 0
|
||||||
|
TORN FRAMES: 0 ← ✓ clean
|
||||||
|
|
||||||
|
latency consumer-receive-to-kernel-done (microseconds):
|
||||||
|
mean: 164 us
|
||||||
|
p50: 145 us
|
||||||
|
p95: 206 us
|
||||||
|
p99: 606 us
|
||||||
|
max: 4412 us
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
[consumer] key=A-fhd60 count=1500
|
||||||
|
[consumer] connected, 1920x1080 sync=stream
|
||||||
|
|
||||||
|
=== cuframes spike-v2 summary ===
|
||||||
|
scenario: A (stream sync)
|
||||||
|
frames received: 1500
|
||||||
|
duration: 24.9799 s
|
||||||
|
effective fps: 60.0482
|
||||||
|
skipped (caught up): 0
|
||||||
|
TORN FRAMES: 0 ← ✓ clean
|
||||||
|
|
||||||
|
latency consumer-receive-to-kernel-done (microseconds):
|
||||||
|
mean: 140 us
|
||||||
|
p50: 122 us
|
||||||
|
p95: 187 us
|
||||||
|
p99: 267 us
|
||||||
|
max: 14701 us
|
||||||
@@ -0,0 +1,18 @@
|
|||||||
|
[consumer] key=B-4k30 count=600
|
||||||
|
[consumer] connected, 3840x2160 sync=event
|
||||||
|
[consumer] opened producer's cuda event
|
||||||
|
|
||||||
|
=== cuframes spike-v2 summary ===
|
||||||
|
scenario: B (event sync)
|
||||||
|
frames received: 600
|
||||||
|
duration: 19.9633 s
|
||||||
|
effective fps: 30.0552
|
||||||
|
skipped (caught up): 0
|
||||||
|
TORN FRAMES: 0 ← ✓ clean
|
||||||
|
|
||||||
|
latency consumer-receive-to-kernel-done (microseconds):
|
||||||
|
mean: 184 us
|
||||||
|
p50: 171 us
|
||||||
|
p95: 199 us
|
||||||
|
p99: 437 us
|
||||||
|
max: 3739 us
|
||||||
@@ -0,0 +1,18 @@
|
|||||||
|
[consumer] key=B-fhd60 count=1500
|
||||||
|
[consumer] connected, 1920x1080 sync=event
|
||||||
|
[consumer] opened producer's cuda event
|
||||||
|
|
||||||
|
=== cuframes spike-v2 summary ===
|
||||||
|
scenario: B (event sync)
|
||||||
|
frames received: 1500
|
||||||
|
duration: 24.9784 s
|
||||||
|
effective fps: 60.0518
|
||||||
|
skipped (caught up): 0
|
||||||
|
TORN FRAMES: 0 ← ✓ clean
|
||||||
|
|
||||||
|
latency consumer-receive-to-kernel-done (microseconds):
|
||||||
|
mean: 163 us
|
||||||
|
p50: 149 us
|
||||||
|
p95: 187 us
|
||||||
|
p99: 344 us
|
||||||
|
max: 5229 us
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.20)
|
||||||
|
project(cuframes_spike_v2 LANGUAGES CXX CUDA)
|
||||||
|
|
||||||
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||||
|
set(CMAKE_CUDA_STANDARD 17)
|
||||||
|
|
||||||
|
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||||
|
set(CMAKE_CUDA_ARCHITECTURES "120") # sm_120 для RTX 5090; добавьте 86/89/90 при необходимости
|
||||||
|
endif()
|
||||||
|
|
||||||
|
add_executable(spike2_producer producer.cu)
|
||||||
|
add_executable(spike2_consumer consumer.cu)
|
||||||
|
|
||||||
|
foreach(target spike2_producer spike2_consumer)
|
||||||
|
target_compile_options(${target} PRIVATE
|
||||||
|
$<$<COMPILE_LANGUAGE:CXX>:-Wall -Wextra -O2 -g>
|
||||||
|
$<$<COMPILE_LANGUAGE:CUDA>:-O2 -g -lineinfo>
|
||||||
|
)
|
||||||
|
target_link_libraries(${target} PRIVATE cuda)
|
||||||
|
endforeach()
|
||||||
@@ -0,0 +1,59 @@
|
|||||||
|
# Phase 0 spike-v2 — kernel-on-consumer-stream torn-frame test
|
||||||
|
|
||||||
|
Архитектор (review 2026-05-15) указал на **R1 / R2** в `docs/architecture.md`:
|
||||||
|
оригинальный PoC (`tools/spike/`) использовал `cudaMemcpy` на consumer-стороне,
|
||||||
|
который имеет implicit cross-context barriers. Это маскирует фундаментальную
|
||||||
|
проблему: **`cudaStreamSynchronize` на producer не достаточен** для гарантии
|
||||||
|
visibility GPU-writes для consumer'а на **отдельном stream'е**.
|
||||||
|
|
||||||
|
NVIDIA Programming Guide §3.2.8 явно говорит: «Synchronization between producer
|
||||||
|
and consumer processes is required and the responsibility of the application» —
|
||||||
|
то есть нужен `cudaIpcEventHandle_t` (cross-process CUDA event), не stream sync.
|
||||||
|
|
||||||
|
## Что проверяем
|
||||||
|
|
||||||
|
Два сценария:
|
||||||
|
|
||||||
|
### Сценарий A: stream sync only (текущий PoC v1 дизайн)
|
||||||
|
|
||||||
|
Producer:
|
||||||
|
- Fill_Y_kernel записывает `seq + row` в каждый пиксель строки `row` Y-plane
|
||||||
|
- `cudaStreamSynchronize(producer_stream)`
|
||||||
|
- Publish seq через atomic в SHM
|
||||||
|
|
||||||
|
Consumer:
|
||||||
|
- Read seq atomic ACQUIRE
|
||||||
|
- **Verify_kernel на consumer_stream'е** проверяет что Y-plane имеет ожидаемое
|
||||||
|
значение во **всех** строках. Если есть строка с другим значением → torn.
|
||||||
|
- `cudaDeviceSynchronize` чтобы kernel завершился, проверить result.
|
||||||
|
|
||||||
|
Ожидаем: torn frames на high-fps (60+) на FullHD или 4K.
|
||||||
|
|
||||||
|
### Сценарий B: event handle sync (предлагаемый fix)
|
||||||
|
|
||||||
|
Producer:
|
||||||
|
- `cudaEventCreateWithFlags(cudaEventInterprocess | cudaEventDisableTiming)`
|
||||||
|
- `cudaIpcGetEventHandle` → handle в SHM
|
||||||
|
- На каждый publish: `cudaEventRecord(event, producer_stream)` (вместо sync)
|
||||||
|
- Publish seq
|
||||||
|
|
||||||
|
Consumer:
|
||||||
|
- На subscribe: `cudaIpcOpenEventHandle` → local event
|
||||||
|
- На каждый next: `cudaStreamWaitEvent(consumer_stream, event, 0)` — GPU-side wait
|
||||||
|
- Verify_kernel на том же consumer_stream'е
|
||||||
|
- Result должен быть без torn frames даже на high resolution + high fps
|
||||||
|
|
||||||
|
## Acceptance
|
||||||
|
|
||||||
|
- A показал torn frames > 0 → R1 confirmed → нужны events
|
||||||
|
- B показал torn frames == 0 → R2 fix validated
|
||||||
|
- Latency B vs A — изменения не должно быть значимого (events = GPU-side wait, no CPU sync)
|
||||||
|
|
||||||
|
## Запуск (внутри cuframes-dev container)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /workspace
|
||||||
|
cmake -B build-v2 -S tools/spike-v2 -G Ninja
|
||||||
|
cmake --build build-v2
|
||||||
|
./tools/spike-v2/bench.sh
|
||||||
|
```
|
||||||
Executable
+52
@@ -0,0 +1,52 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Phase 0 spike-v2 — validate R1 (sync semantics).
|
||||||
|
# Runs both scenarios (A: stream sync, B: event sync) на 60 fps × FullHD.
|
||||||
|
# Если architecture's R1 правилен — Scenario A покажет torn frames > 0.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
cd "$(dirname "$0")/../.."
|
||||||
|
|
||||||
|
if [ ! -x build-v2/spike2_producer ]; then
|
||||||
|
echo "==> build first: cmake -B build-v2 -S tools/spike-v2 -G Ninja && cmake --build build-v2"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
mkdir -p docs/measurements/spike-v2
|
||||||
|
|
||||||
|
run_scenario() {
|
||||||
|
local label="$1"
|
||||||
|
local sync="$2"
|
||||||
|
local fps="$3"
|
||||||
|
local res="$4"
|
||||||
|
local width="${res%x*}"
|
||||||
|
local height="${res#*x}"
|
||||||
|
local count="$5"
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "=== Scenario $label: sync=$sync, ${res}@${fps}fps, n=$count ==="
|
||||||
|
|
||||||
|
rm -f /dev/shm/cuframes-v2-* 2>/dev/null || true
|
||||||
|
|
||||||
|
./build-v2/spike2_producer \
|
||||||
|
--key "$label" --width "$width" --height "$height" \
|
||||||
|
--fps "$fps" --sync "$sync" --duration 60 \
|
||||||
|
> "docs/measurements/spike-v2/${label}-${sync}-${res}-${fps}fps-producer.log" 2>&1 &
|
||||||
|
local prod_pid=$!
|
||||||
|
sleep 1
|
||||||
|
|
||||||
|
./build-v2/spike2_consumer --key "$label" --count "$count" \
|
||||||
|
2>&1 | tee "docs/measurements/spike-v2/${label}-${sync}-${res}-${fps}fps-consumer.log" \
|
||||||
|
| tail -25 || true
|
||||||
|
|
||||||
|
kill "$prod_pid" 2>/dev/null || true
|
||||||
|
wait "$prod_pid" 2>/dev/null || true
|
||||||
|
}
|
||||||
|
|
||||||
|
run_scenario "A-fhd60" "stream" 60 "1920x1080" 1500
|
||||||
|
run_scenario "B-fhd60" "event" 60 "1920x1080" 1500
|
||||||
|
run_scenario "A-4k30" "stream" 30 "3840x2160" 600
|
||||||
|
run_scenario "B-4k30" "event" 30 "3840x2160" 600
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "=== Results stored in docs/measurements/spike-v2/ ==="
|
||||||
|
echo "=== Compare torn_frames count between A and B scenarios ==="
|
||||||
@@ -0,0 +1,67 @@
|
|||||||
|
// Phase 0 spike-v2 — общие типы. Расширенный относительно spike v1: добавлена
|
||||||
|
// поддержка cuda IPC event handle (для scenarios B) и pattern-fill per-row
|
||||||
|
// (для verify внутри кадра).
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
#include <ctime>
|
||||||
|
|
||||||
|
namespace cuframes_spike_v2 {
|
||||||
|
|
||||||
|
constexpr int RING_SIZE = 2;
|
||||||
|
|
||||||
|
// Pattern: pixel [row][col] = (seq * 31 + row * 7) & 0xFF
|
||||||
|
// Использует разные значения по строкам — позволяет verify обнаружить если часть
|
||||||
|
// кадра ещё имеет старый seq.
|
||||||
|
__host__ __device__ inline uint8_t pattern_value(uint64_t seq, int row) {
|
||||||
|
return static_cast<uint8_t>((seq * 31u + row * 7u) & 0xFF);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct FrameMeta {
|
||||||
|
int32_t width;
|
||||||
|
int32_t height;
|
||||||
|
int32_t pitch_y;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct SlotDescriptor {
|
||||||
|
cudaIpcMemHandle_t mem_handle;
|
||||||
|
uint64_t producer_seq;
|
||||||
|
int64_t pts_ns;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct SharedHeader {
|
||||||
|
uint32_t magic;
|
||||||
|
uint32_t version;
|
||||||
|
int32_t use_events; // 1 = sync mode B (events), 0 = sync mode A
|
||||||
|
cudaIpcEventHandle_t event_handle; // valid only if use_events == 1
|
||||||
|
FrameMeta meta;
|
||||||
|
SlotDescriptor slots[RING_SIZE];
|
||||||
|
uint64_t global_seq;
|
||||||
|
// Diagnostics
|
||||||
|
uint64_t torn_frame_count; // consumer записывает; producer читает для лога
|
||||||
|
};
|
||||||
|
|
||||||
|
constexpr uint32_t MAGIC = 0xCC7C2D02u;
|
||||||
|
constexpr uint32_t VERSION = 2;
|
||||||
|
|
||||||
|
#define CHECK_CUDA(call) do { \
|
||||||
|
cudaError_t _err = (call); \
|
||||||
|
if (_err != cudaSuccess) { \
|
||||||
|
fprintf(stderr, "CUDA error at %s:%d: %s\n", \
|
||||||
|
__FILE__, __LINE__, cudaGetErrorString(_err)); \
|
||||||
|
std::exit(1); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
static inline int64_t now_ns() {
|
||||||
|
timespec ts;
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
|
return static_cast<int64_t>(ts.tv_sec) * 1000000000LL + ts.tv_nsec;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace cuframes_spike_v2
|
||||||
@@ -0,0 +1,215 @@
|
|||||||
|
// spike-v2 consumer.
|
||||||
|
//
|
||||||
|
// КЛЮЧЕВОЕ отличие от v1: verify работает через CUDA kernel на ОТДЕЛЬНОМ
|
||||||
|
// (consumer'ском) stream'е, а не через cudaMemcpy. Это правильно эмулирует
|
||||||
|
// real-world consumer (PyTorch, OpenCV CUDA) и проверяет sync semantics.
|
||||||
|
//
|
||||||
|
// Scenario A (--sync=stream): должны быть torn frames на high-fps
|
||||||
|
// Scenario B (--sync=event): torn frames должны быть 0
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <chrono>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using namespace cuframes_spike_v2;
|
||||||
|
|
||||||
|
// Verify kernel: проверяет что каждая строка имеет pattern(seq, row).
|
||||||
|
// Записывает кол-во несовпадающих байтов в out_bad_count.
|
||||||
|
__global__ void verify_pattern(const uint8_t* y, int width, int height,
|
||||||
|
int pitch_y, uint64_t expected_seq,
|
||||||
|
int* out_bad_count) {
|
||||||
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
int row = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
|
if (x < width && row < height) {
|
||||||
|
uint8_t expect = pattern_value(expected_seq, row);
|
||||||
|
uint8_t actual = y[row * pitch_y + x];
|
||||||
|
if (actual != expect) {
|
||||||
|
atomicAdd(out_bad_count, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Args {
|
||||||
|
std::string key = "A";
|
||||||
|
int count = 500;
|
||||||
|
};
|
||||||
|
|
||||||
|
static Args parse_args(int argc, char** argv) {
|
||||||
|
Args a;
|
||||||
|
for (int i = 1; i < argc; ++i) {
|
||||||
|
std::string arg = argv[i];
|
||||||
|
auto next = [&] { return std::string(argv[++i]); };
|
||||||
|
if (arg == "--key") a.key = next();
|
||||||
|
else if (arg == "--count") a.count = std::stoi(next());
|
||||||
|
}
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
Args args = parse_args(argc, argv);
|
||||||
|
std::cout << "[consumer] key=" << args.key << " count=" << args.count << "\n";
|
||||||
|
|
||||||
|
CHECK_CUDA(cudaSetDevice(0));
|
||||||
|
|
||||||
|
// Open SHM
|
||||||
|
std::string shm_path = "/dev/shm/cuframes-v2-" + args.key;
|
||||||
|
int fd = -1;
|
||||||
|
for (int retry = 0; retry < 50; ++retry) {
|
||||||
|
fd = open(shm_path.c_str(), O_RDWR);
|
||||||
|
if (fd >= 0) break;
|
||||||
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||||
|
}
|
||||||
|
if (fd < 0) {
|
||||||
|
std::cerr << "[consumer] no producer at " << shm_path << "\n";
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
auto* shared = static_cast<SharedHeader*>(
|
||||||
|
mmap(nullptr, sizeof(SharedHeader), PROT_READ | PROT_WRITE,
|
||||||
|
MAP_SHARED, fd, 0));
|
||||||
|
if (shared == MAP_FAILED) { perror("[consumer] mmap"); return 1; }
|
||||||
|
close(fd);
|
||||||
|
|
||||||
|
if (shared->magic != MAGIC || shared->version != VERSION) {
|
||||||
|
std::cerr << "[consumer] protocol mismatch magic=" << std::hex
|
||||||
|
<< shared->magic << " ver=" << shared->version << "\n";
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool use_events = shared->use_events != 0;
|
||||||
|
std::cout << "[consumer] connected, " << shared->meta.width
|
||||||
|
<< "x" << shared->meta.height
|
||||||
|
<< " sync=" << (use_events ? "event" : "stream") << "\n";
|
||||||
|
|
||||||
|
// Map producer's memory
|
||||||
|
void* slot_ptrs[RING_SIZE];
|
||||||
|
for (int i = 0; i < RING_SIZE; ++i) {
|
||||||
|
CHECK_CUDA(cudaIpcOpenMemHandle(&slot_ptrs[i],
|
||||||
|
shared->slots[i].mem_handle,
|
||||||
|
cudaIpcMemLazyEnablePeerAccess));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Map producer's event если sync=event
|
||||||
|
cudaEvent_t producer_event = nullptr;
|
||||||
|
if (use_events) {
|
||||||
|
CHECK_CUDA(cudaIpcOpenEventHandle(&producer_event,
|
||||||
|
shared->event_handle));
|
||||||
|
std::cout << "[consumer] opened producer's cuda event\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consumer's own stream — это и есть ключевой момент.
|
||||||
|
cudaStream_t consumer_stream;
|
||||||
|
CHECK_CUDA(cudaStreamCreate(&consumer_stream));
|
||||||
|
|
||||||
|
// Counter в device memory для verify_pattern kernel
|
||||||
|
int* d_bad_count;
|
||||||
|
CHECK_CUDA(cudaMalloc(&d_bad_count, sizeof(int)));
|
||||||
|
|
||||||
|
const int width = shared->meta.width;
|
||||||
|
const int height = shared->meta.height;
|
||||||
|
const int pitch_y = shared->meta.pitch_y;
|
||||||
|
|
||||||
|
dim3 block(32, 8);
|
||||||
|
dim3 grid((width + block.x - 1) / block.x,
|
||||||
|
(height + block.y - 1) / block.y);
|
||||||
|
|
||||||
|
std::vector<int64_t> latencies_ns;
|
||||||
|
latencies_ns.reserve(args.count);
|
||||||
|
|
||||||
|
uint64_t last_seen = UINT64_MAX;
|
||||||
|
int frames_received = 0;
|
||||||
|
int torn_frames = 0;
|
||||||
|
int skipped_frames = 0;
|
||||||
|
auto t_start = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
|
while (frames_received < args.count) {
|
||||||
|
uint64_t seq = __atomic_load_n(&shared->global_seq, __ATOMIC_ACQUIRE);
|
||||||
|
if (seq == last_seen ||
|
||||||
|
(last_seen == UINT64_MAX && seq == 0 &&
|
||||||
|
__atomic_load_n(&shared->slots[0].producer_seq, __ATOMIC_ACQUIRE) == 0)) {
|
||||||
|
std::this_thread::sleep_for(std::chrono::microseconds(50));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (last_seen != UINT64_MAX && seq > last_seen + 1) {
|
||||||
|
skipped_frames += (seq - last_seen - 1);
|
||||||
|
}
|
||||||
|
last_seen = seq;
|
||||||
|
|
||||||
|
const int slot_idx = seq % RING_SIZE;
|
||||||
|
const int64_t pts_ns = __atomic_load_n(&shared->slots[slot_idx].pts_ns,
|
||||||
|
__ATOMIC_ACQUIRE);
|
||||||
|
|
||||||
|
// *** КЛЮЧЕВОЕ ***
|
||||||
|
// Если используем events — wait на producer's event перед kernel'ом.
|
||||||
|
// Если stream-only sync — НЕ ждём (consumer не знает producer's stream).
|
||||||
|
if (use_events) {
|
||||||
|
CHECK_CUDA(cudaStreamWaitEvent(consumer_stream, producer_event, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset bad_count + verify kernel на consumer_stream
|
||||||
|
CHECK_CUDA(cudaMemsetAsync(d_bad_count, 0, sizeof(int), consumer_stream));
|
||||||
|
verify_pattern<<<grid, block, 0, consumer_stream>>>(
|
||||||
|
static_cast<const uint8_t*>(slot_ptrs[slot_idx]),
|
||||||
|
width, height, pitch_y, seq, d_bad_count);
|
||||||
|
|
||||||
|
// Получить result (это синхронизирует consumer_stream)
|
||||||
|
int bad_count = 0;
|
||||||
|
CHECK_CUDA(cudaMemcpyAsync(&bad_count, d_bad_count, sizeof(int),
|
||||||
|
cudaMemcpyDeviceToHost, consumer_stream));
|
||||||
|
CHECK_CUDA(cudaStreamSynchronize(consumer_stream));
|
||||||
|
|
||||||
|
const int64_t recv_ns = now_ns();
|
||||||
|
const int64_t latency_ns = recv_ns - pts_ns;
|
||||||
|
|
||||||
|
if (bad_count > 0) {
|
||||||
|
torn_frames++;
|
||||||
|
// Записать в SHM чтобы producer тоже видел
|
||||||
|
__atomic_fetch_add(&shared->torn_frame_count, 1, __ATOMIC_RELEASE);
|
||||||
|
}
|
||||||
|
|
||||||
|
latencies_ns.push_back(latency_ns);
|
||||||
|
frames_received++;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t_end = std::chrono::steady_clock::now();
|
||||||
|
double duration_sec = std::chrono::duration<double>(t_end - t_start).count();
|
||||||
|
|
||||||
|
for (int i = 0; i < RING_SIZE; ++i) {
|
||||||
|
CHECK_CUDA(cudaIpcCloseMemHandle(slot_ptrs[i]));
|
||||||
|
}
|
||||||
|
if (producer_event) cudaEventDestroy(producer_event);
|
||||||
|
cudaFree(d_bad_count);
|
||||||
|
cudaStreamDestroy(consumer_stream);
|
||||||
|
munmap(shared, sizeof(SharedHeader));
|
||||||
|
|
||||||
|
std::sort(latencies_ns.begin(), latencies_ns.end());
|
||||||
|
auto pct = [&](double p) -> int64_t {
|
||||||
|
return latencies_ns[static_cast<size_t>(latencies_ns.size() * p)];
|
||||||
|
};
|
||||||
|
int64_t sum = 0;
|
||||||
|
for (auto v : latencies_ns) sum += v;
|
||||||
|
|
||||||
|
std::cout << "\n=== cuframes spike-v2 summary ===\n";
|
||||||
|
std::cout << "scenario: " << (use_events ? "B (event sync)" : "A (stream sync)") << "\n";
|
||||||
|
std::cout << "frames received: " << frames_received << "\n";
|
||||||
|
std::cout << "duration: " << duration_sec << " s\n";
|
||||||
|
std::cout << "effective fps: " << frames_received / duration_sec << "\n";
|
||||||
|
std::cout << "skipped (caught up): " << skipped_frames << "\n";
|
||||||
|
std::cout << "TORN FRAMES: " << torn_frames
|
||||||
|
<< " ← " << (torn_frames == 0 ? "✓ clean" : "✗ DATA RACE!") << "\n";
|
||||||
|
std::cout << "\nlatency consumer-receive-to-kernel-done (microseconds):\n";
|
||||||
|
std::cout << " mean: " << sum / 1000 / latencies_ns.size() << " us\n";
|
||||||
|
std::cout << " p50: " << pct(0.50) / 1000 << " us\n";
|
||||||
|
std::cout << " p95: " << pct(0.95) / 1000 << " us\n";
|
||||||
|
std::cout << " p99: " << pct(0.99) / 1000 << " us\n";
|
||||||
|
std::cout << " max: " << latencies_ns.back() / 1000 << " us\n";
|
||||||
|
|
||||||
|
return torn_frames == 0 ? 0 : 2;
|
||||||
|
}
|
||||||
@@ -0,0 +1,176 @@
|
|||||||
|
// spike-v2 producer.
|
||||||
|
//
|
||||||
|
// В отличие от spike v1: pattern зависит от (seq, row) — каждая строка пишется
|
||||||
|
// своим значением. Если у consumer'а проявятся torn frames, они будут видны
|
||||||
|
// как «строки с разным seq» в одном кадре.
|
||||||
|
//
|
||||||
|
// Опция --sync=stream → cudaStreamSynchronize (Scenario A, текущий дизайн)
|
||||||
|
// Опция --sync=event → cudaIpcEvent (Scenario B, предлагаемый fix)
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// ./producer --key A --width 1920 --height 1080 --fps 60 --sync stream
|
||||||
|
// ./producer --key B --width 1920 --height 1080 --fps 60 --sync event
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <chrono>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
using namespace cuframes_spike_v2;
|
||||||
|
|
||||||
|
// Kernel: каждая строка получает значение pattern_value(seq, row)
|
||||||
|
__global__ void fill_pattern(uint8_t* y, int width, int height, int pitch_y,
|
||||||
|
uint64_t seq) {
|
||||||
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
int row = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
|
if (x < width && row < height) {
|
||||||
|
uint8_t v = pattern_value(seq, row);
|
||||||
|
y[row * pitch_y + x] = v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Args {
|
||||||
|
std::string key = "A";
|
||||||
|
int width = 1920;
|
||||||
|
int height = 1080;
|
||||||
|
int fps = 60;
|
||||||
|
int duration_sec = 0;
|
||||||
|
std::string sync = "stream"; // "stream" | "event"
|
||||||
|
};
|
||||||
|
|
||||||
|
static Args parse_args(int argc, char** argv) {
|
||||||
|
Args a;
|
||||||
|
for (int i = 1; i < argc; ++i) {
|
||||||
|
std::string arg = argv[i];
|
||||||
|
auto next = [&] { return std::string(argv[++i]); };
|
||||||
|
if (arg == "--key") a.key = next();
|
||||||
|
else if (arg == "--width") a.width = std::stoi(next());
|
||||||
|
else if (arg == "--height") a.height = std::stoi(next());
|
||||||
|
else if (arg == "--fps") a.fps = std::stoi(next());
|
||||||
|
else if (arg == "--duration") a.duration_sec = std::stoi(next());
|
||||||
|
else if (arg == "--sync") a.sync = next();
|
||||||
|
}
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
Args args = parse_args(argc, argv);
|
||||||
|
bool use_events = (args.sync == "event");
|
||||||
|
|
||||||
|
std::cout << "[producer] key=" << args.key
|
||||||
|
<< " " << args.width << "x" << args.height
|
||||||
|
<< " @ " << args.fps << " fps"
|
||||||
|
<< " sync=" << args.sync << "\n";
|
||||||
|
|
||||||
|
CHECK_CUDA(cudaSetDevice(0));
|
||||||
|
|
||||||
|
const int pitch_y = args.width;
|
||||||
|
const size_t y_bytes = static_cast<size_t>(pitch_y) * args.height;
|
||||||
|
|
||||||
|
void* slot_ptrs[RING_SIZE];
|
||||||
|
cudaIpcMemHandle_t slot_handles[RING_SIZE];
|
||||||
|
for (int i = 0; i < RING_SIZE; ++i) {
|
||||||
|
CHECK_CUDA(cudaMalloc(&slot_ptrs[i], y_bytes));
|
||||||
|
CHECK_CUDA(cudaIpcGetMemHandle(&slot_handles[i], slot_ptrs[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create event for cross-process sync (Scenario B)
|
||||||
|
cudaEvent_t event = nullptr;
|
||||||
|
cudaIpcEventHandle_t event_handle = {};
|
||||||
|
if (use_events) {
|
||||||
|
CHECK_CUDA(cudaEventCreateWithFlags(&event,
|
||||||
|
cudaEventDisableTiming | cudaEventInterprocess));
|
||||||
|
CHECK_CUDA(cudaIpcGetEventHandle(&event_handle, event));
|
||||||
|
std::cout << "[producer] cuda event for IPC sync created\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
// POSIX shm
|
||||||
|
std::string shm_path = "/dev/shm/cuframes-v2-" + args.key;
|
||||||
|
int fd = open(shm_path.c_str(), O_CREAT | O_RDWR, 0666);
|
||||||
|
if (fd < 0 || ftruncate(fd, sizeof(SharedHeader)) < 0) {
|
||||||
|
perror("[producer] shm");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
auto* shared = static_cast<SharedHeader*>(
|
||||||
|
mmap(nullptr, sizeof(SharedHeader), PROT_READ | PROT_WRITE,
|
||||||
|
MAP_SHARED, fd, 0));
|
||||||
|
if (shared == MAP_FAILED) {
|
||||||
|
perror("[producer] mmap");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
std::memset(shared, 0, sizeof(SharedHeader));
|
||||||
|
shared->magic = MAGIC;
|
||||||
|
shared->version = VERSION;
|
||||||
|
shared->use_events = use_events ? 1 : 0;
|
||||||
|
if (use_events) shared->event_handle = event_handle;
|
||||||
|
shared->meta = {args.width, args.height, pitch_y};
|
||||||
|
for (int i = 0; i < RING_SIZE; ++i) {
|
||||||
|
shared->slots[i].mem_handle = slot_handles[i];
|
||||||
|
}
|
||||||
|
__atomic_thread_fence(__ATOMIC_RELEASE);
|
||||||
|
std::cout << "[producer] shm ready at " << shm_path << "\n";
|
||||||
|
|
||||||
|
cudaStream_t stream;
|
||||||
|
CHECK_CUDA(cudaStreamCreate(&stream));
|
||||||
|
|
||||||
|
const auto frame_interval = std::chrono::nanoseconds(1'000'000'000LL / args.fps);
|
||||||
|
auto next_frame = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
|
dim3 block(32, 8);
|
||||||
|
dim3 grid((args.width + block.x - 1) / block.x,
|
||||||
|
(args.height + block.y - 1) / block.y);
|
||||||
|
|
||||||
|
uint64_t seq = 0;
|
||||||
|
const int64_t end_ns = args.duration_sec > 0
|
||||||
|
? now_ns() + args.duration_sec * 1'000'000'000LL : 0;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
if (end_ns && now_ns() > end_ns) break;
|
||||||
|
|
||||||
|
const int slot_idx = seq % RING_SIZE;
|
||||||
|
|
||||||
|
// Заполнить slot pattern'ом
|
||||||
|
fill_pattern<<<grid, block, 0, stream>>>(
|
||||||
|
static_cast<uint8_t*>(slot_ptrs[slot_idx]),
|
||||||
|
args.width, args.height, pitch_y, seq);
|
||||||
|
|
||||||
|
// Sync: либо stream sync (Scenario A), либо event record (Scenario B)
|
||||||
|
if (use_events) {
|
||||||
|
CHECK_CUDA(cudaEventRecord(event, stream));
|
||||||
|
// НЕ делаем cudaStreamSynchronize — consumer сам wait'нет event
|
||||||
|
} else {
|
||||||
|
CHECK_CUDA(cudaStreamSynchronize(stream));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Publish (атомарный seq bump после sync/record)
|
||||||
|
__atomic_store_n(&shared->slots[slot_idx].producer_seq, seq, __ATOMIC_RELEASE);
|
||||||
|
__atomic_store_n(&shared->slots[slot_idx].pts_ns, now_ns(), __ATOMIC_RELEASE);
|
||||||
|
__atomic_store_n(&shared->global_seq, seq, __ATOMIC_RELEASE);
|
||||||
|
|
||||||
|
if (seq % 300 == 0 && seq > 0) {
|
||||||
|
uint64_t torn = __atomic_load_n(&shared->torn_frame_count, __ATOMIC_ACQUIRE);
|
||||||
|
std::cout << "[producer] seq=" << seq << " torn_frames_so_far=" << torn << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
seq++;
|
||||||
|
next_frame += frame_interval;
|
||||||
|
std::this_thread::sleep_until(next_frame);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t torn_final = __atomic_load_n(&shared->torn_frame_count, __ATOMIC_ACQUIRE);
|
||||||
|
std::cout << "[producer] FINAL: published=" << seq
|
||||||
|
<< " torn_frames=" << torn_final << "\n";
|
||||||
|
|
||||||
|
for (int i = 0; i < RING_SIZE; ++i) CHECK_CUDA(cudaFree(slot_ptrs[i]));
|
||||||
|
if (event) cudaEventDestroy(event);
|
||||||
|
munmap(shared, sizeof(SharedHeader));
|
||||||
|
close(fd);
|
||||||
|
unlink(shm_path.c_str());
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user