[Instant NGP Code Digest - A First Principle Perspective] Core Training Pipeline
UPDATE LOG
  • Completed the kernel Generate Training Samples NeRF
NOTE

This draft is based on commit d64e353db28109a81657879fc88025713d8fad53 (Oct 8, 2025)

Instant-NGP Official Repository:

NVlabs/instant-ngp (https://github.com/NVlabs/instant-ngp)

1. Introduction and Motivation#

In this article, we untangle the core training pipeline from first principles, and ultimately rewrite it as a clean, tidy, modern, and easy-to-understand version that achieves better performance.

2. Kernel: Generate Training Samples NeRF#

Location: instant-ngp/src/testbed_nerf.cu -> generate_training_samples_nerf

Parameters List

| Name | Type |
| --- | --- |
| n_rays | uint32_t |
| aabb | BoundingBox (custom struct) |
| max_samples | uint32_t |
| n_rays_total | uint32_t |
| rng | default_rng_t (PCG RNG wrapper) |
| ray_counter | uint32_t* |
| numsteps_counter | uint32_t* |
| ray_indices_out | uint32_t* |
| rays_out_unnormalized | Ray* (custom ray structure pointer) |
| numsteps_out | uint32_t* |
| coords_out | PitchedPtr<NerfCoordinate> (pitched GPU buffer of sampled coords) |
| n_training_images | uint32_t |
| metadata | TrainingImageMetadata* (per-image camera + rays + focal info) |
| training_xforms | TrainingXForm* (start/end view transform, rolling shutter) |
| density_grid | const uint8_t* |
| max_mip | uint32_t |
| max_level_rand_training | bool |
| max_level_ptr | float* |
| snap_to_pixel_centers | bool |
| train_envmap | bool |
| cone_angle_constant | float |
| distortion | Buffer2DView<const vec2> (image-space distortion LUT) |
| cdf_x_cond_y | const float* |
| cdf_y | const float* |
| cdf_img | const float* |
| cdf_res | ivec2 (2D integer vector) |
| extra_dims_gpu | const float* |
| n_extra_dims | uint32_t |
Complete kernel code:
__global__ void generate_training_samples_nerf(
    const uint32_t n_rays, BoundingBox aabb, const uint32_t max_samples, const uint32_t n_rays_total,
    default_rng_t rng, uint32_t* __restrict__ ray_counter, uint32_t* __restrict__ numsteps_counter,
    uint32_t* __restrict__ ray_indices_out, Ray* __restrict__ rays_out_unnormalized,
    uint32_t* __restrict__ numsteps_out, PitchedPtr<NerfCoordinate> coords_out, const uint32_t n_training_images,
    const TrainingImageMetadata* __restrict__ metadata, const TrainingXForm* training_xforms,
    const uint8_t* __restrict__ density_grid, uint32_t max_mip, bool max_level_rand_training,
    float* __restrict__ max_level_ptr, bool snap_to_pixel_centers, bool train_envmap, float cone_angle_constant,
    Buffer2DView<const vec2> distortion, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y,
    const float* __restrict__ cdf_img, const ivec2 cdf_res, const float* __restrict__ extra_dims_gpu,
    uint32_t n_extra_dims)
{
    const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i >= n_rays)
    {
        return;
    }

    uint32_t img = image_idx(i, n_rays, n_rays_total, n_training_images, cdf_img);
    ivec2 resolution = metadata[img].resolution;

    rng.advance(i * N_MAX_RANDOM_SAMPLES_PER_RAY());
    vec2 uv =
        nerf_random_image_pos_training(rng, resolution, snap_to_pixel_centers, cdf_x_cond_y, cdf_y, cdf_res, img);

    // Negative values indicate masked-away regions
    size_t pix_idx = pixel_idx(uv, resolution, 0);
    if (read_rgba(uv, resolution, metadata[img].pixels, metadata[img].image_data_type).x < 0.0f)
    {
        return;
    }

    float max_level = max_level_rand_training ? (random_val(rng) * 2.0f)
                                              : 1.0f; // Multiply by 2 to ensure 50% of training is at max level

    float motionblur_time = random_val(rng);

    const vec2 focal_length = metadata[img].focal_length;
    const vec2 principal_point = metadata[img].principal_point;
    const float* extra_dims = extra_dims_gpu + img * n_extra_dims;
    const Lens lens = metadata[img].lens;

    const mat4x3 xform =
        get_xform_given_rolling_shutter(training_xforms[img], metadata[img].rolling_shutter, uv, motionblur_time);

    Ray ray_unnormalized;
    const Ray* rays_in_unnormalized = metadata[img].rays;
    if (rays_in_unnormalized)
    {
        // Rays have been explicitly supplied. Read them.
        ray_unnormalized = rays_in_unnormalized[pix_idx];

        /* DEBUG - compare the stored rays to the computed ones
        const mat4x3 xform = get_xform_given_rolling_shutter(training_xforms[img], metadata[img].rolling_shutter,
        uv, 0.f); Ray ray2; ray2.o = xform[3]; ray2.d = f_theta_distortion(uv, principal_point, lens); ray2.d =
        (xform.block<3, 3>(0, 0) * ray2.d).normalized(); if (i==1000) { printf("\n%d uv %0.3f,%0.3f pixel
        %0.2f,%0.2f transform from [%0.5f %0.5f %0.5f] to [%0.5f %0.5f %0.5f]\n" " origin [%0.5f %0.5f %0.5f] vs
        [%0.5f %0.5f %0.5f]\n" " direction [%0.5f %0.5f %0.5f] vs [%0.5f %0.5f %0.5f]\n" , img,uv.x, uv.y,
        uv.x*resolution.x, uv.y*resolution.y,
        training_xforms[img].start[3].x,training_xforms[img].start[3].y,training_xforms[img].start[3].z,
        training_xforms[img].end[3].x,training_xforms[img].end[3].y,training_xforms[img].end[3].z,
        ray_unnormalized.o.x,ray_unnormalized.o.y,ray_unnormalized.o.z,
        ray2.o.x,ray2.o.y,ray2.o.z,
        ray_unnormalized.d.x,ray_unnormalized.d.y,ray_unnormalized.d.z,
        ray2.d.x,ray2.d.y,ray2.d.z);
        }
        */
    }
    else
    {
        ray_unnormalized = uv_to_ray(0, uv, resolution, focal_length, xform, principal_point, vec3(0.0f), 0.0f,
                                     1.0f, 0.0f, {}, {}, lens, distortion);
        if (!ray_unnormalized.is_valid())
        {
            ray_unnormalized = {xform[3], xform[2]};
        }
    }

    vec3 ray_d_normalized = normalize(ray_unnormalized.d);

    vec2 tminmax = aabb.ray_intersect(ray_unnormalized.o, ray_d_normalized);
    float cone_angle = calc_cone_angle(dot(ray_d_normalized, xform[2]), focal_length, cone_angle_constant);

    // The near distance prevents learning of camera-specific fudge right in front of the camera
    tminmax.x = fmaxf(tminmax.x, 0.0f);

    float startt = advance_n_steps(tminmax.x, cone_angle, random_val(rng));
    vec3 idir = vec3(1.0f) / ray_d_normalized;

    // first pass to compute an accurate number of steps
    uint32_t j = 0;
    float t = startt;
    vec3 pos;

    while (aabb.contains(pos = ray_unnormalized.o + t * ray_d_normalized) && j < NERF_STEPS())
    {
        float dt = calc_dt(t, cone_angle);
        uint32_t mip = mip_from_dt(dt, pos, max_mip);
        if (density_grid_occupied_at(pos, density_grid, mip))
        {
            ++j;
            t += dt;
        }
        else
        {
            t = advance_to_next_voxel(t, cone_angle, pos, ray_d_normalized, idir, mip);
        }
    }

    if (j == 0 && !train_envmap)
    {
        return;
    }

    uint32_t numsteps = j;
    uint32_t base = atomicAdd(numsteps_counter, numsteps); // first entry in the array is a counter
    if (base + numsteps > max_samples)
    {
        return;
    }

    coords_out += base;

    uint32_t ray_idx = atomicAdd(ray_counter, 1);

    ray_indices_out[ray_idx] = i;
    rays_out_unnormalized[ray_idx] = ray_unnormalized;
    numsteps_out[ray_idx * 2 + 0] = numsteps;
    numsteps_out[ray_idx * 2 + 1] = base;

    vec3 warped_dir = warp_direction(ray_d_normalized);
    t = startt;
    j = 0;
    while (aabb.contains(pos = ray_unnormalized.o + t * ray_d_normalized) && j < numsteps)
    {
        float dt = calc_dt(t, cone_angle);
        uint32_t mip = mip_from_dt(dt, pos, max_mip);
        if (density_grid_occupied_at(pos, density_grid, mip))
        {
            coords_out(j)->set_with_optional_extra_dims(warp_position(pos, aabb), warped_dir, warp_dt(dt),
                                                        extra_dims, coords_out.stride_in_bytes);
            ++j;
            t += dt;
        }
        else
        {
            t = advance_to_next_voxel(t, cone_angle, pos, ray_d_normalized, idir, mip);
        }
    }

    if (max_level_rand_training)
    {
        max_level_ptr += base;
        for (j = 0; j < numsteps; ++j)
        {
            max_level_ptr[j] = max_level;
        }
    }
}

2.1 CUDA indexing formula#

const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= n_elements)
{
    return;
}

2.1.1 Global Thread Index [1]#

$$\boxed{ i = \text{threadIdx}_x + \text{blockIdx}_x \cdot \text{blockDim}_x }$$

CUDA Indexing

| Term | Meaning |
| --- | --- |
| $\text{threadIdx}_x$ | index of the thread inside its block |
| $\text{blockIdx}_x$ | index of the block inside the grid |
| $\text{blockDim}_x$ | number of threads per block |
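
For context, here is a minimal sketch of how such a one-thread-per-ray kernel is typically launched from the host. The block size of 128, the helper div_round_up, and the wrapper function name are illustrative assumptions, not taken from the instant-ngp source (which uses its own launch helpers):

// Hypothetical host-side launch configuration (illustrative only). One thread is
// scheduled per ray; threads whose index i exceeds n_rays return early, as shown above.
constexpr uint32_t BLOCK_SIZE = 128; // illustrative block size

inline uint32_t div_round_up(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

void launch_generate_training_samples(uint32_t n_rays /*, kernel arguments elided */) {
    const dim3 blocks(div_round_up(n_rays, BLOCK_SIZE));
    const dim3 threads(BLOCK_SIZE);
    // generate_training_samples_nerf<<<blocks, threads>>>(n_rays, /* ... */);
}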

2.2 Determine image index for a given ray#

uint32_t img = image_idx(i, n_rays, n_rays_total, n_training_images, cdf_img);

2.2.1 CUDA Function image_idx#

inline NGP_HOST_DEVICE uint32_t image_idx(
    uint32_t base_idx, uint32_t n_rays, uint32_t n_rays_total, uint32_t n_training_images,
    const float* __restrict__ cdf = nullptr, float* __restrict__ pdf = nullptr
) {
    if (cdf) {
        float sample = ld_random_val(base_idx/* + n_rays_total*/, 0xdeadbeef);
        // float sample = random_val(base_idx/* + n_rays_total*/);
        uint32_t img = binary_search(sample, cdf, n_training_images);
        if (pdf) {
            float prev = img > 0 ? cdf[img-1] : 0.0f;
            *pdf = (cdf[img] - prev) * n_training_images;
        }
        return img;
    }

    // return ((base_idx/* + n_rays_total*/) * 56924617 + 96925573) % n_training_images;

    // Neighboring threads in the warp process the same image. Increases locality.
    if (pdf) {
        *pdf = 1.0f;
    }
    return (((base_idx/* + n_rays_total*/) * n_training_images) / n_rays) % n_training_images;
}
| Parameter | Type | Note |
| --- | --- | --- |
| base_idx | uint32_t | Unique ray/thread index used to select the image |
| n_rays | uint32_t | Total rays scheduled in the current iteration (controls the uniform mapping) |
| n_rays_total | uint32_t | Unused here (commented out in the index computation) |
| n_training_images | uint32_t | Number of images available for sampling (upper bound of the output index) |
| cdf | const float* | Optional CDF for importance sampling; nullptr in default NeRF training |
| pdf | float* | Output probability weight, only used when cdf != nullptr (never touched in default training) |

2.2.2 Base Version#

__device__ uint32_t image_idx(
    const uint32_t base_idx,
    const uint32_t n_rays,
    const uint32_t n_training_images
) {
    return base_idx * n_training_images / n_rays % n_training_images;
}

Intuitive interpretation: each image receives approximately $\frac{N_R}{N_I}$ rays; rays are distributed proportionally among the images.

$$f(i) = \left\lfloor \frac{i \cdot N_I}{N_R} \right\rfloor \bmod N_I$$

Where:

| Symbol | Corresponding variable |
| --- | --- |
| $i$ | base_idx |
| $N_R$ | n_rays |
| $N_I$ | n_training_images |
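
As a small sanity check of the formula: with $N_R = 8$ rays and $N_I = 4$ images, $f(i) = \lfloor i \cdot 4 / 8 \rfloor \bmod 4$, so rays 0–1 map to image 0, rays 2–3 to image 1, rays 4–5 to image 2, and rays 6–7 to image 3. Each image receives $N_R / N_I = 2$ rays, and consecutive threads hit the same image, which is exactly the locality the code comment mentions.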

2.2.3 CDF and PDF#

For more details about CDF and PDF, please refer to Appendix CDF: Cumulative Distribution Function.

TODO: explain the CDF & PDF branch


2.3 Get Image Resolution#

ivec2 resolution = metadata[img].resolution;

2.3.1 TrainingImageMetadata Struct#

struct TrainingImageMetadata {
    // Camera intrinsics and additional data associated with a NeRF training image
    // the memory to back the pixels and rays is held by GPUMemory objects in the NerfDataset and copied here.
    const void* pixels = nullptr;
    EImageDataType image_data_type = EImageDataType::Half;

    const float* depth = nullptr;
    const Ray* rays = nullptr;

    Lens lens = {};
    ivec2 resolution = ivec2(0);
    vec2 principal_point = vec2(0.5f);
    vec2 focal_length = vec2(1000.f);
    vec4 rolling_shutter = vec4(0.0f);
    vec3 light_dir = vec3(0.f); // TODO: replace this with more generic float[] of task-specific metadata.
};
| Field | Type | Meaning |
| --- | --- | --- |
| pixels | const void* | Pointer to the pixel buffer in GPU memory |
| image_data_type | EImageDataType | Pixel storage format (Byte/Half/Float, etc.) |
| depth | const float* | Optional per-pixel depth values (nullable) |
| rays | const Ray* | Optional precomputed rays (nullable) |
| lens | Lens | Lens configuration (distortion and optical parameters) |
| resolution | ivec2 | Image width & height |
| principal_point | vec2 | Camera optical center offset |
| focal_length | vec2 | Focal length fx, fy |
| rolling_shutter | vec4 | Rolling shutter timing & motion model |
| light_dir | vec3 | View lighting direction (non-general metadata placeholder) |

2.3.2 How to compute resolution#

NOTE

To avoid getting trapped in the endless details of image loading and preprocessing, we assume for now that the image resolution is precomputed and stored in the TrainingImageMetadata struct. Image loading and preprocessing will be covered in a future article.

For the NeRF Synthetic dataset, we can simply treat the resolution as a constant (800 × 800); it is safe to assume that all images in the dataset share the same resolution and that it never changes during training.
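
To make the per-image setup concrete, here is a hedged host-side sketch of filling in the metadata for one NeRF Synthetic image. The function name is hypothetical and this is not instant-ngp's actual loader; the focal-length formula is the standard Blender/NeRF-Synthetic convention f = 0.5 · W / tan(camera_angle_x / 2), with camera_angle_x coming from transforms.json:

// Hypothetical setup of TrainingImageMetadata for an 800 x 800 NeRF Synthetic image
// (illustrative sketch, not the actual instant-ngp data loader).
TrainingImageMetadata make_synthetic_metadata_sketch(const void* gpu_pixels, float camera_angle_x) {
    TrainingImageMetadata m;
    m.pixels = gpu_pixels;
    m.image_data_type = EImageDataType::Half;   // matches the struct's default
    m.resolution = ivec2(800, 800);

    // Standard Blender/NeRF convention: focal length from the horizontal field of view.
    float focal = 0.5f * 800.0f / tanf(0.5f * camera_angle_x);
    m.focal_length = vec2(focal);               // fx == fy for this dataset
    m.principal_point = vec2(0.5f);             // optical center at the image center
    return m;
}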


2.4 Advance RNG State#

rng.advance(i * N_MAX_RANDOM_SAMPLES_PER_RAY());

2.4.1 default_rng_t (tcnn::pcg32) Struct#

default_rng_t (tcnn::pcg32) is a wrapper around the PCG Random Number Generator. For more details about PCG, please refer to Appendix PCG: Permuted Congruential Generator.

2.4.2 Why advance RNG state?#

In tiny-cuda-nn / instant-ngp, each CUDA thread generates one ray:

const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; // unique per thread
rng.advance(i * N_MAX_RANDOM_SAMPLES_PER_RAY());

This line is not arbitrary; it is a design requirement.

PCG produces a sequence of numbers. A PCG generator is deterministic:

$$x_{n+1} = f(x_n)$$

If all threads start with the same RNG state, then:

| Thread | RNG values |
| --- | --- |
| Thread 0 | 0.83, 0.21, 0.55, … |
| Thread 1 | 0.83, 0.21, 0.55, … |
| Thread 2 | 0.83, 0.21, 0.55, … |
  • every pixel ray gets the same random samples
  • training collapses (all rays identical → no learning)

2.4.3 How advance(k) jumps ahead in the PCG sequence#

The function:

rng.advance(K);

mathematically means:

$$x_{n+K} = f^{(K)}(x_n)$$

It fast-forwards the PCG stream without generating intermediate numbers.

Each thread should get different random numbers, so the RNG state is offset by the thread ID:

That means:

| Thread i | RNG begins at position |
| --- | --- |
| 0 | base + 0 × stride |
| 1 | base + 1 × stride |
| 2 | base + 2 × stride |

Therefore:

  • no collision
  • parallel-safe randomness
  • deterministic reproducibility

2.4.4 Why multiply by N_MAX_RANDOM_SAMPLES_PER_RAY()?#

Because each ray may consume up to that many random numbers, threads are spaced far enough apart in the sequence that their streams never overlap.

If worst case = 64 random samples per ray, then:

| Ray index i | RNG range reserved |
| --- | --- |
| 0 | 0–63 |
| 1 | 64–127 |
| 2 | 128–191 |

Each ray lives in its own section of the RNG sequence.
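
Putting 2.4.2–2.4.4 together, the per-thread pattern looks like the sketch below. It assumes the pcg32-style API used in the code above (advance() and next_float()); the constant 64 stands in for N_MAX_RANDOM_SAMPLES_PER_RAY() and the kernel name is made up for illustration:

// Minimal sketch: each thread takes a private copy of the generator, jumps to its
// own reserved segment of the stream, and then draws values independently.
__global__ void per_thread_rng_sketch(uint32_t n_rays, default_rng_t rng, float* __restrict__ out) {
    const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i >= n_rays) {
        return;
    }

    // Reserve [i * 64, (i + 1) * 64) of the sequence for this thread, mirroring
    // rng.advance(i * N_MAX_RANDOM_SAMPLES_PER_RAY()) in the training kernel.
    rng.advance(i * 64);

    // The draws below consume at most 64 values, so neighbouring threads never
    // overlap even though they all started from the same seed.
    out[i] = rng.next_float();
}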


2.5 Sample Image Position#

vec2 uv = nerf_random_image_pos_training(rng, resolution, snap_to_pixel_centers, cdf_x_cond_y, cdf_y, cdf_res, img);

2.5.1 CUDA Function nerf_random_image_pos_training#

inline __device__ vec2 nerf_random_image_pos_training(
    default_rng_t& rng, const ivec2& resolution, bool snap_to_pixel_centers,
    const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y,
    const ivec2& cdf_res, uint32_t img, float* __restrict__ pdf = nullptr
) {
    vec2 uv = random_val_2d(rng);

    if (cdf_x_cond_y) {
        uv = sample_cdf_2d(uv, img, cdf_res, cdf_x_cond_y, cdf_y, pdf);
    } else {
        // // Warp-coherent tile
        // uv.x = __shfl_sync(0xFFFFFFFF, uv.x, 0);
        // uv.y = __shfl_sync(0xFFFFFFFF, uv.y, 0);
        // const ivec2 TILE_SIZE = {8, 4};
        // uv = (uv * vec2(resolution - TILE_SIZE) + vec2(tcnn::lane_id() % TILE_SIZE.x, tcnn::lane_id() / threadIdx.x)) / vec2(resolution);

        if (pdf) {
            *pdf = 1.0f;
        }
    }

    if (snap_to_pixel_centers) {
        uv = (vec2(clamp(ivec2(uv * vec2(resolution)), 0, resolution - 1)) + 0.5f) / vec2(resolution);
    }

    return uv;
}
| Parameter | Type | Note |
| --- | --- | --- |
| rng | default_rng_t& | Random number generator reference, mutated on each call |
| resolution | ivec2 | Image width/height used to scale UV coordinates |
| snap_to_pixel_centers | bool | If true, UV is snapped to a pixel center rather than sampled continuously |
| cdf_x_cond_y | const float* (optional) | X-conditioned CDF table for importance sampling; if non-null, enables 2D CDF sampling |
| cdf_y | const float* (optional) | Marginal distribution along the Y axis for CDF sampling |
| cdf_res | ivec2 | Resolution of the CDF grid (width, height) corresponding to cdf_x_cond_y/cdf_y |
| img | uint32_t | Image index; determines which image's CDF to sample from |
| pdf | float* (optional) | Output probability density; written only when CDF sampling is used or a PDF is requested |

2.5.2 Base Version#

__device__ tcnn::vec2 nerf_random_image_pos_training(
    tcnn::pcg32& rng,
    const tcnn::ivec2& resolution,
    const bool snap_to_pixel_centers
) {
    tcnn::vec2 uv = {rng.next_float(), rng.next_float()};
    if (snap_to_pixel_centers) {
        uv = (tcnn::vec2(tcnn::clamp(tcnn::ivec2(uv * tcnn::vec2(resolution)), 0, resolution - 1)) + 0.5f) / tcnn::vec2(resolution);
    }
    return uv;
}

It generates a random UV coordinate inside a training image. UV is normalized to [0,1] × [0,1]. This UV is later turned into a ray shooting into the NeRF scene.
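
For example, with an 800 × 800 image and snap_to_pixel_centers enabled, a sampled uv.x = 0.2003 maps to pixel column ⌊0.2003 × 800⌋ = 160 and is then snapped back to the pixel-center coordinate (160 + 0.5) / 800 = 0.200625.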

2.5.3 The Key Takeaway#

For the NeRF Synthetic dataset, training almost always takes the else branch.

Meaning:

  • UV is uniformly random
  • PDF defaults to 1.0
  • CDF importance sampling is disabled by default

2.6 Get Pixel Index#

size_t pix_idx = pixel_idx(uv, resolution, 0);

2.6.1 CUDA Function pixel_idx#

inline NGP_HOST_DEVICE ivec2 image_pos(const vec2& pos, const ivec2& resolution)
{
    return clamp(ivec2(pos * vec2(resolution)), 0, resolution - 1);
}

inline NGP_HOST_DEVICE uint64_t pixel_idx(const ivec2& px, const ivec2& resolution, uint32_t img)
{
    return px.x + px.y * resolution.x + img * (uint64_t)resolution.x * resolution.y;
}

inline NGP_HOST_DEVICE uint64_t pixel_idx(const vec2& uv, const ivec2& resolution, uint32_t img)
{
    return pixel_idx(image_pos(uv, resolution), resolution, img);
}

2.6.2 Base Version#

inline __device__ uint64_t pixel_idx(const tcnn::vec2& uv, const tcnn::ivec2& resolution, uint32_t img) {
    tcnn::ivec2 px = tcnn::clamp(tcnn::ivec2(uv * tcnn::vec2(resolution)), 0, resolution - 1);
    return px.x + px.y * resolution.x + img * (uint64_t) resolution.x * resolution.y;
}

They map uv (normalized float coordinates) → pixel (x, y) → flat pixel index across the entire dataset.

CUDA Function image_pos()#

inline NGP_HOST_DEVICE ivec2 image_pos(const vec2& pos, const ivec2& resolution)
{
    return clamp(ivec2(pos * vec2(resolution)), 0, resolution - 1);
}

Input

  • pos = uv ∈ [0,1] (normalized image space)
  • resolution = (W,H)

What it does

  1. pos * resolution converts normalized UV → pixel space. Example: (0.2, 0.5) * (800, 800) → (160, 400)

  2. Convert to integer ivec2(...) (drop decimals)

  3. clamp(..., 0, resolution-1) ensures pixel cannot go outside image

Output

A valid pixel coordinate (x,y) inside the image:

0 ≤ x < width
0 ≤ y < height

CUDA Function pixel_idx(px)#

inline NGP_HOST_DEVICE uint64_t pixel_idx(const ivec2& px, const ivec2& resolution, uint32_t img)
{
    return px.x + px.y * resolution.x + img * (uint64_t)resolution.x * resolution.y;
}

Meaning

This converts a pixel coordinate (x, y) plus an image number (img) into a 1D index for flattened dataset storage.

Breakdown:

$$\text{pixel offset in image} = x + y \cdot \text{width}$$
$$\text{image offset} = \text{img} \cdot (\text{width} \cdot \text{height})$$

So total index = index inside image + offset to image block
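
Quick numeric check: for uv = (0.2, 0.5), an 800 × 800 image, and img = 3, image_pos gives px = (160, 400), so pixel_idx returns 160 + 400 · 800 + 3 · 800 · 800 = 160 + 320000 + 1920000 = 2,240,160.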


CUDA Function pixel_idx(uv) — UV version#

inline NGP_HOST_DEVICE uint64_t pixel_idx(const vec2& uv, const ivec2& resolution, uint32_t img)
{
    return pixel_idx(image_pos(uv, resolution), resolution, img);
}

This is just a convenience overload:

Steps internally:

uv → pixel(x,y) using image_pos()
(x,y,img) → 1D index using pixel_idx()

So this lets you write:

pixel_idx(uv, resolution, img);

instead of:

ivec2 px = image_pos(uv, resolution);
pixel_idx(px, resolution, img);

2.7 Check Pixel Validity#

if (read_rgba(uv, resolution, metadata[img].pixels, metadata[img].image_data_type).x < 0.0f)
{
    return;
}

Given a pixel coordinate (either a normalized uv or an integer px), read_rgba looks up the GPU image data and returns a vec4(R, G, B, A) in linear RGB space.

2.7.1 CUDA Function read_rgba#

inline NGP_HOST_DEVICE vec4 read_rgba(ivec2 px, const ivec2& resolution, const void* pixels,
                                      EImageDataType image_data_type, uint32_t img = 0)
{
    switch (image_data_type)
    {
        default:
            // This should never happen. Bright red to indicate this.
            return vec4{5.0f, 0.0f, 0.0f, 1.0f};
        case EImageDataType::Byte:
        {
            uint32_t val = ((uint32_t*)pixels)[pixel_idx(px, resolution, img)];
            if (val == 0x00FF00FF)
            {
                return vec4(-1.0f);
            }

            vec4 result = rgba32_to_rgba(val);
            result.rgb() = srgb_to_linear(result.rgb()) * result.a;
            return result;
        }
        case EImageDataType::Half:
        {
            __half val[4];
            *(uint64_t*)&val[0] = ((uint64_t*)pixels)[pixel_idx(px, resolution, img)];
            return vec4{(float)val[0], (float)val[1], (float)val[2], (float)val[3]};
        }
        case EImageDataType::Float:
            return ((vec4*)pixels)[pixel_idx(px, resolution, img)];
    }
}

inline NGP_HOST_DEVICE vec4 read_rgba(vec2 pos, const ivec2& resolution, const void* pixels,
                                      EImageDataType image_data_type, uint32_t img = 0)
{
    return read_rgba(image_pos(pos, resolution), resolution, pixels, image_data_type, img);
}

It supports three image formats:

| Format | Stored As | Per Channel | Explanation |
| --- | --- | --- | --- |
| Byte | uint32_t | 8-bit RGBA | sRGB → linear conversion with alpha premultiply |
| Half | __half[4] packed in a uint64_t | 16-bit float | No conversion, read directly |
| Float | vec4* | 32-bit float | Pure float, no transformation |

Case 1: Byte — 8-bit texture stored as uint32_t#

uint32_t val = ((uint32_t*)pixels)[pixel_idx(px, resolution, img)];
if (val == 0x00FF00FF) {
    return vec4(-1.0f);
}
vec4 result = rgba32_to_rgba(val);
result.rgb() = srgb_to_linear(result.rgb()) * result.a;
return result;

Explanation:

  1. Load 4 × 8-bit channels packed in one uint32_t
  2. If the value is 0x00FF00FF, treat the pixel as masked → return vec4(-1), meaning an invalid pixel
  3. Unpack the packed 32-bit value into a float vec4 in [0, 1] (rgba32_to_rgba)
  4. Convert sRGB → linear and premultiply by alpha

This format comes from NeRF synthetic datasets.

Case 2: Half — 16-bit floating point (stored compact)#

__half val[4];
*(uint64_t*)&val[0] = ((uint64_t*)pixels)[pixel_idx(...)]
return vec4{(float)val[0], ... }

Breakdown:

| Stored as | Read as | Why? |
| --- | --- | --- |
| 4 × half (16-bit each) = 8 bytes | single uint64_t load | faster & coalesced |
| Reinterpreted as __half[4] | converted to float | for computation |

Used for lighter GPU memory footprint with HDR capability.

Case 3: Float — direct vec4#

return ((vec4*)pixels)[pixel_idx(px, resolution, img)];

Fastest — no conversion. Used when training directly with float images.

Second overload — UV input#

inline vec4 read_rgba(vec2 pos, ...)
{
return read_rgba(image_pos(pos, resolution), ...);
}

Meaning:

  1. Convert $(u, v) \in [0,1)$ → pixel coordinate via image_pos() (uv * resolution, clamped to the image bounds)
  2. Call integer version

2.7.2 Base Version#

inline __device__ float srgb_to_linear(float x) {
    return (x <= 0.04045f) ? (x * (1.f / 12.92f)) : powf((x + 0.055f) * (1.f / 1.055f), 2.4f);
}

inline __device__ tcnn::vec4 read_rgba(
    const tcnn::vec2& uv,
    const tcnn::ivec2& resolution,
    const void* pixels,
    const uint32_t img = 0 // optional, default works same as before
) {
    // ---------------------------------------------
    // 1. Get pixel address from uv + resolution
    // ---------------------------------------------
    const uint64_t idx = pixel_idx(uv, resolution, img);
    const uint32_t rgba = static_cast<const uint32_t*>(pixels)[idx]; // packed RGBA, R in the low byte

    // ---------------------------------------------
    // 2. Masked pixel → skip (-1 = INVALID)
    // ---------------------------------------------
    if (rgba == 0x00FF00FFu) return {-1.f, -1.f, -1.f, -1.f};

    // ---------------------------------------------
    // 3. Extract channels [0–255] → float [0–1]
    // ---------------------------------------------
    const float r = static_cast<float>((rgba >> 0) & 0xFF) * (1.f / 255.f);
    const float g = static_cast<float>((rgba >> 8) & 0xFF) * (1.f / 255.f);
    const float b = static_cast<float>((rgba >> 16) & 0xFF) * (1.f / 255.f);
    const float a = static_cast<float>((rgba >> 24) & 0xFF) * (1.f / 255.f);

    return {srgb_to_linear(r) * a,
            srgb_to_linear(g) * a,
            srgb_to_linear(b) * a,
            a};
}
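
Note that the training kernel only inspects the .x component of the returned value (read_rgba(...).x < 0.0f), so returning -1 in every channel is more than enough to signal a masked pixel.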

2.8 Determine Maximum Mip Level for Training#

float max_level = max_level_rand_training ? (random_val(rng) * 2.0f) : 1.0f; // Multiply by 2 to ensure 50% of training is at max level

NeRF Synthetic training effectively always uses max_level = 1.0f, because max_level_rand_training is false by default.

2.8.1 Base Version#

float max_level = 1.0f; // default

As mentioned, NeRF Synthetic training does not use random mip levels. It’s safe to assume max_level = 1.0f always.

2.9 Get Transform with Rolling Shutter and Motion Blur#

float motionblur_time = random_val(rng);
...
const mat4x3 xform = get_xform_given_rolling_shutter(training_xforms[img], metadata[img].rolling_shutter, uv, motionblur_time);

Samples a random time in [0,1] for motion blur simulation during training, then computes the camera-to-world transform at that time, accounting for rolling shutter effects.

2.9.1 Why Motion Blur?#

In Instant-NGP, motionblur_time controls where along the temporal exposure of a rolling-shutter (or moving) camera the sample is taken. Think of it as simulating a camera whose shutter isn't instantaneous: different rays observe the world at slightly different times during the frame capture.
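
To make the idea concrete, here is a hedged sketch of what a time-dependent transform could look like. It is a paraphrase, not the actual get_xform_given_rolling_shutter implementation: the exact meaning of the rolling_shutter components and the interpolation scheme (the real code interpolates poses more carefully than a plain lerp) should be checked against the instant-ngp source, and the function name below is made up:

// Sketch (assumed semantics, not the verbatim implementation): derive a per-pixel
// capture time from the rolling-shutter parameters and the sampled motion-blur
// time, then blend between the start and end camera poses at that time.
__device__ mat4x3 xform_at_time_sketch(const TrainingXForm& xform, const vec4& rolling_shutter,
                                       const vec2& uv, float motionblur_time) {
    // Hypothetical convention: x = constant offset, y/z = per-uv shutter slope,
    // w = exposure duration scaling the random motion-blur time.
    float t = rolling_shutter.x + rolling_shutter.y * uv.x
            + rolling_shutter.z * uv.y + rolling_shutter.w * motionblur_time;

    // Plain per-column lerp between the two bracketing poses, for brevity.
    mat4x3 out;
    for (int c = 0; c < 4; ++c) {
        out[c] = xform.start[c] * (1.0f - t) + xform.end[c] * t;
    }
    return out;
}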

2.10 Read or Compute the Unnormalized Ray#

Ray ray_unnormalized;
const Ray* rays_in_unnormalized = metadata[img].rays;
if (rays_in_unnormalized)
{
    // Rays have been explicitly supplied. Read them.
    ray_unnormalized = rays_in_unnormalized[pix_idx];

    /* DEBUG - compare the stored rays to the computed ones
       (commented-out printf block; see the full kernel listing in Section 2) */
}
else
{
    ray_unnormalized = uv_to_ray(0, uv, resolution, focal_length, xform, principal_point, vec3(0.0f), 0.0f,
                                 1.0f, 0.0f, {}, {}, lens, distortion);
    if (!ray_unnormalized.is_valid())
    {
        ray_unnormalized = {xform[3], xform[2]};
    }
}
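
The else branch builds the ray from the camera model. The full uv_to_ray handles lens distortion, depth of field, and the distortion lookup table; as a mental model only (an assumption-laden sketch with a made-up name, not the real function), a plain pinhole version would look roughly like this:

// Minimal pinhole sketch (hypothetical helper; ignores lens distortion, depth of
// field, and the distortion LUT that the real uv_to_ray supports).
__device__ Ray pinhole_uv_to_ray_sketch(const vec2& uv, const ivec2& resolution,
                                        const vec2& focal_length, const vec2& principal_point,
                                        const mat4x3& xform) {
    // Camera-space direction: offset from the principal point, scaled by
    // resolution / focal length, with z = 1 along the view axis.
    vec3 dir = {
        (uv.x - principal_point.x) * (float)resolution.x / focal_length.x,
        (uv.y - principal_point.y) * (float)resolution.y / focal_length.y,
        1.0f,
    };

    // Rotate into world space using the 3x3 part of the camera-to-world
    // transform (columns 0..2); column 3 is the camera origin.
    dir = xform[0] * dir.x + xform[1] * dir.y + xform[2] * dir.z;
    return {xform[3], dir}; // unnormalized, like ray_unnormalized in the kernel
}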
