Skip to content

Commit 21708bf

Browse files
authored
[GPU] tower proof new api (#1067)
Switch to the new `create_proof` API in ceno GPU.

### Benchmark

Fibonacci e2e benchmark. CPU: 5900XT 32 cores. GPU: RTX 5070 TI 16GB.

| Benchmark | Old GPU Time (s) | New GPU Time (s) | GPU Change (%) | Speedup (×) |
|-----------------------------|------------------|------------------|----------------|-------------|
| fibonacci_max_steps_1048576 | 1.162 | 0.897 | -22.77% | 1.29× |
| fibonacci_max_steps_2097152 | 1.627 | 1.250 | -23.19% | 1.30× |
| fibonacci_max_steps_4194304 | 2.441 | 1.919 | -21.40% | 1.27× |

### Latency breakdown

master branch

```
TRACE ZKVM_create_proof [ 3.85s | 0.00% / 100.00% ]
TRACE ┝━ commit_to_pi [ 31.5µs | 0.00% ] profiling_1: true
TRACE ┝━ commit_to_fixed_commit [ 6.54µs | 0.00% ] profiling_1: true
TRACE ┝━ batch commit to traces [ 2.31s | 0.50% / 60.00% ] profiling_1: true
TRACE │ ┝━ [gpu] init pp [ 1.25µs | 0.00% ] profiling_2: true
TRACE │ ┝━ [gpu] hal init [ 116ms | 3.01% ] profiling_2: true
TRACE │ ┝━ [gpu] batch_commit [ 2.17s | 56.44% ] profiling_2: true
TRACE │ ┝━ [gpu] get_pure_commitment [ 220ns | 0.00% ] profiling_2: true
TRACE │ ┝━ [gpu] get_mle_witness_from_commitment [ 2.50ms | 0.06% ] profiling_2: true
TRACE │ ┕━ [gpu] transmute back [ 4.01µs | 0.00% ] profiling_2: true
TRACE ┝━ transfer pk to device [ 9.19ms | 0.24% ] profiling_1: true
TRACE ┝━ main_proofs [ 1.46s | 0.93% / 38.02% ] profiling_1: true
TRACE │ ┝━ create_chip_proof [ 197ms | 0.00% / 5.12% ] table_name: "ADD"
TRACE │ │ ┝━ per_layer_gen_witness [ 3.18ms | 0.08% ] profiling_2: true
TRACE │ │ ┝━ prove_tower_relation [ 153ms | 0.00% / 3.98% ] profiling_2: true
TRACE │ │ │ ┝━ build_tower_witness [ 28.1ms | 0.73% ] profiling_2: true
TRACE │ │ │ ┝━ extract_out_evals_from_gpu_towers [ 531µs | 0.01% ] profiling_2: true
TRACE │ │ │ ┕━ prove_tower_relation [ 125ms | 3.24% ] profiling_2: true
TRACE │ │ ┕━ layer_proof [ 40.4ms | 1.05% ] profiling_2: true
```

target branch

```
TRACE ZKVM_create_proof [ 3.47s | 0.00% / 100.00% ]
TRACE ┝━ commit_to_pi [ 36.0µs | 0.00% ] profiling_1: true
TRACE ┝━ commit_to_fixed_commit [ 8.01µs | 0.00% ] profiling_1: true
TRACE ┝━ batch commit to traces [ 2.28s | 0.52% / 65.85% ] profiling_1: true
TRACE │ ┝━ [gpu] init pp [ 1.11µs | 0.00% ] profiling_2: true
TRACE │ ┝━ [gpu] hal init [ 108ms | 3.11% ] profiling_2: true
TRACE │ ┝━ [gpu] batch_commit [ 2.15s | 62.11% ] profiling_2: true
TRACE │ ┝━ [gpu] get_pure_commitment [ 230ns | 0.00% ] profiling_2: true
TRACE │ ┝━ [gpu] get_mle_witness_from_commitment [ 3.89ms | 0.11% ] profiling_2: true
TRACE │ ┕━ [gpu] transmute back [ 4.06µs | 0.00% ] profiling_2: true
TRACE ┝━ transfer pk to device [ 7.75ms | 0.22% ] profiling_1: true
TRACE ┝━ main_proofs [ 1.11s | 1.02% / 32.02% ] profiling_1: true
TRACE │ ┝━ create_chip_proof [ 157ms | 0.00% / 4.51% ] table_name: "ADD"
TRACE │ │ ┝━ per_layer_gen_witness [ 2.93ms | 0.08% ] profiling_2: true
TRACE │ │ ┝━ prove_tower_relation [ 112ms | 0.00% / 3.22% ] profiling_2: true
TRACE │ │ │ ┝━ build_tower_witness [ 16.5ms | 0.48% ] profiling_2: true
TRACE │ │ │ ┝━ extract_out_evals_from_gpu_towers [ 480µs | 0.01% ] profiling_2: true
TRACE │ │ │ ┕━ prove_tower_relation [ 94.7ms | 2.73% ] profiling_2: true
TRACE │ │ ┕━ layer_proof [ 41.8ms | 1.21% ] profiling_2: true
```
1 parent 596e1a1 commit 21708bf

File tree

4 files changed

+43
-14
lines changed

4 files changed

+43
-14
lines changed

ceno_zkvm/src/scheme/gpu/mod.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ use gkr_iop::gpu::gpu_prover::*;
4545

4646
pub struct GpuTowerProver;
4747

48+
use crate::scheme::constants::NUM_FANIN;
4849
use gkr_iop::gpu::{ArcMultilinearExtensionGpu, MultilinearExtensionGpu};
4950

5051
// Extract out_evals from GPU-built tower witnesses
@@ -59,7 +60,7 @@ fn extract_out_evals_from_gpu_towers<E: ff_ext::ExtensionField>(
5960
let mut w_out_evals = Vec::new();
6061
for (i, gpu_spec) in prod_gpu.iter().enumerate() {
6162
let first_layer_evals: Vec<E> = gpu_spec
62-
.get_final_evals(0)
63+
.get_output_evals()
6364
.expect("Failed to extract final evals from GPU product tower");
6465

6566
// Product tower first layer should have 2 MLEs
@@ -81,7 +82,7 @@ fn extract_out_evals_from_gpu_towers<E: ff_ext::ExtensionField>(
8182
let mut lk_out_evals = Vec::new();
8283
for gpu_spec in logup_gpu.iter() {
8384
let first_layer_evals: Vec<E> = gpu_spec
84-
.get_final_evals(0)
85+
.get_output_evals()
8586
.expect("Failed to extract final evals from GPU logup tower");
8687

8788
// Logup tower first layer should have 4 MLEs
@@ -481,7 +482,7 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> TowerProver<GpuBacke
481482
let span = entered_span!("prove_tower_relation", profiling_2 = true);
482483
let (point_gl, proof_gpu) = cuda_hal
483484
.tower
484-
.create_proof(&input, basic_tr)
485+
.create_proof(&cuda_hal, &input, NUM_FANIN, basic_tr)
485486
.expect("gpu tower create_proof failed");
486487
exit_span!(span);
487488

gkr_iop/src/cpu/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ impl<'a, E: ExtensionField> MultilinearPolynomial<E> for MultilinearExtension<'a
6262
fn evaluations_len(&self) -> usize {
6363
self.evaluations.len()
6464
}
65+
66+
fn bh_signature(&self) -> E {
67+
self.bh_signature()
68+
}
6569
}
6670

6771
impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> ProverBackend for CpuBackend<E, PCS> {

gkr_iop/src/gpu/mod.rs

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ pub use gpu_prover::*;
7171
/// Stores a multilinear polynomial in dense evaluation form.
7272
pub struct MultilinearExtensionGpu<'a, E: ExtensionField> {
7373
/// GPU polynomial data, supporting both base field and extension field
74-
pub mle: GpuFieldType,
75-
_phantom: PhantomData<&'a E>,
74+
pub mle: GpuFieldType<'a>,
75+
_phantom: PhantomData<E>,
7676
}
7777

7878
impl<'a, E: ExtensionField> Default for MultilinearExtensionGpu<'a, E> {
@@ -122,6 +122,26 @@ impl<'a, E: ExtensionField> MultilinearPolynomial<E> for MultilinearExtensionGpu
122122
fn evaluations_len(&self) -> usize {
123123
self.mle.evaluations_len()
124124
}
125+
126+
fn bh_signature(&self) -> E {
127+
if std::any::TypeId::of::<E::BaseField>()
128+
!= std::any::TypeId::of::<p3::goldilocks::Goldilocks>()
129+
{
130+
panic!("GPU backend only supports Goldilocks");
131+
}
132+
133+
match &self.mle {
134+
GpuFieldType::Base(poly) => {
135+
let res: Vec<E> = unsafe { std::mem::transmute(vec![poly.bh_signature()]) };
136+
res[0]
137+
}
138+
GpuFieldType::Ext(poly) => {
139+
let res: Vec<E> = unsafe { std::mem::transmute(vec![poly.bh_signature()]) };
140+
res[0]
141+
}
142+
GpuFieldType::Unreachable => unreachable!(),
143+
}
144+
}
125145
}
126146

127147
impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> {
@@ -190,23 +210,23 @@ impl<'a, E: ExtensionField> MultilinearExtensionGpu<'a, E> {
190210
}
191211

192212
/// Create from base field GpuPolynomial
193-
pub fn from_ceno_gpu_base(mle_gpu: GpuPolynomial) -> Self {
213+
pub fn from_ceno_gpu_base(mle_gpu: GpuPolynomial<'a>) -> Self {
194214
Self {
195215
mle: GpuFieldType::Base(mle_gpu),
196216
_phantom: PhantomData,
197217
}
198218
}
199219

200220
/// Create from extension field GpuPolynomialExt
201-
pub fn from_ceno_gpu_ext(mle_gpu: GpuPolynomialExt) -> Self {
221+
pub fn from_ceno_gpu_ext(mle_gpu: GpuPolynomialExt<'a>) -> Self {
202222
Self {
203223
mle: GpuFieldType::Ext(mle_gpu),
204224
_phantom: PhantomData,
205225
}
206226
}
207227

208228
/// Method for backward compatibility
209-
pub fn from_ceno_gpu(mle_gpu: GpuPolynomial) -> Self {
229+
pub fn from_ceno_gpu(mle_gpu: GpuPolynomial<'a>) -> Self {
210230
Self::from_ceno_gpu_base(mle_gpu)
211231
}
212232

@@ -266,7 +286,8 @@ impl<E: ExtensionField, PCS: PolynomialCommitmentScheme<E>> ProverBackend for Gp
266286
type MultilinearPoly<'a> = MultilinearExtensionGpu<'a, E>;
267287
type Matrix = RowMajorMatrix<E::BaseField>;
268288
#[cfg(feature = "gpu")]
269-
type PcsData = BasefoldCommitmentWithWitnessGpu<E::BaseField, BufferImpl<E::BaseField>>;
289+
type PcsData =
290+
BasefoldCommitmentWithWitnessGpu<E::BaseField, BufferImpl<'static, E::BaseField>>;
270291
#[cfg(not(feature = "gpu"))]
271292
type PcsData = <PCS as PolynomialCommitmentScheme<E>>::CommitmentWithWitness;
272293

gkr_iop/src/hal.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,22 @@
1-
use ff_ext::ExtensionField;
2-
use mpcs::PolynomialCommitmentScheme;
3-
use multilinear_extensions::mle::Point;
4-
use std::{fmt::Debug, sync::Arc};
5-
61
use crate::gkr::layer::{
72
Layer,
83
hal::{LinearLayerProver, SumcheckLayerProver, ZerocheckLayerProver},
94
};
5+
use ff_ext::ExtensionField;
6+
use mpcs::PolynomialCommitmentScheme;
7+
use multilinear_extensions::mle::Point;
8+
use std::{fmt::Debug, sync::Arc};
109

1110
pub trait MultilinearPolynomial<E: ExtensionField> {
1211
fn num_vars(&self) -> usize;
1312
fn eval(&self, point: Point<E>) -> E;
1413

1514
/// Get the length of evaluation data
1615
fn evaluations_len(&self) -> usize;
16+
17+
/// Debug utility: generate a semantic signature value to represent the whole boolean hypercube elements
18+
/// Note: this function is very expensive, as it traverses the whole boolean hypercube.
19+
fn bh_signature(&self) -> E;
1720
}
1821

1922
/// Defines basic types like field, pcs that are common among all devices

0 commit comments

Comments
 (0)