Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add more choices to the quantization tool; add post-processing after sim_anneal() (optimizer.py / ext_quant.cpp) #712

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
10 changes: 5 additions & 5 deletions exllamav2/exllamav2_ext/cpp/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ void apply_rep_penalty_cpu
// {
// if (g_rep_mask) free(g_rep_mask);
// g_vocab_size = vocab_size;
// g_rep_mask = (bool*) malloc(g_vocab_size * sizeof(bool));
// g_rep_mask = (bool*) calloc(1, g_vocab_size * sizeof(bool));
// }
// memset(g_rep_mask, 0, g_vocab_size * sizeof(bool));
bool* g_rep_mask = (bool*) calloc(vocab_size, sizeof(bool));
Expand Down Expand Up @@ -655,7 +655,7 @@ int tfs_cpu

int nc = sort_descending(num_candidates, temp_probs, temp_indices, num_candidates);

float* derivative = (float*) malloc(nc * sizeof(float));
float* derivative = (float*) calloc(1, nc * sizeof(float));
float dsum = 0.0f;
for (int i = 0; i < nc - 2; i++)
{
Expand Down Expand Up @@ -759,9 +759,9 @@ int typical_cpu

int r_candidates = pre_sort_descending(num_candidates, temp_probs, temp_indices);

float* temp = (float*) malloc(r_candidates * sizeof(float));
int* entropy_dev_order = (int*) malloc(r_candidates * sizeof(int));
int* temp_indices_2 = (int*) malloc(r_candidates * sizeof(int));
float* temp = (float*) calloc(1, r_candidates * sizeof(float));
int* entropy_dev_order = (int*) calloc(1, r_candidates * sizeof(int));
int* temp_indices_2 = (int*) calloc(1, r_candidates * sizeof(int));

float neg_entropy = 0.0f;
for (int i = 0; i < r_candidates; i++)
Expand Down
10 changes: 5 additions & 5 deletions exllamav2/exllamav2_ext/cuda/cache.cu
Original file line number Diff line number Diff line change
Expand Up @@ -165,16 +165,16 @@ __global__ void fp16_to_q_kv_paged_kernel

int page = block_table[pages_per_seq * y + x];
int seqlen = cache_seqlens[y];
int vx_a = page_size * x;
int px_a = seqlen - vx_a;
int vx_a = (int64_t)page_size * x;
int px_a = (int64_t)seqlen - vx_a;
int px_b = px_a + q_len;

if (dim % BLOCKSIZE_Q)
{
while ((px_a * dim) % BLOCKSIZE_Q) px_a--;
while ((px_b * dim) % BLOCKSIZE_Q) px_b++;
}

px_a = max(px_a, 0);
px_b = min(px_b, page_size);

Expand Down Expand Up @@ -346,7 +346,7 @@ __global__ void q_to_fp16_kv_paged_kernel
int seqlen = cache_seqlens[y];
if (!seqlen) return;

int vx_a = page_size * x;
int vx_a = (int64_t)page_size * x;
int vx_b = min(vx_a + page_size, seqlen);

if (dim < BLOCKSIZE_Q)
Expand Down Expand Up @@ -491,4 +491,4 @@ void array_q_to_fp16_kv_cuda
v_in, v_scales, v_out,
dim, offset, stride
);
}
}
13 changes: 11 additions & 2 deletions exllamav2/exllamav2_ext/cuda/q_matrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -603,9 +603,18 @@ bool QMatrix::make_sequential(const uint32_t* cpu_g_idx, cudaStream_t stream)
return false;
}

// Zero out the allocated memory
size_t mem_size = (height / 8) * width * sizeof(uint32_t);
err = cudaMemset(cuda_new_qweight, 0, mem_size);
if (err != cudaSuccess) {;;;
printf("CUDA memset failed: %s\n", cudaGetErrorString(err));
cudaFree(cuda_new_qweight); // Free the allocated memory in case of error
return err;
}

uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));
uint32_t* cpu_x_map = (uint32_t*) calloc(1, height * sizeof(uint32_t));
uint32_t* cpu_x_map_inv = (uint32_t*) calloc(1, height * sizeof(uint32_t));

// Group histogram

Expand Down
2 changes: 1 addition & 1 deletion exllamav2/exllamav2_ext/cuda/util.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

void print_global_mem(const half* ptr, int rows, int columns, int stride)
{
half* temp = (half*) malloc(rows * columns * sizeof(half));
half* temp = (half*) calloc(1, rows * columns * sizeof(half));

cudaDeviceSynchronize();
cudaMemcpyAsync(temp, ptr, rows * columns * sizeof(half), cudaMemcpyDeviceToHost);
Expand Down
Loading