imatrix: add option to display importance score statistics for a given imatrix file #12718

Open · wants to merge 12 commits into master
7 changes: 7 additions & 0 deletions common/arg.cpp
@@ -2504,6 +2504,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.i_chunk = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--show-statistics"},
string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
[](common_params & params) {
params.show_statistics = true;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"-pps"},
string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
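With --show-statistics wired into the argument parser above, a typical invocation might look like the sketch below (hedged: the llama-imatrix binary name and the imatrix.dat path are placeholders for your own build artifacts; per the main() changes further down, the model itself is never loaded in this mode, so only the imatrix file is strictly needed):

    # print per-tensor statistics from an existing imatrix file, then exit
    llama-imatrix --in-file imatrix.dat --show-statistics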
5 changes: 3 additions & 2 deletions common/common.h
@@ -405,8 +405,9 @@ struct common_params {
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk

bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
bool show_statistics = false; // show imatrix statistics per tensor

// cvector-generator params
int n_pca_batch = 100;
164 changes: 147 additions & 17 deletions examples/imatrix/imatrix.cpp
@@ -1,30 +1,30 @@
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <mutex>
#include <numeric>
#include <sstream>
#include <thread>
#include <unordered_map>
#include <vector>

#include "arg.h"
#include "common.h"
#include "llama.h"
#include "log.h"

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output]\n"
" [--chunk 123] [--output-frequency 10] [--save-frequency 0] [--show-statistics]\n"
" [--no-ppl] [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n", argv[0]);
LOG("\n");
}

@@ -34,13 +34,27 @@ struct Stats {
int ncall = 0;
};

struct tensor_statistics {
    std::string tensor;
    float total   = 0; // sum of the per-element mean activations
    float mean    = 0; // average activation (total / elements)
    float max     = 0; // largest activation
    float min     = 0; // smallest activation
    float stddev  = 0; // population standard deviation
    float cv      = 0; // coefficient of variation (stddev / mean)
    float zd      = 0; // fraction of elements with z-score > 1
    float active  = 0; // fraction of elements above the activity threshold
    float entropy = 0; // Shannon entropy of the normalized activations
    int elements  = 0; // number of elements in the tensor
};

class IMatrixCollector {
public:
IMatrixCollector() = default;
void set_params(common_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int ncall = -1) const;
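// Editorial note (not part of the PR): when tstats is non-null, load_imatrix()
// also computes per-tensor statistics while loading and appends them to the vector.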
bool load_imatrix(const char * fname, std::vector<tensor_statistics> * tstats = nullptr);
private:
std::unordered_map<std::string, Stats> m_stats;
common_params m_params;
@@ -69,6 +83,35 @@ static std::string filter_tensor_name(const char * name) {
return wname;
}

static void process_tensor_name(const std::string & input, std::string & layer, std::string & tensor) {
std::vector<std::string> name;
std::istringstream stream(input);
std::string item;

while (std::getline(stream, item, '.')) {
name.push_back(item);
}
for (size_t i = 0; i < name.size(); ++i) {
if (name[i] == "blk" && i + 1 < name.size()) {
layer = name[i + 1];
break;
}
}
for (size_t i = 0; i < name.size(); ++i) {
if (name[i] == "weight" && i > 0) {
tensor = name[i - 1];
break;
}
}

if (tensor.empty()) {
tensor = input;
}
if (layer.empty()) {
layer = "-";
}
}
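// Editorial example (not part of the PR): "blk.12.attn_k.weight" yields
// layer = "12" and tensor = "attn_k"; a name without a "blk" segment, such as
// "output.weight", yields layer = "-" and tensor = "output".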

bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
GGML_UNUSED(user_data);

@@ -289,7 +332,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}

bool IMatrixCollector::load_imatrix(const char * fname, std::vector<tensor_statistics> * ts) {
std::ifstream in(fname, std::ios::binary);
if (!in) {
LOG_ERR("%s: failed to open %s\n",__func__, fname);
@@ -335,13 +378,60 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
return false;
}

// Recreate the state as expected by save_imatrix(), and correct for weighted sum.
std::vector<float> activations;
activations.reserve(nval);

for (int i = 0; i < nval; i++) {
e.values[i] += tmp[i];
e.counts[i] += ncall;
activations.push_back(e.values[i] / static_cast<float>(e.counts[i]));
}
e.ncall += ncall;

if (ts) {
float total_bias = std::accumulate(activations.begin(), activations.end(), 0.0f);
float max_bias = *std::max_element(activations.begin(), activations.end());
float min_bias = *std::min_element(activations.begin(), activations.end());
float mean_bias = total_bias / activations.size();
float sq_total_bias = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f);
float dev = std::sqrt((sq_total_bias / activations.size()) - (mean_bias * mean_bias));
float rmsd = mean_bias > 0.0f ? dev / mean_bias : 0.0f;

float threshold = 1e-6f;
int inactive_count = std::count_if(activations.begin(), activations.end(), [threshold](const float v) { return std::fabs(v) < threshold; });
float active_ratio = 1 - (static_cast<float>(inactive_count) / activations.size());

float ent = 0.0f;
if (total_bias > 0) {
for (auto act : activations) {
if (float p = act / total_bias; p > 0) {
ent -= p * std::log2(p);
}
}
}

int z_score = 0;
for (auto act : activations) {
if (float p = (act - mean_bias) / dev; p > 1) {
z_score++;
}
}

ts->emplace_back();
auto & [tensor, total, mean, max, min, stddev, cv, zd, active, entropy, elements] = ts->back();
tensor = name_as_vec.data();
total = total_bias;
mean = mean_bias;
max = max_bias;
min = min_bias;
stddev = dev;
cv = rmsd;
active = active_ratio;
entropy = ent;
elements = static_cast<int>(activations.size());
zd = static_cast<float>(z_score) / static_cast<float>(elements);
}
}
return true;
}
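// Editorial summary (not part of the PR) of the statistics computed above, for a
// tensor with N elements and per-element mean activations v_i = values[i] / counts[i]:
//   total   = sum(v_i)                          mean = total / N
//   stddev  = sqrt(sum(v_i^2) / N - mean^2)     (population standard deviation)
//   cv      = stddev / mean                     (coefficient of variation)
//   active  = |{ i : |v_i| >= 1e-6 }| / N
//   entropy = -sum(p_i * log2(p_i)), where p_i = v_i / total
//   zd      = |{ i : (v_i - mean) / stddev > 1 }| / N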
@@ -352,7 +442,6 @@ static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_dat
return g_collector.collect_imatrix(t, ask, user_data);
}


struct results_log_softmax {
double log_softmax;
float logit;
@@ -590,6 +679,47 @@ int main(int argc, char ** argv) {
return 1;
}

std::vector<tensor_statistics> ts;

if (params.show_statistics) {
if (params.in_files.size() != 1) {
LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n");
return 1;
}
if (!g_collector.load_imatrix(params.in_files[0].c_str(), &ts)) {
LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str());
return 1;
}
if (ts.empty()) {
LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str());
return 1;
}

struct tensor_comparer {
bool operator()(const tensor_statistics & a, const tensor_statistics & b) const {
std::string layer, name_a, name_b;
process_tensor_name(a.tensor, layer, name_a);
process_tensor_name(b.tensor, layer, name_b);
return name_a < name_b || (name_a == name_b && a.total > b.total);
}
};
std::sort(ts.begin(), ts.end(), tensor_comparer());
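// Editorial note (not part of the PR): the comparer groups rows by tensor name
// in ascending order and breaks ties by descending Σ(Bias), so the most
// influential layers are listed first within each group.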

LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast<int>(ts.size()));
LOG_INF("\n%5s\t%-20s\t%10s\t%7s\t%12s\t%9s\t%10s\t%9s\t%6s\t%12s\t%7s\t%10s\n",
"Layer", "Tensor", "Σ(Bias)", "Min", "Max", "μ", "σ", "% Active", "N", "Entropy", "E (norm)", "ZD Score");
LOG_INF("==========================================================================================================================================================================\n");
for (const auto & [tensor, total, mean, max, min, stddev, cv, zd, active, entropy, elements] : ts) {
std::string layer, name;
process_tensor_name(tensor, layer, name);
LOG_INF("%5s\t%-20s\t%10.2f\t%7.4f\t%12.4f\t%8.4f\t%9.4f\t%8.2f%%\t%6d\t%12.4f\t%7.2f%%\t%9.2f%%\n",
layer.c_str(), name.c_str(), total, min, max, mean, stddev, active * 100.0f, elements, entropy, 100.0f * (entropy / std::log2(elements)), 100.0f * zd);
}
LOG_INF("\n");

return 0;
}
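// Editorial note (not part of the PR): E (norm) is the entropy divided by its
// maximum log2(N), so 100% means the activations are uniformly distributed
// across the tensor; ZD Score is the share of elements whose z-score exceeds 1.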

common_init();

params.n_batch = std::min(params.n_batch, params.n_ctx);