# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
"""Non-Negative least squares histogram-packing."""
import time
import numpy as np
from scipy import optimize, stats
from functools import lru_cache


def get_packing_matrix(strategy_set, max_sequence_length):
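    """Build the (max_sequence_length x num_strategies) packing matrix A, where
    A[i, j] counts how many sequences of length i + 1 strategy j contains."""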
    num_strategies = len(strategy_set)
    A = np.zeros((max_sequence_length, num_strategies), dtype=np.int32)
    for i, strategy in enumerate(strategy_set):
        for seq_len in strategy:
            A[seq_len - 1, i] += 1
    return A


@lru_cache(maxsize=None)
def get_packing_strategies(start_length, minimum_increment, target_length, depth):
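    """Recursively enumerate all non-decreasing combinations of at most `depth`
    sequence lengths, each at least `minimum_increment`, that sum exactly to
    target_length - start_length."""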
    gap = target_length - start_length
    strategies = []
    # Complete the packing with exactly 1 number
    if depth == 1:
        if gap >= minimum_increment:
            strategies.append([gap])
    # Complete the sample in "depth" steps, recursively
    else:
        for new in range(minimum_increment, gap + 1):
            new_gap = target_length - start_length - new
            if new_gap == 0:
                strategies.append([new])
            else:
                options = get_packing_strategies(start_length + new, new, target_length, depth - 1)
                for option in options:
                    if len(option) > 0:
                        strategies.append([new] + option)
    return strategies


def pack_using_nnlshp(histogram, max_sequence_length, max_sequences_per_pack):
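    """Pack a sequence-length histogram with non-negative least squares
    histogram-packing (NNLSHP).

    histogram[i] holds the number of sequences of length i + 1. Returns the
    strategy set and how many times each strategy is repeated.
    """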
    # List all unique ways of packing to the desired maximum sequence length
    strategy_set = get_packing_strategies(0, 1, max_sequence_length, max_sequences_per_pack)
    # Get the packing matrix corresponding to this list of packing strategies
    A = get_packing_matrix(strategy_set, max_sequence_length)
    # Weights that penalize the residual on short sequences less.
    penalization_cutoff = 8
    w0 = np.ones([max_sequence_length])
    w0[:penalization_cutoff] = 0.09
    # Solve the packing problem
    start = time.time()
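    # optimize.nnls finds the non-negative x minimizing ||w0 * (A @ x - histogram)||_2,
    # i.e. a fractional repeat count for each packing strategy.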
    strategy_repeat_count, rnorm = optimize.nnls(np.expand_dims(w0, -1) * A, w0 * histogram)
    # Round the floating point solution to nearest integer
    strategy_repeat_count = np.rint(strategy_repeat_count).astype(np.int64)
    # Compute the residuals, shape: [max_sequence_length]
    residual = histogram - A @ strategy_repeat_count
    # Handle the left-over sequences, i.e. the positive part of the residual
    unpacked_seqlen = np.arange(1, max_sequence_length + 1)[residual > 0]
    for l in unpacked_seqlen:
        strategy = sorted([l, max_sequence_length - l])  # simplest two-sequence strategy covering length l
        strategy_index = strategy_set.index(strategy)
        strategy_repeat_count[strategy_index] += residual[l - 1]
    # Re-compute the residual with the updated strategy_repeat_count
    # This should now be non-positive
    residual = histogram - A @ strategy_repeat_count
    # Add padding based on the deficit (the negative part of the residual)
    padding = np.where(residual < 0, -residual, 0)
    # Calculate some basic statistics
    duration = time.time() - start
    sequence_lengths = np.arange(1, max_sequence_length + 1)
    old_number_of_samples = histogram.sum()
    new_number_of_samples = int(strategy_repeat_count.sum())
    speedup_upper_bound = 1.0 / (1 - (histogram * (1 - sequence_lengths / max_sequence_length)).sum() / old_number_of_samples)
    num_padding_tokens_packed = (sequence_lengths * padding).sum()
    efficiency = 1 - num_padding_tokens_packed / (new_number_of_samples * max_sequence_length)
    print(f"Packing efficiency (fraction of real tokens): {efficiency:3.4f}\n",
          f"Speed-up theoretical limit: {speedup_upper_bound:3.4f}\n",
          f"Achieved speed-up over un-packed dataset: {old_number_of_samples/new_number_of_samples:3.5f}\n"
          f"Runtime: Packed {old_number_of_samples} sequences in {duration:3.3f} seconds.")
    return strategy_set, strategy_repeat_count
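

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original tutorial code):
# packs a synthetic, randomly generated sequence-length histogram. In practice
# the histogram would be computed from a tokenized dataset, and the parameters
# below are assumptions chosen only to keep the example fast.
if __name__ == "__main__":
    max_sequence_length = 128
    rng = np.random.default_rng(seed=0)
    # Draw random sequence lengths in [1, max_sequence_length] and histogram
    # them so that histogram[i] counts sequences of length i + 1.
    lengths = rng.integers(1, max_sequence_length + 1, size=20000)
    histogram = np.bincount(lengths, minlength=max_sequence_length + 1)[1:]
    strategy_set, strategy_repeat_count = pack_using_nnlshp(
        histogram, max_sequence_length, max_sequences_per_pack=2)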