# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Build and train mobilenet_v1 with options for quantization."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time

import tensorflow as tf
from tensorflow.contrib.model_pruning.python import pruning
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import timeline
from tensorflow.python.lib.io import file_io
from tensorflow.python.platform import tf_logging as logging

from image_input import hcl_input
import mobilenet_v1_prune

slim = tf.contrib.slim

flags = tf.app.flags
flags.DEFINE_string('master', '', 'Session master')
flags.DEFINE_integer('task', 0, 'Task')
flags.DEFINE_integer('ps_tasks', 0, 'Number of ps')
flags.DEFINE_integer('batch_size', 64, 'Batch size')
flags.DEFINE_integer('num_classes', 3755, 'Number of classes to distinguish')
flags.DEFINE_integer('number_of_steps', None,
'Number of training steps to perform before stopping')
flags.DEFINE_integer('image_size', 64, 'Input image resolution')
flags.DEFINE_float('depth_multiplier', 0.5, 'Depth multiplier for mobilenet')
flags.DEFINE_bool('quantize', True, 'Quantize training')
flags.DEFINE_string('fine_tune_checkpoint', '',
'Checkpoint from which to start finetuning.')
flags.DEFINE_string('checkpoint_dir', './model',
'Directory for writing training checkpoints and logs')
flags.DEFINE_string('dataset_dir', '/data/home/jyw/hcl_cassia_new/', 'Location of dataset')
flags.DEFINE_integer('log_every_n_steps', 100, 'Number of steps per log')
flags.DEFINE_integer('save_summaries_secs', 100,
'How often to save summaries, secs')
flags.DEFINE_integer('save_interval_secs', 100,
'How often to save checkpoints, secs')
flags.DEFINE_string(
    'pruning_hparams', '',
    'Comma separated list of pruning-related hyperparameters.')
FLAGS = flags.FLAGS
_LEARNING_RATE_DECAY_FACTOR = 0.94
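

# Custom train step passed to slim.learning.train via train_step_fn. Unlike
# slim's default train_step, it expects train_op to be the [train_tensor,
# prune_op] pair returned by build_model, so the pruning mask update runs on
# every training step.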
def train_step(sess, train_op, global_step, train_step_kwargs):
"""
Function that takes a gradient step and specifies whether to stop.
"""
start_time = time.time()
trace_run_options = None
run_metadata = None
if 'should_trace' in train_step_kwargs:
if 'logdir' not in train_step_kwargs:
raise ValueError('logdir must be present in train_step_kwargs when '
'should_trace is present')
if sess.run(train_step_kwargs['should_trace']):
trace_run_options = config_pb2.RunOptions(
trace_level=config_pb2.RunOptions.FULL_TRACE)
run_metadata = config_pb2.RunMetadata()
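
  # train_op is the [train_tensor, prune_op] pair from build_model; fetch both
  # so the pruning masks are updated alongside the gradient step.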
  total_loss, _, np_global_step = sess.run(
      [train_op[0], train_op[1], global_step],
      options=trace_run_options,
      run_metadata=run_metadata)
# sess.run(train_op[1])
time_elapsed = time.time() - start_time
if run_metadata is not None:
tl = timeline.Timeline(run_metadata.step_stats)
trace = tl.generate_chrome_trace_format()
trace_filename = os.path.join(train_step_kwargs['logdir'],
'tf_trace-%d.json' % np_global_step)
logging.info('Writing trace to %s', trace_filename)
file_io.write_string_to_file(trace_filename, trace)
if 'summary_writer' in train_step_kwargs:
train_step_kwargs['summary_writer'].add_run_metadata(run_metadata,
'run_metadata-%d' %
np_global_step)
if 'should_log' in train_step_kwargs:
if sess.run(train_step_kwargs['should_log']):
logging.info('global step %d: loss = %.4f (%.3f sec/step)',
np_global_step, total_loss, time_elapsed)
if 'should_stop' in train_step_kwargs:
should_stop = sess.run(train_step_kwargs['should_stop'])
else:
should_stop = False
return total_loss, should_stop


def get_learning_rate():
if FLAGS.fine_tune_checkpoint:
# If we are fine tuning a checkpoint we need to start at a lower learning
# rate since we are farther along on training.
return 1e-4
else:
return 0.045


def get_quant_delay():
if FLAGS.fine_tune_checkpoint:
# We can start quantizing immediately if we are finetuning.
return 0
else:
# We need to wait for the model to train a bit before we quantize if we are
# training from scratch.
return 250000


def build_model():
"""Builds graph for model to train with rewrites for quantization.
"""
g = tf.Graph()
with g.as_default(), tf.device(
tf.train.replica_device_setter(FLAGS.ps_tasks)):
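    # hcl_input is expected to yield batches of training images together with
    # one-hot labels (tf.losses.softmax_cross_entropy below assumes one-hot).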
inputs, labels = hcl_input(is_training=True)
#with slim.arg_scope(mobilenet_v1.mobilenet_v1_arg_scope(is_training=True)):
logits, _ = mobilenet_v1_prune.mobilenet_v1(
inputs,
is_training=True,
depth_multiplier=FLAGS.depth_multiplier,
num_classes=FLAGS.num_classes)
tf.losses.softmax_cross_entropy(labels, logits)
# Call rewriter to produce graph with fake quant ops and folded batch norms
# quant_delay delays start of quantization till quant_delay steps, allowing
# for better model accuracy.
if FLAGS.quantize:
tf.contrib.quantize.create_training_graph(quant_delay=get_quant_delay())
total_loss = tf.losses.get_total_loss(name='total_loss')
# Configure the learning rate using an exponential decay.
num_epochs_per_decay = 2.5
hcl_size = 4650035 #3523535
decay_steps = int(hcl_size / FLAGS.batch_size * num_epochs_per_decay)
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.train.exponential_decay(
get_learning_rate(),
        global_step,  # tf.train.get_or_create_global_step()
decay_steps,
_LEARNING_RATE_DECAY_FACTOR,
staircase=True)
opt = tf.train.GradientDescentOptimizer(learning_rate)
# Get, Print, and Edit Pruning Hyperparameters
pruning_hparams = pruning.get_pruning_hparams()
#print("Pruning Hyperparameters:", pruning_hparams)
# Change hyperparameters to meet our needs
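    # Delay pruning until step 200000 so the network first trains dense;
    # target_sparsity of 0.5 zeroes out roughly half of the prunable weights.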
pruning_hparams.begin_pruning_step = 200000
#pruning_hparams.end_pruning_step = 250
#pruning_hparams.pruning_frequency = 1
#pruning_hparams.sparsity_function_end_step = 250
pruning_hparams.target_sparsity = .5
print("Pruning Hyperparameters:", pruning_hparams)
    # Create a pruning object from the pruning specification; the explicit
    # sparsity argument takes priority over the hparam value.
p = pruning.Pruning(pruning_hparams, global_step=global_step, sparsity=.5)
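    # conditional_mask_update_op() returns an op that updates the weight masks
    # according to the pruning schedule; it is run as train_op[1] by the custom
    # train_step.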
prune_op = p.conditional_mask_update_op()
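    # Evaluating the tensor returned by create_train_op applies the gradient
    # update and yields the current value of total_loss.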
train_tensor = slim.learning.create_train_op(
total_loss,
optimizer=opt)
slim.summaries.add_scalar_summary(total_loss, 'total_loss', 'losses')
slim.summaries.add_scalar_summary(learning_rate, 'learning_rate', 'training')
return g, [train_tensor, prune_op]


def get_checkpoint_init_fn():
"""Returns the checkpoint init_fn if the checkpoint is provided."""
if FLAGS.fine_tune_checkpoint:
variables_to_restore = slim.get_variables_to_restore()
global_step_reset = tf.assign(tf.train.get_or_create_global_step(), 0)
slim_init_fn = slim.assign_from_checkpoint_fn(
FLAGS.fine_tune_checkpoint,
variables_to_restore,
ignore_missing_vars=True)
def init_fn(sess):
slim_init_fn(sess)
# If we are restoring from a floating point model, we need to initialize
# the global step to zero for the exponential decay to result in
# reasonable learning rates.
sess.run(global_step_reset)
return init_fn
else:
return None


def train_model():
"""Trains mobilenet_v1."""
g, train_tensor = build_model()
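  # train_tensor is the [train_op, prune_op] pair; the custom train_step above
  # runs both ops on every iteration.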
with g.as_default():
slim.learning.train(
train_tensor,
FLAGS.checkpoint_dir,
train_step_fn=train_step,
is_chief=(FLAGS.task == 0),
master=FLAGS.master,
log_every_n_steps=FLAGS.log_every_n_steps,
graph=g,
number_of_steps=FLAGS.number_of_steps,
save_summaries_secs=FLAGS.save_summaries_secs,
save_interval_secs=FLAGS.save_interval_secs,
init_fn=get_checkpoint_init_fn(),
global_step=tf.train.get_global_step())


def main(unused_arg):
train_model()


if __name__ == '__main__':
tf.app.run(main)