diff --git a/models/official/projects/maskformer/__pycache__/__init__.cpython-39.pyc b/models/official/projects/maskformer/__pycache__/__init__.cpython-39.pyc index 1f2d7037..46234b46 100644 Binary files a/models/official/projects/maskformer/__pycache__/__init__.cpython-39.pyc and b/models/official/projects/maskformer/__pycache__/__init__.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/__pycache__/optimization.cpython-39.pyc b/models/official/projects/maskformer/__pycache__/optimization.cpython-39.pyc index 2af44232..0d382fd3 100644 Binary files a/models/official/projects/maskformer/__pycache__/optimization.cpython-39.pyc and b/models/official/projects/maskformer/__pycache__/optimization.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/configs/maskformer.py b/models/official/projects/maskformer/configs/maskformer.py index 51a3cae4..fb5d3273 100644 --- a/models/official/projects/maskformer/configs/maskformer.py +++ b/models/official/projects/maskformer/configs/maskformer.py @@ -114,6 +114,10 @@ class MaskFormerTask(cfg.TaskConfig): SET_MODEL_BFLOAT16 = False SET_DATA_BFLOAT16 = True +if not os.environ.get('USE_BFLOAT16_DATA'): + SET_DATA_BFLOAT16 = False + + @exp_factory.register_config_factory('maskformer_coco_panoptic') def maskformer_coco_panoptic() -> cfg.ExperimentConfig: """Config to get results that matches the paper.""" @@ -124,9 +128,22 @@ def maskformer_coco_panoptic() -> cfg.ExperimentConfig: ckpt_interval = (COCO_TRAIN_EXAMPLES // train_batch_size) * 10 # Don't write ckpts frequently. Slows down the training image_size = int(os.environ.get('IMG_SIZE')) - steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size - train_steps = 300 * steps_per_epoch # 300 epochs - decay_at = train_steps - 100 * steps_per_epoch # 200 epochs + if os.environ.get('STEPS_PER_EPOCH'): + steps_per_epoch = int(os.environ.get('STEPS_PER_EPOCH')) + else: + steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size + + if os.environ.get('NUM_EPOCH'): + train_steps = int(os.environ.get('NUM_EPOCH')) * steps_per_epoch + decay_at = int(2/3 * train_steps) + else: + train_steps = 300 * steps_per_epoch # 300 epochs + decay_at = train_steps - 100 * steps_per_epoch # 200 epochs + + # steps_per_epoch = COCO_TRAIN_EXAMPLES // train_batch_size + # train_steps = 300 * steps_per_epoch # 300 epochs + # decay_at = train_steps - 100 * steps_per_epoch # 200 epochs + config = cfg.ExperimentConfig( task = MaskFormerTask( init_checkpoint="", @@ -179,7 +196,7 @@ def maskformer_coco_panoptic() -> cfg.ExperimentConfig: )), trainer=cfg.TrainerConfig( train_steps=train_steps, - validation_steps=COCO_VAL_EXAMPLES // eval_batch_size, + validation_steps=COCO_VAL_EXAMPLES // eval_batch_size if not os.environ.get('VAL_STEPS') else int(os.environ.get('VAL_STEPS')), steps_per_loop=steps_per_epoch, summary_interval=steps_per_epoch, checkpoint_interval=steps_per_epoch, diff --git a/models/official/projects/maskformer/eval_cpu.sh b/models/official/projects/maskformer/eval_cpu.sh index 67bf03ed..ff09a729 100644 --- a/models/official/projects/maskformer/eval_cpu.sh +++ b/models/official/projects/maskformer/eval_cpu.sh @@ -1,9 +1,9 @@ #!/bin/bash train_bsize=1 eval_bsize=1 -export PYTHONPATH=$PYTHONPATH:~/tf-maskformer/models -export MODEL_DIR="gs://cam2-models/maskformer_vishal_exps/EXP20_v8_eval" -export MASKFORMER_CKPT="gs://cam2-models/maskformer_vishal_exps/EXP20_v8/ckpt-18480" +export PYTHONPATH=$PYTHONPATH:/depot/davisjam/data/akshath/MaskFormer_tf/tf-maskformer/models +export MODEL_DIR="gs://cam2-models/maskformer_vishal_exps/EXP26_v8_eval" +export MASKFORMER_CKPT="gs://cam2-models/maskformer_vishal_exps/EXP26_v8/ckpt-482328" export RESNET_CKPT="gs://cam2-models/maskformer_vishal_exps/resnet50_pretrained/tfmg/ckpt-62400" export TFRECORDS_DIR="gs://cam2-datasets/coco_panoptic/tfrecords" export TRAIN_BATCH_SIZE=$train_bsize @@ -16,7 +16,7 @@ export OVERRIDES="runtime.distribution_strategy=one_device,runtime.mixed_precisi task.validation_data.global_batch_size=$EVAL_BATCH_SIZE,task.model.which_pixel_decoder=transformer_fpn,\ task.init_checkpoint_modules=all,\ task.init_checkpoint=$MASKFORMER_CKPT" -python3 models/official/projects/maskformer/train.py \ +python3 train.py \ --experiment maskformer_coco_panoptic \ --mode eval \ --model_dir $MODEL_DIR \ diff --git a/models/official/projects/maskformer/eval_gpu.sh b/models/official/projects/maskformer/eval_gpu.sh index 3def5bf4..dbe3d662 100755 --- a/models/official/projects/maskformer/eval_gpu.sh +++ b/models/official/projects/maskformer/eval_gpu.sh @@ -16,7 +16,7 @@ task.validation_data.global_batch_size=$EVAL_BATCH_SIZE,\ task.model.which_pixel_decoder=transformer_fpn,\ task.init_checkpoint_modules=all,\ task.init_checkpoint=$MASKFORMER_CKPT" -python3 models/official/projects/maskformer/train.py \ +python3 train.py \ --experiment maskformer_coco_panoptic \ --mode eval \ --model_dir $MODEL_DIR \ diff --git a/models/official/projects/maskformer/losses/__pycache__/__init__.cpython-39.pyc b/models/official/projects/maskformer/losses/__pycache__/__init__.cpython-39.pyc index bf307409..b47ab470 100644 Binary files a/models/official/projects/maskformer/losses/__pycache__/__init__.cpython-39.pyc and b/models/official/projects/maskformer/losses/__pycache__/__init__.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/losses/__pycache__/inference.cpython-39.pyc b/models/official/projects/maskformer/losses/__pycache__/inference.cpython-39.pyc index 9860ab02..56dbe7b1 100644 Binary files a/models/official/projects/maskformer/losses/__pycache__/inference.cpython-39.pyc and b/models/official/projects/maskformer/losses/__pycache__/inference.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/losses/__pycache__/maskformer_losses.cpython-39.pyc b/models/official/projects/maskformer/losses/__pycache__/maskformer_losses.cpython-39.pyc index 04b0d559..63987d74 100644 Binary files a/models/official/projects/maskformer/losses/__pycache__/maskformer_losses.cpython-39.pyc and b/models/official/projects/maskformer/losses/__pycache__/maskformer_losses.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/losses/maskformer_losses.py b/models/official/projects/maskformer/losses/maskformer_losses.py index 8193422e..3dc470b6 100644 --- a/models/official/projects/maskformer/losses/maskformer_losses.py +++ b/models/official/projects/maskformer/losses/maskformer_losses.py @@ -54,6 +54,7 @@ def batch(self, y_true, y_pred): loss = tf.einsum("bnc,bmc->bnm",focal_pos,y_true) + tf.einsum( "bnc,bmc->bnm", focal_neg,(1 - y_true) ) + return loss/hw @@ -88,7 +89,7 @@ def batch(self, y_true, y_pred): return loss class Loss: - def __init__(self, num_classes, matcher, eos_coef, cost_class = 1.0, cost_focal = 20.0, cost_dice = 1.0, ignore_label =0): + def __init__(self, num_classes, matcher, eos_coef, cost_class = 1.0, cost_focal = 1.0, cost_dice = 1.0, ignore_label =0): self.num_classes = num_classes self.matcher = matcher @@ -120,13 +121,13 @@ def memory_efficient_matcher(self, outputs, y_true): tgt_mask_permuted = tf.reshape(tgt_mask_permuted, [tf.shape(tgt_mask_permuted)[0],tf.shape(tgt_mask_permuted)[1], -1]) # [b, 100, h*w] cost_focal = FocalLossMod().batch(tgt_mask_permuted, out_mask) - cost_dice = DiceLoss().batch(tgt_mask_permuted, out_mask) + cost_dice = DiceLoss().batch(tgt_mask_permuted, out_mask) total_cost = ( - self.cost_focal * cost_focal - + self.cost_class * cost_class - + self.cost_dice * cost_dice + self.cost_focal * cost_focal + + self.cost_class * cost_class + + self.cost_dice * cost_dice ) max_cost = ( @@ -135,14 +136,26 @@ def memory_efficient_matcher(self, outputs, y_true): self.cost_dice * 0.0 ) - + # print('cost_focal') + # print(cost_focal, cost_class, cost_dice) + # print('total_ocst') + # print(total_cost) + # print('max_cost') + # print(max_cost) + # Append highest cost where there are no objects : No object class == 0 (self.ignore_label) valid = tf.expand_dims(tf.cast(tf.not_equal(tgt_ids, self.ignore_label), dtype=total_cost.dtype), axis=1) + # print('max_cost - ', max_cost) + # print('total_cost before - ', total_cost) + total_cost = (1 - valid) * max_cost + valid * total_cost + # print('total_cost after - ', total_cost) + total_cost = tf.where( tf.logical_or(tf.math.is_nan(total_cost), tf.math.is_inf(total_cost)), max_cost * tf.ones_like(total_cost, dtype=total_cost.dtype), total_cost) + _, inds = matchers.hungarian_matching(total_cost) indices = tf.stop_gradient(inds) @@ -168,10 +181,10 @@ def get_loss(self, outputs, y_true, indices): num_masks = tf.reduce_sum(tf.cast(tf.logical_not(background), tf.float32), axis=-1) xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_classes, logits=cls_assigned) - + cls_loss = tf.where(background, self.eos_coef * xentropy, xentropy) - cls_weights = tf.where(background, self.eos_coef * tf.ones_like(cls_loss), tf.ones_like(cls_loss)) + # print('Weights: ', cls_weights) num_masks_per_replica = tf.reduce_sum(num_masks) @@ -181,7 +194,10 @@ def get_loss(self, outputs, y_true, indices): num_masks_sum, cls_weights_sum = replica_context.all_reduce(tf.distribute.ReduceOp.SUM,[num_masks_per_replica, cls_weights_per_replica]) # Final losses + # print('Losses: ', cls_loss) + cls_loss = tf.math.divide_no_nan(tf.reduce_sum(cls_loss), cls_weights_sum) + # print('Final loss given for changing the tvars - ', cls_loss) out_mask = mask_assigned tgt_mask = individual_masks @@ -206,10 +222,12 @@ def get_loss(self, outputs, y_true, indices): focal_loss = FocalLossMod(alpha=0.25, gamma=2)(tgt_mask, out_mask) focal_loss_weighted = tf.where(background, tf.zeros_like(focal_loss), focal_loss) focal_loss_final = tf.math.divide_no_nan(tf.math.reduce_sum(tf.math.reduce_sum(focal_loss_weighted, axis=-1)), num_masks_sum) - + # print(focal_loss_weighted) dice_loss = DiceLoss()(tgt_mask, out_mask) dice_loss_weighted = tf.where(background, tf.zeros_like(dice_loss), dice_loss) dice_loss_final = tf.math.divide_no_nan(tf.math.reduce_sum(tf.math.reduce_sum(dice_loss_weighted, axis=-1)), num_masks_sum) + # print(dice_loss_weighted) + # raise ValueError('2') return cls_loss, focal_loss_final, dice_loss_final diff --git a/models/official/projects/maskformer/modeling/__pycache__/__init__.cpython-39.pyc b/models/official/projects/maskformer/modeling/__pycache__/__init__.cpython-39.pyc index 7705fe67..24b40734 100644 Binary files a/models/official/projects/maskformer/modeling/__pycache__/__init__.cpython-39.pyc and b/models/official/projects/maskformer/modeling/__pycache__/__init__.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/modeling/__pycache__/maskformer.cpython-39.pyc b/models/official/projects/maskformer/modeling/__pycache__/maskformer.cpython-39.pyc index 88362d2a..a7f07978 100644 Binary files a/models/official/projects/maskformer/modeling/__pycache__/maskformer.cpython-39.pyc and b/models/official/projects/maskformer/modeling/__pycache__/maskformer.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/modeling/decoder/__pycache__/__init__.cpython-39.pyc b/models/official/projects/maskformer/modeling/decoder/__pycache__/__init__.cpython-39.pyc index 2432cf8b..df12e243 100644 Binary files a/models/official/projects/maskformer/modeling/decoder/__pycache__/__init__.cpython-39.pyc and b/models/official/projects/maskformer/modeling/decoder/__pycache__/__init__.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/modeling/decoder/__pycache__/detr_transformer.cpython-39.pyc b/models/official/projects/maskformer/modeling/decoder/__pycache__/detr_transformer.cpython-39.pyc index a204c895..edf54dd4 100644 Binary files a/models/official/projects/maskformer/modeling/decoder/__pycache__/detr_transformer.cpython-39.pyc and b/models/official/projects/maskformer/modeling/decoder/__pycache__/detr_transformer.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/modeling/decoder/__pycache__/pixel_decoder.cpython-39.pyc b/models/official/projects/maskformer/modeling/decoder/__pycache__/pixel_decoder.cpython-39.pyc index 552b5b3c..c39ea97c 100644 Binary files a/models/official/projects/maskformer/modeling/decoder/__pycache__/pixel_decoder.cpython-39.pyc and b/models/official/projects/maskformer/modeling/decoder/__pycache__/pixel_decoder.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/modeling/decoder/__pycache__/transformer_decoder.cpython-39.pyc b/models/official/projects/maskformer/modeling/decoder/__pycache__/transformer_decoder.cpython-39.pyc index c79a215a..b982e786 100644 Binary files a/models/official/projects/maskformer/modeling/decoder/__pycache__/transformer_decoder.cpython-39.pyc and b/models/official/projects/maskformer/modeling/decoder/__pycache__/transformer_decoder.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/modeling/decoder/__pycache__/transformer_pixel_decoder.cpython-39.pyc b/models/official/projects/maskformer/modeling/decoder/__pycache__/transformer_pixel_decoder.cpython-39.pyc index 246f0afc..f290d977 100644 Binary files a/models/official/projects/maskformer/modeling/decoder/__pycache__/transformer_pixel_decoder.cpython-39.pyc and b/models/official/projects/maskformer/modeling/decoder/__pycache__/transformer_pixel_decoder.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/modeling/decoder/detr_transformer.py b/models/official/projects/maskformer/modeling/decoder/detr_transformer.py index fffb654e..3d9ebe52 100644 --- a/models/official/projects/maskformer/modeling/decoder/detr_transformer.py +++ b/models/official/projects/maskformer/modeling/decoder/detr_transformer.py @@ -64,7 +64,6 @@ def call(self, inputs): target_shape = tf.shape(targets) if mask is not None: - cross_attention_mask = tf.tile( tf.expand_dims(mask, axis=1), [1, target_shape[1], 1]) self_attention_mask=tf.ones( diff --git a/models/official/projects/maskformer/modeling/layers/__pycache__/__init__.cpython-39.pyc b/models/official/projects/maskformer/modeling/layers/__pycache__/__init__.cpython-39.pyc index a5818e23..b67a3373 100644 Binary files a/models/official/projects/maskformer/modeling/layers/__pycache__/__init__.cpython-39.pyc and b/models/official/projects/maskformer/modeling/layers/__pycache__/__init__.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/modeling/layers/__pycache__/nn_block.cpython-39.pyc b/models/official/projects/maskformer/modeling/layers/__pycache__/nn_block.cpython-39.pyc index 8ebb1f1e..2905ea77 100644 Binary files a/models/official/projects/maskformer/modeling/layers/__pycache__/nn_block.cpython-39.pyc and b/models/official/projects/maskformer/modeling/layers/__pycache__/nn_block.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/modeling/layers/nn_block.py b/models/official/projects/maskformer/modeling/layers/nn_block.py index 7f7da5d1..0ad8f17b 100644 --- a/models/official/projects/maskformer/modeling/layers/nn_block.py +++ b/models/official/projects/maskformer/modeling/layers/nn_block.py @@ -1,4 +1,7 @@ import tensorflow as tf +from official.modeling import tf_utils +from official.nlp.modeling import layers +from official.nlp.modeling import models ''' Transformer Parameters: @@ -74,6 +77,8 @@ def build(self, input_shape): # Final Layer self._layers.append( tf.keras.layers.Dense(dim[1], activation=None)) + # kernel_initializer=tf_utils.clone_initializer(tf.keras.initializers.get('glorot_uniform')), + # bias_initializer=tf_utils.clone_initializer(tf.keras.initializers.get('glorot_uniform'))) def call(self, x): for layer in self._layers: diff --git a/models/official/projects/maskformer/modeling/maskformer.py b/models/official/projects/maskformer/modeling/maskformer.py index 7a552baf..9851793c 100644 --- a/models/official/projects/maskformer/modeling/maskformer.py +++ b/models/official/projects/maskformer/modeling/maskformer.py @@ -1,5 +1,6 @@ import tensorflow as tf - +import numpy as np +import os from official.projects.maskformer.modeling.decoder.transformer_decoder import MaskFormerTransformer from official.projects.maskformer.modeling.layers.nn_block import MLPHead from official.projects.maskformer.modeling.decoder.transformer_pixel_decoder import TransformerFPN @@ -150,6 +151,7 @@ def process_feature_maps(self, maps): def call(self, image, training = False): backbone_feature_maps = self._backbone(image) backbone_feature_maps_procesed = self.process_feature_maps(backbone_feature_maps) + if self._pixel_decoder == 'fpn': mask_features = self.pixel_decoder(backbone_feature_maps_procesed) transformer_enc_feat = backbone_feature_maps_procesed['5'] @@ -158,4 +160,4 @@ def call(self, image, training = False): transformer_features = self.transformer({"features": transformer_enc_feat}) seg_pred = self.head({"per_pixel_embeddings" : mask_features, "per_segment_embeddings": transformer_features}) - return seg_pred + return seg_pred \ No newline at end of file diff --git a/models/official/projects/maskformer/params.yaml b/models/official/projects/maskformer/params.yaml new file mode 100644 index 00000000..9c44ddf6 --- /dev/null +++ b/models/official/projects/maskformer/params.yaml @@ -0,0 +1,226 @@ +runtime: + all_reduce_alg: null + batchnorm_spatial_persistent: false + dataset_num_private_threads: null + default_shard_dim: -1 + distribution_strategy: one_device + enable_xla: false + gpu_thread_mode: null + loss_scale: null + mixed_precision_dtype: float32 + num_cores_per_replica: 1 + num_gpus: 1 + num_packs: 1 + per_gpu_thread_count: 0 + run_eagerly: false + task_index: -1 + tpu: null + tpu_enable_xla_dynamic_padder: null + worker_hosts: null +task: + allow_image_summary: false + bfloat16: false + differential_privacy_config: null + init_checkpoint: '' + init_checkpoint_modules: backbone + losses: + background_cls_weight: 0.1 + class_offset: 0 + l2_weight_decay: 0.0001 + model: + backbone: + resnet: + bn_trainable: false + depth_multiplier: 1.0 + model_id: 50 + replace_stem_max_pool: false + resnetd_shortcut: false + scale_stem: true + se_ratio: 0.0 + stem_type: v0 + stochastic_depth_drop_rate: 0.0 + type: resnet + backbone_endpoint_name: '5' + detr_encoder_layers: 0 + fpn_encoder_layers: 6 + hidden_size: 256 + input_size: [640, 640, 3] + norm_activation: + activation: relu + norm_epsilon: 0.001 + norm_momentum: 0.99 + use_sync_bn: true + num_classes: 133 + num_decoder_layers: 6 + num_queries: 100 + which_pixel_decoder: transformer_fpn + name: null + panoptic_quality_evaluator: + ignored_label: 0 + is_thing: null + max_num_instances: 100 + num_categories: 133 + rescale_predictions: false + per_category_metrics: false + train_data: + apply_tf_data_service_before_batching: false + block_length: 1 + cache: false + cycle_length: null + decoder: + simple_decoder: + attribute_names: [] + mask_binarize_threshold: null + regenerate_source_id: false + type: simple_decoder + deterministic: null + drop_remainder: true + dtype: bfloat16 + enable_shared_tf_data_service_between_parallel_trainers: false + enable_tf_data_service: false + file_type: tfrecord + global_batch_size: 2 + input_path: /depot/davisjam/data/vishal/datasets/coco/tfrecords/train* + is_training: true + parser: + aspect_ratio_range: !!python/tuple + - 0.5 + - 2.0 + aug_rand_hflip: true + aug_scale_max: 1.0 + aug_scale_min: 1.0 + dtype: bfloat16 + groundtruth_padded_size: [640, 640] + ignore_label: 0 + max_retry: 50 + min_overlap_params: !!python/tuple + - 0.0 + - 1.4 + - 0.2 + - 0.1 + min_scale: 0.3 + output_size: [640, 640] + pad_output: true + resize_eval_groundtruth: true + seed: 2045 + prefetch_buffer_size: null + regenerate_source_id: false + seed: null + sharding: true + shuffle_buffer_size: 1000 + tf_data_service_address: null + tf_data_service_job_name: null + tfds_as_supervised: false + tfds_data_dir: '' + tfds_name: '' + tfds_skip_decoding_feature: '' + tfds_split: train + trainer_id: null + validation_data: + apply_tf_data_service_before_batching: false + block_length: 1 + cache: false + cycle_length: null + decoder: + simple_decoder: + attribute_names: [] + mask_binarize_threshold: null + regenerate_source_id: false + type: simple_decoder + deterministic: null + drop_remainder: false + dtype: float32 + enable_shared_tf_data_service_between_parallel_trainers: false + enable_tf_data_service: false + file_type: tfrecord + global_batch_size: 1 + input_path: /depot/davisjam/data/vishal/datasets/coco/tfrecords/val* + is_training: false + parser: + aspect_ratio_range: !!python/tuple + - 0.5 + - 2.0 + aug_rand_hflip: true + aug_scale_max: 1.0 + aug_scale_min: 1.0 + dtype: bfloat16 + groundtruth_padded_size: !!python/tuple + - 1280 + - 1280 + ignore_label: 0 + max_retry: 50 + min_overlap_params: !!python/tuple + - 0.0 + - 1.4 + - 0.2 + - 0.1 + min_scale: 0.3 + output_size: [640, 640] + pad_output: true + resize_eval_groundtruth: true + seed: 2045 + prefetch_buffer_size: null + regenerate_source_id: false + seed: null + sharding: true + shuffle_buffer_size: 10000 + tf_data_service_address: null + tf_data_service_job_name: null + tfds_as_supervised: false + tfds_data_dir: '' + tfds_name: '' + tfds_skip_decoding_feature: '' + tfds_split: train + trainer_id: null +trainer: + allow_tpu_summary: false + best_checkpoint_eval_metric: '' + best_checkpoint_export_subdir: best_ckpt + best_checkpoint_metric_comp: higher + checkpoint_interval: 59143 + continuous_eval_timeout: 3600 + eval_tf_function: true + eval_tf_while_loop: false + loss_upper_bound: 1000000.0 + max_to_keep: 3 + optimizer_config: + ema: null + learning_rate: + stepwise: + boundaries: [39428] + name: PiecewiseConstantDecay + offset: 0 + values: [0.0001, 1.0e-05] + type: stepwise + optimizer: + maskformer_adamw: + amsgrad: false + beta_1: 0.9 + beta_2: 0.999 + clipnorm: null + clipvalue: null + epsilon: 1.0e-07 + exclude_from_weight_decay: null + global_clipnorm: 0.1 + gradient_clip_norm: 0.0 + include_in_weight_decay: null + name: AdamWeightDecay + weight_decay_rate: 0.0001 + type: maskformer_adamw + warmup: + linear: + name: linear + warmup_learning_rate: 0.0 + warmup_steps: 640 + type: linear + preemption_on_demand_checkpoint: true + recovery_begin_steps: 0 + recovery_max_trials: 0 + steps_per_loop: 59143 + summary_interval: 59143 + train_steps: 59143 + train_tf_function: true + train_tf_while_loop: true + validation_interval: 59143 + validation_steps: 5000 + validation_summary_subdir: validation diff --git a/models/official/projects/maskformer/tasks/__pycache__/__init__.cpython-39.pyc b/models/official/projects/maskformer/tasks/__pycache__/__init__.cpython-39.pyc index bdf3c9f3..a4884cab 100644 Binary files a/models/official/projects/maskformer/tasks/__pycache__/__init__.cpython-39.pyc and b/models/official/projects/maskformer/tasks/__pycache__/__init__.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/tasks/__pycache__/panoptic_maskformer.cpython-39.pyc b/models/official/projects/maskformer/tasks/__pycache__/panoptic_maskformer.cpython-39.pyc index ff6b0bf5..7ab91994 100644 Binary files a/models/official/projects/maskformer/tasks/__pycache__/panoptic_maskformer.cpython-39.pyc and b/models/official/projects/maskformer/tasks/__pycache__/panoptic_maskformer.cpython-39.pyc differ diff --git a/models/official/projects/maskformer/tasks/panoptic_maskformer.py b/models/official/projects/maskformer/tasks/panoptic_maskformer.py index 1ad9763a..5567de84 100644 --- a/models/official/projects/maskformer/tasks/panoptic_maskformer.py +++ b/models/official/projects/maskformer/tasks/panoptic_maskformer.py @@ -1,7 +1,7 @@ import os from absl import logging import tensorflow as tf - +import matplotlib.pyplot as plt from official.core import base_task from official.core import task_factory from official.core import train_utils @@ -37,6 +37,35 @@ class PanopticTask(base_task.Task): """ def build_model(self): """Builds MaskFormer Model.""" + + self.class_ids = {} + self.plot_collection = {} + self.plot_collection_labels = {0:[]} + self.temp = 0 + self.background_empty_mask = {} + self.labelled_empty_mask = {} + self.background_non_empty_mask = {} + self.class_id_counts = {} + self.log_dir = os.environ.get('LOG_DIR') + self.run_number = os.environ.get('RUN_NUMBER') + + if self.log_dir: + try: + os.mkdir(self.log_dir) + except: + pass + os.mkdir(os.path.join(self.log_dir, self.run_number)) # If there is existing, then throw error + self.log_dir = os.path.join(self.log_dir, self.run_number) + + with open(os.path.join(self.log_dir, 'checking_labels.txt'), 'w') as file: + pass + + with open(os.path.join(self.log_dir, 'settings.txt'), 'w') as file: + file.write("RUN: " + str(os.environ.get('RUN_NUMBER')) + '\n') + file.write("BSIZE: " + str(os.environ.get('TRAIN_BATCH_SIZE'))+ '\n') + file.write("BASE_LR: " + str(os.environ.get('BASE_LR'))+ '\n') + file.write("NO_OBJ_CLS_WEIGHT: " + str(os.environ.get('NO_OBJ_CLS_WEIGHT'))+ '\n') + logging.info('Building MaskFormer model.') input_specs = tf.keras.layers.InputSpec(shape=[None] + self._task_config.model.input_size) @@ -58,14 +87,14 @@ def build_model(self): logging.info('Maskformer model build successful.') inputs = tf.keras.Input(shape=input_specs.shape[1:]) model(inputs) - + model.summary() return model def initialize(self, model: tf.keras.Model) -> None: """ Used to initialize the models with checkpoint """ - + logging.info('Initializing model from checkpoint: %s', self._task_config.init_checkpoint) if not self._task_config.init_checkpoint: return @@ -185,8 +214,139 @@ def build_metrics(self, training=True): ) return metrics + def _log_classes(self, labels: Dict[str, Any]) -> List[Dict[int, int]]: + """ + Logs all the class IDs viewed during training and evaluation. + Returns: + A dictionary of class ids and their counts across all images in batch + """ + + all_unique_ids = labels["unique_ids"]._numpy() + classes_in_batch = [] + for size in range(all_unique_ids.shape[0]): + unique_ids = all_unique_ids[size, :] + classes_in_image = {} + for class_id in unique_ids: + if class_id in classes_in_image: + classes_in_image[class_id] += 1 + else: + classes_in_image[class_id] = 1 + classes_in_batch.append(classes_in_image) + + for class_id in unique_ids: + if class_id in self.class_ids: + self.class_ids[class_id] += 1 + else: + self.class_ids[class_id] = 1 + + return classes_in_batch + + def _check_contigious_mask(self, labels: Dict[str, Any]): + """ + Checks if all the contigious masks are mapped properly from the category masks + + Returns: + EagerTensor with correctly mapped contigious masks + """ + mapping_dict = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, \ + 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 27: 25, 28: 26, 31: 27, 32: 28, 33: 29, 34: 30, 35: 31, 36: 32, 37: 33, 38: 34, \ + 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 46: 41, 47: 42, 48: 43, 49: 44, 50: 45, 51: 46, 52: 47, 53: 48, 54: 49, 55: 50, 56: 51, 57: 52, \ + 58: 53, 59: 54, 60: 55, 61: 56, 62: 57, 63: 58, 64: 59, 65: 60, 67: 61, 70: 62, 72: 63, 73: 64, 74: 65, 75: 66, 76: 67, 77: 68, 78: 69, 79: 70, \ + 80: 71, 81: 72, 82: 73, 84: 74, 85: 75, 86: 76, 87: 77, 88: 78, 89: 79, 90: 80, 92: 81, 93: 82, 95: 83, 100: 84, 107: 85, 109: 86, 112: 87, \ + 118: 88, 119: 89, 122: 90, 125: 91, 128: 92, 130: 93, 133: 94, 138: 95, 141: 96, 144: 97, 145: 98, 147: 99, 148: 100, 149: 101, 151: 102, \ + 154: 103, 155: 104, 156: 105, 159: 106, 161: 107, 166: 108, 168: 109, 171: 110, 175: 111, 176: 112, 177: 113, 178: 114, 180: 115, 181: 116, \ + 184: 117, 185: 118, 186: 119, 187: 120, 188: 121, 189: 122, 190: 123, 191: 124, 192: 125, 193: 126, 194: 127, 195: 128, 196: 129, 197: 130, \ + 198: 131, 199: 132, 200: 133} + + category_mask = labels["category_mask"]._numpy() + contigious_mask = labels["contigious_mask"]._numpy() + + for size in range(category_mask.shape[0]): + cat = category_mask[size] + cont = contigious_mask[size, :, :, :] + mapped_cat = np.expand_dims(np.array([[mapping_dict.get(int(x), int(x)) for x in row] for row in cat]), axis=-1) + if not np.array_equal(mapped_cat, cont): + contigious_mask[size, :, :, :] = mapped_cat + + return tf.convert_to_tensor(contigious_mask) + + def _check_induvidual_masks(self, labels: Dict[str, Any], class_id_counts: List[Dict[int, int]]): + """ + Checks if all the induvidual masks are given the correct instance id + + Returns: + EagerTensor with correctly mapped induvidual masks + """ + + # mapping_dict = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, \ + # 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 27: 25, 28: 26, 31: 27, 32: 28, 33: 29, 34: 30, 35: 31, 36: 32, 37: 33, 38: 34, \ + # 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 46: 41, 47: 42, 48: 43, 49: 44, 50: 45, 51: 46, 52: 47, 53: 48, 54: 49, 55: 50, 56: 51, 57: 52, \ + # 58: 53, 59: 54, 60: 55, 61: 56, 62: 57, 63: 58, 64: 59, 65: 60, 67: 61, 70: 62, 72: 63, 73: 64, 74: 65, 75: 66, 76: 67, 77: 68, 78: 69, 79: 70, \ + # 80: 71, 81: 72, 82: 73, 84: 74, 85: 75, 86: 76, 87: 77, 88: 78, 89: 79, 90: 80, 92: 81, 93: 82, 95: 83, 100: 84, 107: 85, 109: 86, 112: 87, \ + # 118: 88, 119: 89, 122: 90, 125: 91, 128: 92, 130: 93, 133: 94, 138: 95, 141: 96, 144: 97, 145: 98, 147: 99, 148: 100, 149: 101, 151: 102, \ + # 154: 103, 155: 104, 156: 105, 159: 106, 161: 107, 166: 108, 168: 109, 171: 110, 175: 111, 176: 112, 177: 113, 178: 114, 180: 115, 181: 116, \ + # 184: 117, 185: 118, 186: 119, 187: 120, 188: 121, 189: 122, 190: 123, 191: 124, 192: 125, 193: 126, 194: 127, 195: 128, 196: 129, 197: 130, \ + # 198: 131, 199: 132, 200: 133} + + induvidual_masks = labels["individual_masks"]._numpy() + # contig_mask = labels["contigious_mask"]._numpy().copy() + # instance_mask = labels["instance_mask"]._numpy().copy() + # zero_mask = np.zeros((induvidual_masks.shape[2], induvidual_masks.shape[3]), dtype=induvidual_masks.dtype) + class_ids = labels["unique_ids"]._numpy().copy() + + # induvidual_masks_in_image = induvidual_masks[size, :, :, :, :] + # instance_mask_in_image = instance_mask[size, :, :, :] + # contig_mask_in_image = contig_mask[size, :, :, :] + # combined_mask = np.array([[tuple((contig_mask_in_image[i, j], instance_mask_in_image[i, j])) for j in range(contig_mask_in_image.shape[1])] for i in range(contig_mask_in_image.shape[0])]) + + # with open('/depot/davisjam/data/akshath/exps/tf/indu_masks/indu_masks.txt', 'w') as file: + # file.write(str(combined_mask) + '\n') + # file.write(str(np.unique(combined_mask, axis=0)) + '\n') + + # for a in np.unique(instance_mask_in_image): + # plt.imshow(instance_mask_in_image == a) + # plt.savefig(f'/depot/davisjam/data/akshath/exps/tf/indu_masks/my_image__{size}_{a}.png') + + # unique_ids = class_ids[size, :] + # # np.save('/depot/davisjam/data/akshath/exps/tf/indu_masks/instance.npy', instance_mask_in_image) + # return + # for i, class_id in enumerate(unique_ids): + # if class_id != 0: + # print(class_id) + # instance_mask_in_image[instance_mask_in_image == i] + # if induvidual_masks_in_image[i,:,:,:] + # if not np.all((induvidual_masks_in_image[i,:,:,:] == 0) | (induvidual_masks_in_image[i,:,:,:] == mapped_id)): + # induvidual_masks_in_image[i, :, :, :] = np.array([[mapped_id for x in row] for row in induvidual_masks_in_image[i, :, :, :]]) + + for size in range(len(class_ids)): + + # background_non_empty_mask = 0 + labelled_empty_mask = 0 + # background_empty_mask = 0 + + for i, mask in enumerate(induvidual_masks[size, :, :, :, :]): + if class_ids[size][i] != 0: + if np.all(mask == 0): + labelled_empty_mask += 1 + class_ids[size][i] = 0 + + self.labelled_empty_mask[self.temp] = labelled_empty_mask + + with open(os.path.join(self.log_dir, 'background_empty_mask.txt'), 'w') as file: + file.write(str(self.background_empty_mask) + '\n') + with open(os.path.join(self.log_dir, 'labelled_empty_mask.txt'), 'w') as file: + file.write(str(self.labelled_empty_mask) + '\n') + with open(os.path.join(self.log_dir, 'background_non_empty_mask.txt'), 'w') as file: + file.write(str(self.background_non_empty_mask) + '\n') + with open(os.path.join(self.log_dir, 'class_id_counts.txt'), 'w') as file: + file.write(str(self.class_id_counts) + '\n') + with open(os.path.join(self.log_dir, 'class_ids.txt'), 'w') as file: + file.write(str(self.class_ids) + '\n') + return tf.convert_to_tensor(induvidual_masks) + + def train_step(self, inputs: Tuple[Any, Any],model: tf.keras.Model, optimizer: tf.keras.optimizers.Optimizer, metrics: Optional[List[Any]] = None) -> Dict[str, Any]: """ Does forward and backward. @@ -202,8 +362,55 @@ def train_step(self, inputs: Tuple[Any, Any],model: tf.keras.Model, optimizer: t """ features, labels = inputs + + # features = tf.convert_to_tensor(np.load('/depot/davisjam/data/akshath/exps/resnet/raw/features.npy')) + # for val in labels: + # labels[val] = tf.convert_to_tensor(np.load(f'/depot/davisjam/data/akshath/exps/resnet/raw/{val}.npy')) + + # np.save('/depot/davisjam/data/akshath/exps/tf/resnet/raw/features.npy', tf.cast(features, np.float32)._numpy()) + # for lab in labels: + # np.save(f'/depot/davisjam/data/akshath/exps/tf/resnet/raw/{lab}.npy', tf.cast(labels[lab], np.float32)._numpy()) + + + # self.temp += 2 + # all_unique_ids = labels["unique_ids"]._numpy() + # for size in range(all_unique_ids.shape[0]): + # unique_ids = all_unique_ids[size, :] + # for class_id in unique_ids: + # if class_id in self.class_ids: + # self.class_ids[class_id] += 1 + # else: + # self.class_ids[class_id] = 1 + + # print(self.temp) + # with open(os.path.join(self.log_dir, 'class_ids.txt'), 'w') as file: + # file.write(str(self.class_ids) + '\n') + + # self._log_classes(labels) + # labels["individual_masks"] = self._check_induvidual_masks(labels, self._log_classes(labels)) + + # # for param in model.trainable_variables: + # # name = param.name.replace('/', '-') + # # np.save(f"/depot/davisjam/data/akshath/exps/tf/weights_biases/{name}.npy", param.numpy()) + + # # with open('/depot/davisjam/data/akshath/exps/tf/indu_masks/indu_masks.txt', 'w') as file: + # # file.write(str(labels) + '\n') + + + # # raise ValueError('Init') + + # # labels["individual_masks"] = self._check_induvidual_masks(labels, self._log_classes(labels)) + # # labels["contigious_mask"] = self._check_contigious_mask(labels) + with tf.GradientTape() as tape: outputs = model(features, training=True) + # print(backbone_feature_maps_procesed.keys()) + + # for val in backbone_feature_maps_procesed: + # print(backbone_feature_maps_procesed[val]) + # print(backbone_feature_maps_procesed[val].numpy()) + # np.save(os.path.join('/depot/davisjam/data/akshath/exps/resnet/tf', 'backbone_feature_maps_procesed_' + str(val) + '.npy'), backbone_feature_maps_procesed[val].numpy()) + ########################################################## # FIXME : This loop must be used for auxilary outputs loss = 0.0 @@ -226,11 +433,17 @@ def train_step(self, inputs: Tuple[Any, Any],model: tf.keras.Model, optimizer: t ########################################################################## - # TODO : Add auxiallary losses total_loss, cls_loss, focal_loss, dice_loss = self.build_losses(output=outputs, labels=labels) scaled_loss = total_loss + if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): total_loss = optimizer.get_scaled_loss(scaled_loss) + + print('Total loss : ', total_loss) + print('Cls loss : ', cls_loss) + print('Focal loss : ', focal_loss) + print('Dice loss : ', dice_loss) + tvars = model.trainable_variables grads = tape.gradient(scaled_loss,tvars) @@ -243,11 +456,40 @@ def train_step(self, inputs: Tuple[Any, Any],model: tf.keras.Model, optimizer: t pred_labels = tf.argmax(probs, axis=-1) print("Target labels :", labels["unique_ids"]) print("Output labels :", pred_labels) + + # temp = {} + # for grad, param in zip(grads, tvars): + # temp[param.name] = tf.norm(grad).numpy() + + # for param in temp: + # if param not in self.plot_collection: + # self.plot_collection[param] = [] + # else: + # self.plot_collection[param] += [temp[param]] + # self.plot_collection_labels[0] += [len(np.unique(pred_labels).tolist())] + + self.temp += int(os.environ.get('TRAIN_BATCH_SIZE')) + with open(os.path.join(self.log_dir, 'checking_labels.txt'), 'a') as file: + file.write(str(self.temp) + '\n') + file.write(str(labels["unique_ids"].numpy()) + '\n') + file.write(str(pred_labels.numpy())+ '\n') + file.write(f"{total_loss}, {cls_loss}, {focal_loss}, {dice_loss}" + '\n') + file.write('-----------------------------------' + '\n') + + # if (sum(temp.values()) == 0) or (len(np.unique(pred_labels).tolist()) == 1 and np.unique(pred_labels).tolist()[0] == 0): + # with open('/depot/davisjam/data/akshath/exps/tf/editing_layers/numIters.txt', 'a') as file: + # file.write(str('numIters : ' + str(self.temp)) + '\n') + # with open('/depot/davisjam/data/akshath/exps/tf/vishal_plot/dict.txt', 'w') as file: + # file.write(str(self.plot_collection)) + # with open('/depot/davisjam/data/akshath/exps/tf/vishal_plot/dict_labels.txt', 'w') as file: + # file.write(str(self.plot_collection_labels)) + + # raise ValueError('Stop2') # # Multiply for logging. # # Since we expect the gradient replica sum to happen in the optimizer, # # the loss is scaled with global num_boxes and weights. - # # To have it more interpretable/comparable we scale it back when logging. + # # # To have it more interpretable/comparable we scale it back when logging. num_replicas_in_sync = tf.distribute.get_strategy().num_replicas_in_sync total_loss *= num_replicas_in_sync cls_loss *= num_replicas_in_sync diff --git a/models/official/projects/maskformer/train.py b/models/official/projects/maskformer/train.py index a59c91d7..5eba5998 100644 --- a/models/official/projects/maskformer/train.py +++ b/models/official/projects/maskformer/train.py @@ -26,7 +26,6 @@ from official.core import train_lib from official.core import train_utils from official.modeling import performance -from cloud_tpu_client import Client from official.projects.maskformer.configs import maskformer from official.projects.maskformer.tasks import panoptic_maskformer @@ -35,10 +34,13 @@ def main(_): if FLAGS.tpu: + from cloud_tpu_client import Client # This is for configuring the TPU software version programatically c = Client(os.environ['TPU_NAME'], zone=os.environ['TPU_ZONE'], project=os.environ['TPU_PROJECT']) c.configure_tpu_version(os.environ["TPU_SOFTWARE"], restart_type='ifNeeded') - c.wait_for_healthy() + c.wait_for_healthy() + else: + os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async' gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params) diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696795169.gilbreth-g000.rcac.purdue.edu.86093.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696795169.gilbreth-g000.rcac.purdue.edu.86093.0.v2 new file mode 100644 index 00000000..a23927ab Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696795169.gilbreth-g000.rcac.purdue.edu.86093.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696795544.gilbreth-g000.rcac.purdue.edu.88080.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696795544.gilbreth-g000.rcac.purdue.edu.88080.0.v2 new file mode 100644 index 00000000..797b4253 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696795544.gilbreth-g000.rcac.purdue.edu.88080.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696800260.gilbreth-g000.rcac.purdue.edu.105971.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696800260.gilbreth-g000.rcac.purdue.edu.105971.0.v2 new file mode 100644 index 00000000..ad02ab55 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696800260.gilbreth-g000.rcac.purdue.edu.105971.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696800341.gilbreth-g000.rcac.purdue.edu.106666.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696800341.gilbreth-g000.rcac.purdue.edu.106666.0.v2 new file mode 100644 index 00000000..ab649f95 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696800341.gilbreth-g000.rcac.purdue.edu.106666.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696801211.gilbreth-g000.rcac.purdue.edu.109453.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696801211.gilbreth-g000.rcac.purdue.edu.109453.0.v2 new file mode 100644 index 00000000..8f8c0d54 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696801211.gilbreth-g000.rcac.purdue.edu.109453.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696801269.gilbreth-g000.rcac.purdue.edu.110006.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696801269.gilbreth-g000.rcac.purdue.edu.110006.0.v2 new file mode 100644 index 00000000..da864f56 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696801269.gilbreth-g000.rcac.purdue.edu.110006.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696801575.gilbreth-g000.rcac.purdue.edu.111277.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696801575.gilbreth-g000.rcac.purdue.edu.111277.0.v2 new file mode 100644 index 00000000..2e836610 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696801575.gilbreth-g000.rcac.purdue.edu.111277.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696802244.gilbreth-g000.rcac.purdue.edu.113560.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696802244.gilbreth-g000.rcac.purdue.edu.113560.0.v2 new file mode 100644 index 00000000..51d8484e Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696802244.gilbreth-g000.rcac.purdue.edu.113560.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696802695.gilbreth-g000.rcac.purdue.edu.115248.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696802695.gilbreth-g000.rcac.purdue.edu.115248.0.v2 new file mode 100644 index 00000000..d0b6f1c3 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696802695.gilbreth-g000.rcac.purdue.edu.115248.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696803587.gilbreth-g000.rcac.purdue.edu.118187.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696803587.gilbreth-g000.rcac.purdue.edu.118187.0.v2 new file mode 100644 index 00000000..37850292 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696803587.gilbreth-g000.rcac.purdue.edu.118187.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696871050.gilbreth-g007.rcac.purdue.edu.96221.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696871050.gilbreth-g007.rcac.purdue.edu.96221.0.v2 new file mode 100644 index 00000000..43fa6c66 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696871050.gilbreth-g007.rcac.purdue.edu.96221.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696907351.gilbreth-g005.rcac.purdue.edu.122426.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696907351.gilbreth-g005.rcac.purdue.edu.122426.0.v2 new file mode 100644 index 00000000..2015b0fa Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696907351.gilbreth-g005.rcac.purdue.edu.122426.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696907805.gilbreth-g005.rcac.purdue.edu.128399.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696907805.gilbreth-g005.rcac.purdue.edu.128399.0.v2 new file mode 100644 index 00000000..d2080f2e Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696907805.gilbreth-g005.rcac.purdue.edu.128399.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696907918.gilbreth-g005.rcac.purdue.edu.129094.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696907918.gilbreth-g005.rcac.purdue.edu.129094.0.v2 new file mode 100644 index 00000000..a6990459 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696907918.gilbreth-g005.rcac.purdue.edu.129094.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696909420.gilbreth-g005.rcac.purdue.edu.3453.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696909420.gilbreth-g005.rcac.purdue.edu.3453.0.v2 new file mode 100644 index 00000000..f5cf9d4f Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696909420.gilbreth-g005.rcac.purdue.edu.3453.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696913554.gilbreth-g005.rcac.purdue.edu.17701.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696913554.gilbreth-g005.rcac.purdue.edu.17701.0.v2 new file mode 100644 index 00000000..268bcb99 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696913554.gilbreth-g005.rcac.purdue.edu.17701.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696913908.gilbreth-k019.rcac.purdue.edu.98368.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696913908.gilbreth-k019.rcac.purdue.edu.98368.0.v2 new file mode 100644 index 00000000..6b2bb167 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696913908.gilbreth-k019.rcac.purdue.edu.98368.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696914015.gilbreth-k019.rcac.purdue.edu.99870.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696914015.gilbreth-k019.rcac.purdue.edu.99870.0.v2 new file mode 100644 index 00000000..a1fc99c5 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696914015.gilbreth-k019.rcac.purdue.edu.99870.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1696915048.gilbreth-k019.rcac.purdue.edu.118639.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1696915048.gilbreth-k019.rcac.purdue.edu.118639.0.v2 new file mode 100644 index 00000000..ab8e9f24 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1696915048.gilbreth-k019.rcac.purdue.edu.118639.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1699527985.gilbreth-k012.rcac.purdue.edu.121017.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1699527985.gilbreth-k012.rcac.purdue.edu.121017.0.v2 new file mode 100644 index 00000000..e3664331 Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1699527985.gilbreth-k012.rcac.purdue.edu.121017.0.v2 differ diff --git a/models/official/projects/maskformer/train/events.out.tfevents.1699567985.gilbreth-fe00.rcac.purdue.edu.57387.0.v2 b/models/official/projects/maskformer/train/events.out.tfevents.1699567985.gilbreth-fe00.rcac.purdue.edu.57387.0.v2 new file mode 100644 index 00000000..128ada6a Binary files /dev/null and b/models/official/projects/maskformer/train/events.out.tfevents.1699567985.gilbreth-fe00.rcac.purdue.edu.57387.0.v2 differ diff --git a/models/official/projects/maskformer/train_on_cpu.sh b/models/official/projects/maskformer/train_on_cpu.sh index 0f406729..b4f46db8 100755 --- a/models/official/projects/maskformer/train_on_cpu.sh +++ b/models/official/projects/maskformer/train_on_cpu.sh @@ -1,7 +1,7 @@ #!/bin/bash train_bsize=1 eval_bsize=1 -export PYTHONPATH=$PYTHONPATH:~/tf-maskformer/models +export PYTHONPATH=$PYTHONPATH:/depot/davisjam/data/akshath/MaskFormer_tf/tf-maskformer/models export RESNET_CKPT="gs://cam2-models/maskformer_vishal_exps/resnet50_pretrained/tfmg/ckpt-62400" export MODEL_DIR="gs://cam2-models/maskformer_vishal_exps/EXP01_CPU" export TFRECORDS_DIR="gs://cam2-datasets/coco_panoptic/tfrecords" @@ -15,7 +15,7 @@ export OVERRIDES="runtime.distribution_strategy=one_device,runtime.mixed_precisi task.train_data.global_batch_size=$TRAIN_BATCH_SIZE,\ task.model.which_pixel_decoder=transformer_fpn,\ task.init_checkpoint=$RESNET_CKPT" -python3 models/official/projects/maskformer/train.py \ +python3 train.py \ --experiment maskformer_coco_panoptic \ --mode train \ --model_dir $MODEL_DIR \ diff --git a/models/official/projects/maskformer/train_on_gpu.sh b/models/official/projects/maskformer/train_on_gpu.sh index 81c8a16b..5cf44424 100755 --- a/models/official/projects/maskformer/train_on_gpu.sh +++ b/models/official/projects/maskformer/train_on_gpu.sh @@ -1,28 +1,30 @@ #!/bin/bash -# module load gcc/9.3.0 -# cd /depot/qqiu/data/vishal/tf-maskformer/ -# conda activate /depot/qqiu/data/vishal/envs/tmaskformer/ -# module load anaconda/2020.11-py38 -# module load cuda/11.7.0 cudnn/cuda-11.7_8.6 gcc/6.3.0 -train_bsize=8 -eval_bsize=8 -export PYTHONPATH=$PYTHONPATH:/depot/qqiu/data/vishal/tf-maskformer/models + +train_bsize=2 +eval_bsize=1 +export PYTHONPATH=/depot/davisjam/data/akshath/MaskFormer_tf/tf-maskformer/models export MODEL_DIR="./" -export RESNET_CKPT="/depot/davisjam/data/vishal/pretrained_ckpts/tfmg_resnet50/ckpt-62400" export TFRECORDS_DIR="/depot/davisjam/data/vishal/datasets/coco/tfrecords" export TRAIN_BATCH_SIZE=$train_bsize export EVAL_BATCH_SIZE=$eval_bsize -export BASE_LR=0.00005 +export BASE_LR=0.0001 export NO_OBJ_CLS_WEIGHT=0.1 export IMG_SIZE=640 export PRINT_OUTPUTS=True +# Akshath +export ON_GPU=True +export MODE="train" +export LOG_DIR="/depot/davisjam/data/akshath/exps/focal" +export RUN_NUMBER=1 +export USE_BFLOAT16_DATA=False +# export STEPS_PER_EPOCH=10 +export NUM_EPOCH=1 +# export VAL_STEPS=20 export OVERRIDES="runtime.distribution_strategy=one_device,runtime.num_gpus=1,runtime.mixed_precision_dtype=float32,\ task.train_data.global_batch_size=$train_bsize,\ -task.model.which_pixel_decoder=transformer_fpn,\ -task.init_checkpoint=$RESNET_CKPT" -python3 models/official/projects/maskformer/train.py \ +task.model.which_pixel_decoder=transformer_fpn" +python3 train.py \ --experiment maskformer_coco_panoptic \ - --mode eval \ + --mode $MODE \ --model_dir $MODEL_DIR \ - --params_override=$OVERRIDES - + --params_override=$OVERRIDES