You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
请问在 CPU 集群上运行分布式 TensorFlow 时,把 checkpoint 保存到 HDFS 会报下面的错误(NoSuchMethodError: isEncrypted,随后 MergeV2Checkpoints 失败),这是什么原因?有什么解决办法吗?
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
could not find method isEncrypted from class org/apache/hadoop/fs/FileStatus with signature ()Z
hdfsGetPathInfo(/user/tdw_gilbertchen/model_path/test/2019080400): getFileInfo error:
java.lang.NoSuchMethodError: isEncrypted
INFO:tensorflow:Graph was finalized.
2019-09-04 11:14:49.486416: I tensorflow/core/distributed_runtime/master_session.cc:1161] Start master session 239ec7870b717670 with config: gpu_options { }
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into hdfs://ss-wxg-3-v2/user/tdw_gilbertchen/model_path/test/2019080400/model.ckpt.
Traceback (most recent call last):
File "/data/user/code/mainRun.py", line 150, in <module>
tf.app.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "/data/user/code/mainRun.py", line 137, in main
tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 471, in train_and_evaluate
return executor.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 637, in run
getattr(self, task_to_run)()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 642, in run_chief
return self._start_distributed_training()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 788, in _start_distributed_training
saving_listeners=saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 354, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1207, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1241, in _train_model_default
saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1468, in _train_with_estimator_spec
log_step_count_steps=log_step_count_steps) as mon_sess:
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 504, in MonitoredTrainingSession
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 921, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 643, in __init__
self._sess = _RecoverableSession(self._coordinated_creator)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1107, in __init__
_WrappedSession.__init__(self, self._create_session())
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1112, in _create_session
return self._sess_creator.create_session()
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 807, in create_session
hook.after_create_session(self.tf_sess, self.coord)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/basic_session_run_hooks.py", line 568, in after_create_session
self._save(session, global_step)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/basic_session_run_hooks.py", line 599, in _save
self._get_saver().save(session, self._save_path, global_step=step)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1441, in save
{self.saver_def.filename_tensor_name: checkpoint_file})
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 929, in run
run_metadata_ptr)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1152, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1328, in _do_run
run_metadata)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1348, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: hdfs://ss-wxg-3-v2/user/tdw_gilbertchen/model_path/test/2019080400/model.ckpt-0_temp_6e34385f8bd846499e635fda38324771/part-00000-of-00001.index; Unknown error 255
[[node save/MergeV2Checkpoints (defined at /data/user/code/mainRun.py:137) = MergeV2Checkpoints[delete_old_dirs=true, _device="/job:ps/replica:0/task:0/device:CPU:0"](save/MergeV2Checkpoints/checkpoint_prefixes, _recv_save/Const_0_S581)]]
[[{{node save/Identity_S583}} = _HostRecv[client_terminated=false, recv_device="/job:chief/replica:0/task:0/device:CPU:0", send_device="/job:ps/replica:0/task:0/device:CPU:0", send_device_incarnation=-6548387880355174373, tensor_name="edge_302_save/Identity", tensor_type=DT_STRING, _device="/job:chief/replica:0/task:0/device:CPU:0"]()]]
Caused by op u'save/MergeV2Checkpoints', defined at:
File "/data/user/code/mainRun.py", line 150, in <module>
tf.app.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "/data/user/code/mainRun.py", line 137, in main
tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 471, in train_and_evaluate
return executor.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 637, in run
getattr(self, task_to_run)()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 642, in run_chief
return self._start_distributed_training()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 788, in _start_distributed_training
saving_listeners=saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 354, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1207, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1241, in _train_model_default
saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1468, in _train_with_estimator_spec
log_step_count_steps=log_step_count_steps) as mon_sess:
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 504, in MonitoredTrainingSession
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 921, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 643, in __init__
self._sess = _RecoverableSession(self._coordinated_creator)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1107, in __init__
_WrappedSession.__init__(self, self._create_session())
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1112, in _create_session
return self._sess_creator.create_session()
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 800, in create_session
self.tf_sess = self._session_creator.create_session()
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 557, in create_session
self._scaffold.finalize()
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 215, in finalize
self._saver.build()
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1114, in build
self._build(self._filename, build_save=True, build_restore=True)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1151, in _build
build_save=build_save, build_restore=build_restore)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 786, in _build_internal
save_tensor = self._AddShardedSaveOps(filename_tensor, per_device)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 369, in _AddShardedSaveOps
return self._AddShardedSaveOpsForV2(filename_tensor, per_device)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 351, in _AddShardedSaveOpsForV2
sharded_prefixes, checkpoint_prefix, delete_old_dirs=True)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/gen_io_ops.py", line 473, in merge_v2_checkpoints
delete_old_dirs=delete_old_dirs, name=name)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/lib/python2.7/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
op_def=op_def)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
请问在 CPU 集群上运行分布式 TensorFlow 时,把 checkpoint 保存到 HDFS 会报下面的错误(NoSuchMethodError: isEncrypted,随后 MergeV2Checkpoints 失败),这是什么原因?有什么解决办法吗?
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
could not find method isEncrypted from class org/apache/hadoop/fs/FileStatus with signature ()Z
hdfsGetPathInfo(/user/tdw_gilbertchen/model_path/test/2019080400): getFileInfo error:
java.lang.NoSuchMethodError: isEncrypted
INFO:tensorflow:Graph was finalized.
2019-09-04 11:14:49.486416: I tensorflow/core/distributed_runtime/master_session.cc:1161] Start master session 239ec7870b717670 with config: gpu_options { }
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into hdfs://ss-wxg-3-v2/user/tdw_gilbertchen/model_path/test/2019080400/model.ckpt.
Traceback (most recent call last):
File "/data/user/code/mainRun.py", line 150, in <module>
tf.app.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "/data/user/code/mainRun.py", line 137, in main
tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 471, in train_and_evaluate
return executor.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 637, in run
getattr(self, task_to_run)()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 642, in run_chief
return self._start_distributed_training()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 788, in _start_distributed_training
saving_listeners=saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 354, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1207, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1241, in _train_model_default
saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1468, in _train_with_estimator_spec
log_step_count_steps=log_step_count_steps) as mon_sess:
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 504, in MonitoredTrainingSession
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 921, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 643, in __init__
self._sess = _RecoverableSession(self._coordinated_creator)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1107, in __init__
_WrappedSession.__init__(self, self._create_session())
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1112, in _create_session
return self._sess_creator.create_session()
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 807, in create_session
hook.after_create_session(self.tf_sess, self.coord)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/basic_session_run_hooks.py", line 568, in after_create_session
self._save(session, global_step)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/basic_session_run_hooks.py", line 599, in _save
self._get_saver().save(session, self._save_path, global_step=step)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1441, in save
{self.saver_def.filename_tensor_name: checkpoint_file})
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 929, in run
run_metadata_ptr)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1152, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1328, in _do_run
run_metadata)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1348, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: hdfs://ss-wxg-3-v2/user/tdw_gilbertchen/model_path/test/2019080400/model.ckpt-0_temp_6e34385f8bd846499e635fda38324771/part-00000-of-00001.index; Unknown error 255
[[node save/MergeV2Checkpoints (defined at /data/user/code/mainRun.py:137) = MergeV2Checkpoints[delete_old_dirs=true, _device="/job:ps/replica:0/task:0/device:CPU:0"](save/MergeV2Checkpoints/checkpoint_prefixes, _recv_save/Const_0_S581)]]
[[{{node save/Identity_S583}} = _HostRecv[client_terminated=false, recv_device="/job:chief/replica:0/task:0/device:CPU:0", send_device="/job:ps/replica:0/task:0/device:CPU:0", send_device_incarnation=-6548387880355174373, tensor_name="edge_302_save/Identity", tensor_type=DT_STRING, _device="/job:chief/replica:0/task:0/device:CPU:0"]()]]
Caused by op u'save/MergeV2Checkpoints', defined at:
File "/data/user/code/mainRun.py", line 150, in <module>
tf.app.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "/data/user/code/mainRun.py", line 137, in main
tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 471, in train_and_evaluate
return executor.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 637, in run
getattr(self, task_to_run)()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 642, in run_chief
return self._start_distributed_training()
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/training.py", line 788, in _start_distributed_training
saving_listeners=saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 354, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1207, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1241, in _train_model_default
saving_listeners)
File "/usr/lib/python2.7/site-packages/tensorflow/python/estimator/estimator.py", line 1468, in _train_with_estimator_spec
log_step_count_steps=log_step_count_steps) as mon_sess:
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 504, in MonitoredTrainingSession
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 921, in __init__
stop_grace_period_secs=stop_grace_period_secs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 643, in __init__
self._sess = _RecoverableSession(self._coordinated_creator)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1107, in __init__
_WrappedSession.__init__(self, self._create_session())
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1112, in _create_session
return self._sess_creator.create_session()
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 800, in create_session
self.tf_sess = self._session_creator.create_session()
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 557, in create_session
self._scaffold.finalize()
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 215, in finalize
self._saver.build()
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1114, in build
self._build(self._filename, build_save=True, build_restore=True)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1151, in _build
build_save=build_save, build_restore=build_restore)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 786, in _build_internal
save_tensor = self._AddShardedSaveOps(filename_tensor, per_device)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 369, in _AddShardedSaveOps
return self._AddShardedSaveOpsForV2(filename_tensor, per_device)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 351, in _AddShardedSaveOpsForV2
sharded_prefixes, checkpoint_prefix, delete_old_dirs=True)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/gen_io_ops.py", line 473, in merge_v2_checkpoints
delete_old_dirs=delete_old_dirs, name=name)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/lib/python2.7/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
op_def=op_def)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
UnknownError (see above for traceback): hdfs://ss-wxg-3-v2/user/tdw_gilbertchen/model_path/test/2019080400/model.ckpt-0_temp_6e34385f8bd846499e635fda38324771/part-00000-of-00001.index; Unknown error 255
[[node save/MergeV2Checkpoints (defined at /data/user/code/mainRun.py:137) = MergeV2Checkpoints[delete_old_dirs=true, _device="/job:ps/replica:0/task:0/device:CPU:0"](save/MergeV2Checkpoints/checkpoint_prefixes, _recv_save/Const_0_S581)]]
[[{{node save/Identity_S583}} = _HostRecv[client_terminated=false, recv_device="/job:chief/replica:0/task:0/device:CPU:0", send_device="/job:ps/replica:0/task:0/device:CPU:0", send_device_incarnation=-6548387880355174373, tensor_name="edge_302_save/Identity", tensor_type=DT_STRING, _device="/job:chief/replica:0/task:0/device:CPU:0"]()]]
The text was updated successfully, but these errors were encountered: