I see a very weird error when I try to train a CTC model with the GPU version of TensorFlow; the CPU version does not have this problem. The error is raised by the line train_step.run(fd):
try:
for i_batch in range(TRAIN_STEPS):
fd = train_iterator.next_feed(BATCH_SIZE)
train_step.run(fd) <---------
The error is:
NotFoundError: Resource __per_step_4/_tensor_arraysmap/TensorArray_1_85/N10tensorflow11TensorArrayE does not exist.
I could not find any helpful material to figure out what the problem is.
The full log is attached below.
NotFoundError Traceback (most recent call last)
/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1321 try:
-> 1322 return fn(*args)
1323 except errors.OpError as e:
/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1306 return self._call_tf_sessionrun(
-> 1307 options, feed_dict, fetch_list, target_list, run_metadata)
1308
/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1408 self._session, options, feed_dict, fetch_list, target_list,
-> 1409 run_metadata)
1410 else:
NotFoundError: Resource __per_step_4/_tensor_arraysmap/TensorArray_1_85/N10tensorflow11TensorArrayE does not exist.
[[Node: gradients/map/TensorArrayStack/TensorArrayGatherV3_grad/TensorArrayGrad/TensorArrayGradV3 = TensorArrayGradV3[_class=["loc:@map/TensorArray_1"], source="gradients", _device="/job:localhost/replica:0/task:0/device:CPU:0"](map/TensorArray_1/_153, map/while/Exit_1/_155)]]
[[Node: gradients/map/while/map/while/TensorArrayWrite/TensorArrayWriteV3_grad/tuple/control_dependency/_249 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_2317_...dependency", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
During handling of the above exception, another exception occurred:
NotFoundError Traceback (most recent call last)
in ()
6 for i_batch in range(TRAIN_STEPS):
7 fd = train_iterator.next_feed(BATCH_SIZE)
----> 8 train_step.run(fd)
9 if i_batch % LOSS_ITER == 0:
10 # Plotting loss
/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/framework/ops.py in run(self, feed_dict, session)
2375 none, the default session will be used.
2376 """
-> 2377 _run_using_default_session(self, feed_dict, self.graph, session)
2378
2379 _gradient_registry = registry.Registry("gradient")
/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/framework/ops.py in _run_using_default_session(operation, feed_dict, graph, session)
5213 "the operation's graph is different from the session's "
5214 "graph.")
-> 5215 session.run(operation, feed_dict)
5216
5217
/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
898 try:
899 result = self._run(None, fetches, feed_dict, options_ptr,
--> 900 run_metadata_ptr)
901 if run_metadata:
902 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1133 if final_fetches or final_targets or (handle and feed_dict_tensor):
1134 results = self._do_run(handle, final_targets, final_fetches,
-> 1135 feed_dict_tensor, options, run_metadata)
1136 else:
1137 results = []
/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1314 if handle is None:
1315 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1316 run_metadata)
1317 else:
1318 return self._do_call(_prun_fn, handle, feeds, fetches)
/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1333 except KeyError:
1334 pass
-> 1335 raise type(e)(node_def, op, message)
1336
1337 def _extend_graph(self):
NotFoundError: Resource __per_step_4/_tensor_arraysmap/TensorArray_1_85/N10tensorflow11TensorArrayE does not exist.
[[Node: gradients/map/TensorArrayStack/TensorArrayGatherV3_grad/TensorArrayGrad/TensorArrayGradV3 = TensorArrayGradV3[_class=["loc:@map/TensorArray_1"], source="gradients", _device="/job:localhost/replica:0/task:0/device:CPU:0"](map/TensorArray_1/_153, map/while/Exit_1/_155)]]
[[Node: gradients/map/while/map/while/TensorArrayWrite/TensorArrayWriteV3_grad/tuple/control_dependency/_249 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_2317_...dependency", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
Caused by op 'gradients/map/TensorArrayStack/TensorArrayGatherV3_grad/TensorArrayGrad/TensorArrayGradV3', defined at:
File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
"main", mod_spec)
File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in
app.launch_new_instance()
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 486, in start
self.io_loop.start()
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tornado/platform/asyncio.py", line 127, in start
self.asyncio_loop.run_forever()
File "/usr/lib/python3.5/asyncio/base_events.py", line 345, in run_forever
self._run_once()
File "/usr/lib/python3.5/asyncio/base_events.py", line 1312, in _run_once
handle._run()
File "/usr/lib/python3.5/asyncio/events.py", line 125, in _run
self._callback(*self._args)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tornado/ioloop.py", line 759, in _run_callback
ret = callback()
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tornado/stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 536, in
self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
self._handle_recv()
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
self._run_callback(callback, msg)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
callback(*args, **kwargs)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tornado/stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
handler(stream, idents, msg)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
if self.run_code(code, result):
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "", line 8, in
train_step = optimizer.minimize(loss, name='train_step')
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/training/optimizer.py", line 414, in minimize
grad_loss=grad_loss)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/training/optimizer.py", line 526, in compute_gradients
colocate_gradients_with_ops=colocate_gradients_with_ops)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py", line 494, in gradients
gate_gradients, aggregation_method, stop_gradients)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py", line 636, in _GradientsHelper
lambda: grad_fn(op, *out_grads))
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py", line 385, in _MaybeCompile
return grad_fn() # Exit early
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py", line 636, in
lambda: grad_fn(op, *out_grads))
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/tensor_array_grad.py", line 161, in _TensorArrayGatherGrad
.grad(source=grad_source, flow=flow))
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/tensor_array_ops.py", line 849, in grad
return self._implementation.grad(source, flow=flow, name=name)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/tensor_array_ops.py", line 241, in grad
handle=self._handle, source=source, flow_in=flow, name=name)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 6229, in tensor_array_grad_v3
name=name)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
op_def=op_def)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1718, in init
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
...which was originally created as op 'map/TensorArrayStack/TensorArrayGatherV3', defined at:
File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
"main", mod_spec)
[elided 23 identical lines from previous traceback]
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "", line 9, in
dtype=tf.float32)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/functional_ops.py", line 424, in map_fn
results_flat = [r.stack() for r in r_a]
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/functional_ops.py", line 424, in
results_flat = [r.stack() for r in r_a]
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/tensor_array_ops.py", line 893, in stack
return self._implementation.stack(name=name)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/tensor_array_ops.py", line 291, in stack
return self.gather(math_ops.range(0, self.size()), name=name)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/tensor_array_ops.py", line 305, in gather
element_shape=element_shape)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 6018, in tensor_array_gather_v3
flow_in=flow_in, dtype=dtype, element_shape=element_shape, name=name)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
op_def=op_def)
File "/mnt/tensor2/python3/HR/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1718, in init
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
NotFoundError (see above for traceback): Resource __per_step_4/_tensor_arraysmap/TensorArray_1_85/N10tensorflow11TensorArrayE does not exist.
[[Node: gradients/map/TensorArrayStack/TensorArrayGatherV3_grad/TensorArrayGrad/TensorArrayGradV3 = TensorArrayGradV3[_class=["loc:@map/TensorArray_1"], source="gradients", _device="/job:localhost/replica:0/task:0/device:CPU:0"](map/TensorArray_1/_153, map/while/Exit_1/_155)]]
[[Node: gradients/map/while/map/while/TensorArrayWrite/TensorArrayWriteV3_grad/tuple/control_dependency/_249 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_2317_...dependency", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
bug