1. What you did:
I tried to use automatic mixed precision, enabled via a graph rewrite, when training a Mask R-CNN model. As described at https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/train/experimental/enable_mixed_precision_graph_rewrite, I added the following line at the end of the GeneralizedRCNN.optimizer() method in generalized_rcnn.py: opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(opt)
2. What you observed:
When I train the model without the evaluation callback, there is no issue at all. Once the model is trained, loading it with OfflinePredictor also works well. However, if I train the model with the evaluation callback enabled, I get the following error during the first evaluation:
InternalError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/client/session.py in _do_call(self, fn, *args)
1364 try:
-> 1365 return fn(*args)
1366 except errors.OpError as e:
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1349 return self._call_tf_sessionrun(options, feed_dict, fetch_list,
-> 1350 target_list, run_metadata)
1351
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1442 fetch_list, target_list,
-> 1443 run_metadata)
1444
InternalError: 2 root error(s) found.
(0) Internal: Blas GEMM launch failed : a.shape=(12032000, 1), b.shape=(1, 4), m=12032000, n=4, k=1
[[{{node tower-pred-0/fpn/upsample_lat4/Tensordot/MatMul}}]]
(1) Internal: Blas GEMM launch failed : a.shape=(12032000, 1), b.shape=(1, 4), m=12032000, n=4, k=1
[[{{node tower-pred-0/fpn/upsample_lat4/Tensordot/MatMul}}]]
0 successful operations.
0 derived errors ignored.
During handling of the above exception, another exception occurred:
InternalError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/tensorpack/train/interface.py in launch_train_with_config(config, trainer)
97 starting_epoch=config.starting_epoch,
98 max_epoch=config.max_epoch,
---> 99 extra_callbacks=config.extra_callbacks)
100
101
/opt/conda/lib/python3.7/site-packages/tensorpack/train/base.py in train_with_defaults(self, _sentinel, callbacks, monitors, session_creator, session_init, steps_per_epoch, starting_epoch, max_epoch, extra_callbacks)
340 self.train(callbacks, monitors,
341 session_creator, session_init,
--> 342 steps_per_epoch, starting_epoch, max_epoch)
343
344 def __new__(cls, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/tensorpack/train/base.py in train(self, callbacks, monitors, session_creator, session_init, steps_per_epoch, starting_epoch, max_epoch)
312 self.setup_callbacks(callbacks, monitors)
313 self.initialize(session_creator, session_init)
--> 314 self.main_loop(steps_per_epoch, starting_epoch, max_epoch)
315
316 def train_with_defaults(
/opt/conda/lib/python3.7/site-packages/tensorpack/utils/argtools.py in wrapper(*args, **kwargs)
166 cache.add(func)
167
--> 168 return func(*args, **kwargs)
169
170 return wrapper
/opt/conda/lib/python3.7/site-packages/tensorpack/train/base.py in main_loop(self, steps_per_epoch, starting_epoch, max_epoch)
284
285 # trigger epoch outside the timing region.
--> 286 self._callbacks.trigger_epoch()
287 logger.info("Training has finished!")
288 except (StopTraining, tf.errors.OutOfRangeError) as e:
/opt/conda/lib/python3.7/site-packages/tensorpack/callbacks/base.py in trigger_epoch(self)
154
155 def trigger_epoch(self):
--> 156 self._trigger_epoch()
157
158 def _trigger_epoch(self):
/opt/conda/lib/python3.7/site-packages/tensorpack/callbacks/group.py in _trigger_epoch(self)
93 display_name = str(cb)
94 with tm.timed_callback(display_name):
---> 95 cb.trigger_epoch()
96 tm.log()
97
/opt/conda/lib/python3.7/site-packages/tensorpack/callbacks/base.py in trigger_epoch(self)
154
155 def trigger_epoch(self):
--> 156 self._trigger_epoch()
157
158 def _trigger_epoch(self):
/opt/conda/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
/opt/conda/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
/opt/conda/lib/python3.7/concurrent/futures/thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
/home/jovyan/eval.py in predict_dataflow()
--> 157 outputs = predict_image(img, model_func)
/home/jovyan/eval.py in predict_image(img, model_func)
---> 46 outputs = model_func(img)
/opt/conda/lib/python3.7/site-packages/tensorpack/predict/base.py in __call__(self, *dp)
39 list[array]: list of outputs
40 """
---> 41 output = self._do_call(dp)
42 if self.return_input:
43 return (dp, output)
/opt/conda/lib/python3.7/site-packages/tensorpack/predict/base.py in _do_call(self, dp)
134 # run_metadata = tf.RunMetadata()
135 # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
--> 136 return self._callable(*dp)
137
138
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/client/session.py in _generic_run(*feed_args, **kwargs)
1230 feed: feed_val for feed, feed_val in zip(feed_list, feed_args)
1231 }
-> 1232 return self.run(fetches, feed_dict=feed_dict, **kwargs)
1233
1234 return _generic_run
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
954 try:
955 result = self._run(None, fetches, feed_dict, options_ptr,
--> 956 run_metadata_ptr)
957 if run_metadata:
958 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1178 if final_fetches or final_targets or (handle and feed_dict_tensor):
1179 results = self._do_run(handle, final_targets, final_fetches,
-> 1180 feed_dict_tensor, options, run_metadata)
1181 else:
1182 results = []
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1357 if handle is None:
1358 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1359 run_metadata)
1360 else:
1361 return self._do_call(_prun_fn, handle, feeds, fetches)
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/client/session.py in _do_call(self, fn, *args)
1382 '\nsession_config.graph_options.rewrite_options.'
1383 'disable_meta_optimizer = True')
-> 1384 raise type(e)(node_def, op, message)
1385
1386 def _extend_graph(self):
InternalError: 2 root error(s) found.
(0) Internal: Blas GEMM launch failed : a.shape=(12032000, 1), b.shape=(1, 4), m=12032000, n=4, k=1
[[node tower-pred-0/fpn/upsample_lat4/Tensordot/MatMul (defined at /opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]
(1) Internal: Blas GEMM launch failed : a.shape=(12032000, 1), b.shape=(1, 4), m=12032000, n=4, k=1
[[node tower-pred-0/fpn/upsample_lat4/Tensordot/MatMul (defined at /opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]
0 successful operations.
0 derived errors ignored.
Original stack trace for 'tower-pred-0/fpn/upsample_lat4/Tensordot/MatMul':
File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/opt/conda/lib/python3.7/site-packages/traitlets/config/application.py", line 845, in launch_instance
app.start()
File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 612, in start
self.io_loop.start()
File "/opt/conda/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 199, in start
self.asyncio_loop.run_forever()
File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
self._run_once()
File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
handle._run()
File "/opt/conda/lib/python3.7/asyncio/events.py", line 88, in _run
self._context.run(self._callback, *self._args)
File "/opt/conda/lib/python3.7/site-packages/tornado/ioloop.py", line 688, in <lambda>
lambda f: self._run_callback(functools.partial(callback, future))
File "/opt/conda/lib/python3.7/site-packages/tornado/ioloop.py", line 741, in _run_callback
ret = callback()
File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 814, in inner
self.ctx_run(self.run)
File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 775, in run
yielded = self.gen.send(value)
File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 374, in dispatch_queue
yield self.process_one()
File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 250, in wrapper
runner = Runner(ctx_run, result, future, yielded)
File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 741, in __init__
self.ctx_run(self.run)
File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 775, in run
yielded = self.gen.send(value)
File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 358, in process_one
yield gen.maybe_future(dispatch(*args))
File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 234, in wrapper
yielded = ctx_run(next, result)
File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell
yield gen.maybe_future(handler(stream, idents, msg))
File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 234, in wrapper
yielded = ctx_run(next, result)
File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 538, in execute_request
user_expressions, allow_stdin,
File "/opt/conda/lib/python3.7/site-packages/tornado/gen.py", line 234, in wrapper
yielded = ctx_run(next, result)
File "/opt/conda/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 302, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/opt/conda/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 539, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2895, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2940, in _run_cell
return runner(coro)
File "/opt/conda/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
coro.send(None)
File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3166, in run_cell_async
interactivity=interactivity, compiler=compiler, result=result)
File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3357, in run_ast_nodes
if (await self.run_code(code, result, async_=asy)):
File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-f9d37edbca59>", line 23, in <module>
commit_hash = "unknown",
File "/home/jovyan/train.py", line 315, in train_mask_rcnn
launch_train_with_config(traincfg, trainer)
File "/opt/conda/lib/python3.7/site-packages/tensorpack/train/interface.py", line 99, in launch_train_with_config
extra_callbacks=config.extra_callbacks)
File "/opt/conda/lib/python3.7/site-packages/tensorpack/train/base.py", line 342, in train_with_defaults
steps_per_epoch, starting_epoch, max_epoch)
File "/opt/conda/lib/python3.7/site-packages/tensorpack/train/base.py", line 312, in train
self.setup_callbacks(callbacks, monitors)
File "/opt/conda/lib/python3.7/site-packages/tensorpack/utils/argtools.py", line 168, in wrapper
return func(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/tensorpack/train/base.py", line 209, in setup_callbacks
self._callbacks.setup_graph(weakref.proxy(self))
File "/opt/conda/lib/python3.7/site-packages/tensorpack/callbacks/base.py", line 59, in setup_graph
self._setup_graph()
File "/opt/conda/lib/python3.7/site-packages/tensorpack/callbacks/group.py", line 68, in _setup_graph
cb.setup_graph(self.trainer)
File "/opt/conda/lib/python3.7/site-packages/tensorpack/callbacks/base.py", line 59, in setup_graph
self._setup_graph()
File "/home/jovyan/eval.py", line 305, in _setup_graph
self.predictors = [self._build_predictor(k % num_gpu) for k in range(self.num_predictor)]
File "/home/jovyan/eval.py", line 305, in <listcomp>
self.predictors = [self._build_predictor(k % num_gpu) for k in range(self.num_predictor)]
File "/home/jovyan/eval.py", line 319, in _build_predictor
return self.trainer.get_predictor(self._in_names, self._out_names, device=idx)
File "/opt/conda/lib/python3.7/site-packages/tensorpack/train/tower.py", line 136, in get_predictor
self.tower_func(*input.get_input_tensors())
File "/opt/conda/lib/python3.7/site-packages/tensorpack/tfutils/tower.py", line 291, in __call__
output = self._tower_fn(*args)
File "/home/jovyan/modeling/generalized_rcnn.py", line 129, in build_graph
features = self.backbone(image)
File "/home/jovyan/modeling/generalized_rcnn.py", line 307, in backbone
p23456 = fpn_model('fpn', c2345)
File "/opt/conda/lib/python3.7/site-packages/tensorpack/models/registry.py", line 173, in wrapped_func
outputs = func(*args, **actual_args)
File "/home/jovyan/modeling/model_fpn.py", line 65, in fpn_model
lat = lat + upsample2x('upsample_lat{}'.format(6 - idx), lat_sum_5432[-1])
File "/home/jovyan/modeling/model_fpn.py", line 51, in upsample2x
data_format='channels_first')
File "/opt/conda/lib/python3.7/site-packages/tensorpack/models/registry.py", line 173, in wrapped_func
outputs = func(*args, **actual_args)
File "/opt/conda/lib/python3.7/site-packages/tensorpack/models/pool.py", line 127, in FixedUnPooling
ret = tf.tensordot(x, mat, axes=1) # bxcxhxwxshxsw
File "/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/ops/math_ops.py", line 4071, in tensordot
ab_matmul = matmul(a_reshape, b_reshape)
File "/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/util/dispatch.py", line 180, in wrapper
return target(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/ops/math_ops.py", line 2754, in matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
File "/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_math_ops.py", line 6136, in mat_mul
name=name)
File "/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
op_def=op_def)
File "/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
attrs, op_def, compute_device)
File "/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
op_def=op_def)
File "/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
self._traceback = tf_stack.extract_stack()
4. Your environment:
sys.platform linux
Python 3.7.10 | packaged by conda-forge | (default, Feb 19 2021, 16:07:37) [GCC 9.3.0]
Tensorpack v0.10.1-0-g8f831349
Numpy 1.19.5
TensorFlow 1.15.5/v1.15.5-1-g7d0c58b5326
TF Compiler Version 7.3.1 20180303
TF CUDA support True
TF MKL support False
TF XLA support False
Nvidia Driver /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.450.51.06
CUDA /usr/local/cuda-11.0/targets/x86_64-linux/lib/libcudart.so.11.0.221
CUDNN /usr/lib/x86_64-linux-gnu/libcudnn.so.8.0.4
NCCL /usr/lib/x86_64-linux-gnu/libnccl.so.2.7.8
CUDA_VISIBLE_DEVICES Unspecified
GPU 0 Tesla T4
Free RAM 21.86/29.45 GB
CPU Count 8
Horovod 0.21.3
cv2 4.4.0
msgpack 1.0.2
python-prctl False
Question: is it possible to run the evaluation callback while training with automatic mixed precision (given that inference already works when run outside of training), or are changes needed to make it work?