I encountered this error when training the second model. Has anyone else run into the same error? If so, please tell me how to solve it. Thanks a lot.
Main library versions:
torch 1.10.0+cu111
spconv-cu111 2.1.25
mmcv-full 1.4.0
mmdet 2.11.0
mmdet3d 0.11.0
Error:
[Exception|implicit_gemm]feat=torch.Size([290557, 96]),w=torch.Size([128, 3, 3, 3, 96]),pair=torch.Size([27, 131609]),act=131609,issubm=False,istrain=True
SPCONV_DEBUG_SAVE_PATH not found, you can specify SPCONV_DEBUG_SAVE_PATH as debug data save path to save debug data which can be attached in a issue.
[WARNING]your gpu arch (8, 9) isn't compiled in prebuilt, may cause invalid device function. available: {(7, 0), (6, 1), (8, 0), (6, 0), (7, 5), (8, 6), (5, 2)}
[WARNING]your gpu arch (8, 9) isn't compiled in prebuilt, may cause invalid device function. available: {(7, 0), (6, 1), (8, 0), (6, 0), (7, 5), (8, 6), (5, 2)}
[WARNING]your gpu arch (8, 9) isn't compiled in prebuilt, may cause invalid device function. available: {(7, 0), (6, 1), (8, 0), (6, 0), (7, 5), (8, 6), (5, 2)}
[WARNING]your gpu arch (8, 9) isn't compiled in prebuilt, may cause invalid device function. available: {(7, 0), (6, 1), (8, 0), (6, 0), (7, 5), (8, 6), (5, 2)}
[WARNING]your gpu arch (8, 9) isn't compiled in prebuilt, may cause invalid device function. available: {(7, 0), (6, 1), (8, 0), (6, 0), (7, 5), (8, 6), (5, 2)}
[WARNING]your gpu arch (8, 9) isn't compiled in prebuilt, may cause invalid device function. available: {(7, 0), (6, 1), (8, 0), (6, 0), (7, 5), (8, 6), (5, 2)}
Traceback (most recent call last):
File "./tools/train.py", line 287, in
main()
File "./tools/train.py", line 283, in main
meta=meta)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmdet/apis/train.py", line 170, in train_detector
runner.run(data_loaders, cfg.workflow)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 127, in run
epoch_runner(data_loaders[i], **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 50, in train
self.run_iter(data_batch, train_mode=True, **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 30, in run_iter
**kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 52, in train_step
output = self.module.train_step(*inputs[0], **kwargs[0])
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmdet/models/detectors/base.py", line 247, in train_step
losses = self(**data)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmcv/runner/fp16_utils.py", line 98, in new_func
return old_func(*args, **kwargs)
File "/root/autodl-tmp/MSMDFusion-main/mmdet3d/models/detectors/base.py", line 58, in forward
return self.forward_train(**kwargs)
File "/root/autodl-tmp/MSMDFusion-main/mmdet3d/models/detectors/MSMDFusion.py", line 530, in forward_train
points, img=img, img_metas=img_metas)
File "/root/autodl-tmp/MSMDFusion-main/mmdet3d/models/detectors/MSMDFusion.py", line 458, in extract_feat
pts_feats = self.extract_pts_feat(points, img_feats, img_metas)
File "/root/autodl-tmp/MSMDFusion-main/mmdet3d/models/detectors/MSMDFusion.py", line 438, in extract_pts_feat
self.radius_list, self.max_cluster_samples_list, self.dist_thresh_list)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/MSMDFusion-main/mmdet3d/models/middle_encoders/sparse_multimodal_encoder_painting.py", line 456, in forward
voxel_stage_out_ds = getattr(self.downscale_blocks, stage_name)(voxel_stage_out)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/pytorch/modules.py", line 137, in forward
input = module(input)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/pytorch/conv.py", line 447, in forward
input._timer, self.fp32_accum)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/torch/cuda/amp/autocast_mode.py", line 94, in decorate_fwd
return fwd(*args, **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/pytorch/functional.py", line 200, in forward
raise e
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/pytorch/functional.py", line 191, in forward
fp32_accum)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/pytorch/ops.py", line 1118, in implicit_gemm
fp32_accum=fp32_accum)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/algo.py", line 620, in tune_and_cache
output = output.clone()
RuntimeError: TensorStorage /io/include/tensorview/tensor.h 168
cuda failed with error 2 out of memory. use CUDA_LAUNCH_BLOCKING=1 to get correct traceback.
Traceback (most recent call last):
File "./tools/train.py", line 287, in
main()
File "./tools/train.py", line 283, in main
meta=meta)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmdet/apis/train.py", line 170, in train_detector
runner.run(data_loaders, cfg.workflow)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 127, in run
epoch_runner(data_loaders[i], **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 50, in train
self.run_iter(data_batch, train_mode=True, **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 30, in run_iter
**kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 52, in train_step
output = self.module.train_step(*inputs[0], **kwargs[0])
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmdet/models/detectors/base.py", line 247, in train_step
losses = self(**data)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/mmcv/runner/fp16_utils.py", line 98, in new_func
return old_func(*args, **kwargs)
File "/root/autodl-tmp/MSMDFusion-main/mmdet3d/models/detectors/base.py", line 58, in forward
return self.forward_train(**kwargs)
File "/root/autodl-tmp/MSMDFusion-main/mmdet3d/models/detectors/MSMDFusion.py", line 530, in forward_train
points, img=img, img_metas=img_metas)
File "/root/autodl-tmp/MSMDFusion-main/mmdet3d/models/detectors/MSMDFusion.py", line 458, in extract_feat
pts_feats = self.extract_pts_feat(points, img_feats, img_metas)
File "/root/autodl-tmp/MSMDFusion-main/mmdet3d/models/detectors/MSMDFusion.py", line 438, in extract_pts_feat
self.radius_list, self.max_cluster_samples_list, self.dist_thresh_list)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/MSMDFusion-main/mmdet3d/models/middle_encoders/sparse_multimodal_encoder_painting.py", line 456, in forward
voxel_stage_out_ds = getattr(self.downscale_blocks, stage_name)(voxel_stage_out)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/pytorch/modules.py", line 137, in forward
input = module(input)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/pytorch/conv.py", line 447, in forward
input._timer, self.fp32_accum)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/torch/cuda/amp/autocast_mode.py", line 94, in decorate_fwd
return fwd(*args, **kwargs)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/pytorch/functional.py", line 200, in forward
raise e
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/pytorch/functional.py", line 191, in forward
fp32_accum)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/pytorch/ops.py", line 1118, in implicit_gemm
fp32_accum=fp32_accum)
File "/root/autodl-tmp/envs/openMmlab/lib/python3.7/site-packages/spconv/algo.py", line 620, in tune_and_cache
output = output.clone()
RuntimeError: TensorStorage /io/include/tensorview/tensor.h 168
cuda failed with error 2 out of memory. use CUDA_LAUNCH_BLOCKING=1 to get correct traceback.