We're getting an interesting error when trying to run the GPU code on larger datasets. The command and full traceback follow:
# Launch a small single-GPU deepblast training run.
workers=30
nodes=1
# NOTE(review): $DIR is assumed to be set by the environment/caller to the
# dataset directory holding train.txt/test.txt/valid.txt — confirm before running.
RESULTS="results/small_run_w${workers}_n${nodes}"
# Quote all path expansions so runs don't break on paths containing
# spaces or glob characters.
deepblast-train \
    --train-pairs "$DIR/train.txt" \
    --test-pairs "$DIR/test.txt" \
    --valid-pairs "$DIR/valid.txt" \
    --output-directory "$RESULTS" \
    --num-workers "$workers" \
    --learning-rate 1e-4 \
    --visualization-fraction 0.01 \
    --batch-size 24 \
    --grad-accum 16 \
    --gpus 1
Warning: Error detected in torch::autograd::GraphRoot. Traceback of forward call that caused the error:
File "/home/jmorton/miniconda3/envs/alignment/bin/deepblast-train", line 7, in <module>
exec(compile(f.read(), __file__, 'exec'))
File "/home/jmorton/research/gert/deepblast/scripts/deepblast-train", line 67, in <module>
main(hparams)
File "/home/jmorton/research/gert/deepblast/scripts/deepblast-train", line 47, in main
trainer.fit(model)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 976, in fit
results = self.single_gpu_train(model)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/distrib_parts.py", line 186, in single_gpu_train
results = self.run_pretrain_routine(model)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1180, in run_pretrain_routine
self.train()
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 370, in train
self.run_training_epoch()
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 452, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 627, in run_training_batch
opt_closure_result = self.optimizer_closure(
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 775, in optimizer_closure
training_step_output = self.training_forward(split_batch, batch_idx, opt_idx,
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 946, in training_forward
output = self.model.training_step(*args)
File "/home/jmorton/research/gert/deepblast/deepblast/trainer.py", line 80, in training_step
predA = self.aligner(x, y)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/home/jmorton/research/gert/deepblast/deepblast/alignment.py", line 79, in forward
aln = self.nw.decode(theta, A)
File "/home/jmorton/research/gert/deepblast/deepblast/nw_cuda.py", line 304, in decode
v_grad, _ = torch.autograd.grad(v, (theta, A), create_graph=True)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/autograd/__init__.py", line 156, in grad
return Variable._execution_engine.run_backward(
(print_stack at /opt/conda/conda-bld/pytorch_1591914886554/work/torch/csrc/autograd/python_anomaly_mode.cpp:60)
Traceback (most recent call last):
File "/home/jmorton/miniconda3/envs/alignment/bin/deepblast-train", line 7, in <module>
exec(compile(f.read(), __file__, 'exec'))
File "/home/jmorton/research/gert/deepblast/scripts/deepblast-train", line 67, in <module>
main(hparams)
File "/home/jmorton/research/gert/deepblast/scripts/deepblast-train", line 47, in main
trainer.fit(model)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 976, in fit
results = self.single_gpu_train(model)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/distrib_parts.py", line 186, in single_gpu_train
results = self.run_pretrain_routine(model)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1180, in run_pretrain_routine
self.train()
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 370, in train
self.run_training_epoch()
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 452, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 627, in run_training_batch
opt_closure_result = self.optimizer_closure(
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 775, in optimizer_closure
training_step_output = self.training_forward(split_batch, batch_idx, opt_idx,
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 946, in training_forward
output = self.model.training_step(*args)
File "/home/jmorton/research/gert/deepblast/deepblast/trainer.py", line 80, in training_step
predA = self.aligner(x, y)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/home/jmorton/research/gert/deepblast/deepblast/alignment.py", line 79, in forward
aln = self.nw.decode(theta, A)
File "/home/jmorton/research/gert/deepblast/deepblast/nw_cuda.py", line 304, in decode
v_grad, _ = torch.autograd.grad(v, (theta, A), create_graph=True)
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/autograd/__init__.py", line 156, in grad
return Variable._execution_engine.run_backward(
RuntimeError: CUDA error: an illegal memory access was encountered (operator() at /opt/conda/conda-bld/pytorch_1591914886554/work/aten/src/ATen/native/cuda/CUDAScalar.cu:19)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x4e (0x2aaaf9925b5e in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x2111a53 (0x2aaad4623a53 in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #2: at::native::_local_scalar_dense_cuda(at::Tensor const&) + 0x27 (0x2aaad4625157 in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0xdd2280 (0x2aaad32e4280 in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0xe22b9d (0x2aaacdc1ab9d in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #5: <unknown function> + 0x27f3c99 (0x2aaacf5ebc99 in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0xe22b9d (0x2aaacdc1ab9d in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #7: at::native::item(at::Tensor const&) + 0xc9c (0x2aaacd9187bc in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0xe997e0 (0x2aaacdc917e0 in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #9: <unknown function> + 0x283f42b (0x2aaacf63742b in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #10: <unknown function> + 0xe22b9d (0x2aaacdc1ab9d in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #11: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&) + 0xc4c (0x2aaacf8dc64c in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #12: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x3d2 (0x2aaacf8dded2 in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #13: torch::autograd::Engine::thread_init(int) + 0x39 (0x2aaacf8d6549 in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_cpu.so)
frame #14: torch::autograd::python::PythonEngine::thread_init(int) + 0x38 (0x2aaacc125b08 in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #15: <unknown function> + 0xc8163 (0x2aaac9ccf163 in /home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/torch/lib/../../../.././libstdc++.so.6)
frame #16: <unknown function> + 0x7ea5 (0x2aaaaacd6ea5 in /lib64/libpthread.so.0)
frame #17: clone + 0x6d (0x2aaaaafe98dd in /lib64/libc.so.6)
Exception ignored in: <function tqdm.__del__ at 0x2aab07342ca0>
Traceback (most recent call last):
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/tqdm/std.py", line 1086, in __del__
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/tqdm/std.py", line 1293, in close
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/tqdm/std.py", line 1471, in display
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/tqdm/std.py", line 1089, in __repr__
File "/home/jmorton/miniconda3/envs/alignment/lib/python3.8/site-packages/tqdm/std.py", line 1433, in format_dict
TypeError: cannot unpack non-iterable NoneType object