GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[INFO] (scyan.model) Training scyan with the following hyperparameters:
"batch_key": None
"batch_size": 8192
"hidden_size": 16
"lr": 0.0005
"max_samples": 200000
"modulo_temp": 3
"n_hidden_layers": 6
"n_layers": 7
"prior_std": 0.3
"temperature": 0.5
"warm_up": (0.35, 4)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
---------------------------------------
0 | module | ScyanModule | 29.6 K
---------------------------------------
29.6 K Trainable params
0 Non-trainable params
29.6 K Total params
0.118 Total estimated model params size (MB)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[5], line 1
----> 1 model.fit(trainer=pl.Trainer(accelerator='gpu', devices=1))
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/model.py:551, in Scyan.fit(self, max_epochs, min_delta, patience, num_workers, log_every_n_steps, callbacks, logger, enable_checkpointing, trainer, **trainer_args)
541 log_every_n_steps = min(log_every_n_steps, len(self.x) // self._batch_size)
542 trainer = pl.Trainer(
543 max_epochs=max_epochs,
544 callbacks=[esc] + (callbacks or []),
(...)
548 **trainer_args,
549 )
--> 551 trainer.fit(self)
553 self._is_fitted = True
554 log.info("Successfully ended training.")
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:608, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
606 model = self._maybe_unwrap_optimized(model)
607 self.strategy._lightning_module = model
--> 608 call._call_and_handle_interrupt(
609 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
610 )
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:38, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
36 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
37 else:
---> 38 return trainer_fn(*args, **kwargs)
40 except _TunerExitException:
41 trainer._call_teardown_hook()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:650, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
643 ckpt_path = ckpt_path or self.resume_from_checkpoint
644 self._ckpt_path = self._checkpoint_connector._set_ckpt_path(
645 self.state.fn,
646 ckpt_path, # type: ignore[arg-type]
647 model_provided=True,
648 model_connected=self.lightning_module is not None,
649 )
--> 650 self._run(model, ckpt_path=self.ckpt_path)
652 assert self.state.stopped
653 self.training = False
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1112, in Trainer._run(self, model, ckpt_path)
1108 self._checkpoint_connector.restore_training_state()
1110 self._checkpoint_connector.resume_end()
-> 1112 results = self._run_stage()
1114 log.detail(f"{self.__class__.__name__}: trainer tearing down")
1115 self._teardown()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1191, in Trainer._run_stage(self)
1189 if self.predicting:
1190 return self._run_predict()
-> 1191 self._run_train()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1214, in Trainer._run_train(self)
1211 self.fit_loop.trainer = self
1213 with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1214 self.fit_loop.run()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs)
197 try:
198 self.on_advance_start(*args, **kwargs)
--> 199 self.advance(*args, **kwargs)
200 self.on_advance_end()
201 self._restarting = False
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:267, in FitLoop.advance(self)
265 self._data_fetcher.setup(dataloader, batch_to_device=batch_to_device)
266 with self.trainer.profiler.profile("run_training_epoch"):
--> 267 self._outputs = self.epoch_loop.run(self._data_fetcher)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs)
197 try:
198 self.on_advance_start(*args, **kwargs)
--> 199 self.advance(*args, **kwargs)
200 self.on_advance_end()
201 self._restarting = False
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py:213, in TrainingEpochLoop.advance(self, data_fetcher)
210 self.batch_progress.increment_started()
212 with self.trainer.profiler.profile("run_training_batch"):
--> 213 batch_output = self.batch_loop.run(kwargs)
215 self.batch_progress.increment_processed()
217 # update non-plateau LR schedulers
218 # update epoch-interval ones only when we are at the end of training epoch
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs)
197 try:
198 self.on_advance_start(*args, **kwargs)
--> 199 self.advance(*args, **kwargs)
200 self.on_advance_end()
201 self._restarting = False
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py:88, in TrainingBatchLoop.advance(self, kwargs)
84 if self.trainer.lightning_module.automatic_optimization:
85 optimizers = _get_active_optimizers(
86 self.trainer.optimizers, self.trainer.optimizer_frequencies, kwargs.get("batch_idx", 0)
87 )
---> 88 outputs = self.optimizer_loop.run(optimizers, kwargs)
89 else:
90 outputs = self.manual_loop.run(kwargs)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs)
197 try:
198 self.on_advance_start(*args, **kwargs)
--> 199 self.advance(*args, **kwargs)
200 self.on_advance_end()
201 self._restarting = False
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:202, in OptimizerLoop.advance(self, optimizers, kwargs)
199 def advance(self, optimizers: List[Tuple[int, Optimizer]], kwargs: OrderedDict) -> None:
200 kwargs = self._build_kwargs(kwargs, self.optimizer_idx, self._hiddens)
--> 202 result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
203 if result.loss is not None:
204 # automatic optimization assumes a loss needs to be returned for extras to be considered as the batch
205 # would be skipped otherwise
206 self._outputs[self.optimizer_idx] = result.asdict()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:249, in OptimizerLoop._run_optimization(self, kwargs, optimizer)
241 closure()
243 # ------------------------------
244 # BACKWARD PASS
245 # ------------------------------
246 # gradient update with accumulated gradients
247 else:
248 # the `batch_idx` is optional with inter-batch parallelism
--> 249 self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
251 result = closure.consume_result()
253 if result.loss is not None:
254 # if no result, user decided to skip optimization
255 # otherwise update running loss + reset accumulated loss
256 # TODO: find proper way to handle updating running loss
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:370, in OptimizerLoop._optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
362 rank_zero_deprecation(
363 "The NVIDIA/apex AMP implementation has been deprecated upstream. Consequently, its integration inside"
364 " PyTorch Lightning has been deprecated in v1.9.0 and will be removed in v2.0.0."
(...)
367 " return True."
368 )
369 kwargs["using_native_amp"] = isinstance(self.trainer.precision_plugin, MixedPrecisionPlugin)
--> 370 self.trainer._call_lightning_module_hook(
371 "optimizer_step",
372 self.trainer.current_epoch,
373 batch_idx,
374 optimizer,
375 opt_idx,
376 train_step_and_backward_closure,
377 on_tpu=isinstance(self.trainer.accelerator, TPUAccelerator),
378 **kwargs, # type: ignore[arg-type]
379 using_lbfgs=is_lbfgs,
380 )
382 if not should_accumulate:
383 self.optim_progress.optimizer.step.increment_completed()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1356, in Trainer._call_lightning_module_hook(self, hook_name, pl_module, *args, **kwargs)
1353 pl_module._current_fx_name = hook_name
1355 with self.profiler.profile(f"[LightningModule]{pl_module.__class__.__name__}.{hook_name}"):
-> 1356 output = fn(*args, **kwargs)
1358 # restore current_fx when nested context
1359 pl_module._current_fx_name = prev_fx_name
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/core/module.py:1754, in LightningModule.optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_lbfgs)
1675 def optimizer_step(
1676 self,
1677 epoch: int,
(...)
1683 using_lbfgs: bool = False,
1684 ) -> None:
1685 r"""
1686 Override this method to adjust the default way the :class:`~pytorch_lightning.trainer.trainer.Trainer` calls
1687 each optimizer.
(...)
1752
1753 """
-> 1754 optimizer.step(closure=optimizer_closure)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py:169, in LightningOptimizer.step(self, closure, **kwargs)
166 raise MisconfigurationException("When `optimizer.step(closure)` is called, the closure should be callable")
168 assert self._strategy is not None
--> 169 step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
171 self._on_after_step()
173 return step_output
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py:234, in Strategy.optimizer_step(self, optimizer, opt_idx, closure, model, **kwargs)
232 # TODO(fabric): remove assertion once strategy's optimizer_step typing is fixed
233 assert isinstance(model, pl.LightningModule)
--> 234 return self.precision_plugin.optimizer_step(
235 optimizer, model=model, optimizer_idx=opt_idx, closure=closure, **kwargs
236 )
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:119, in PrecisionPlugin.optimizer_step(self, optimizer, model, optimizer_idx, closure, **kwargs)
117 """Hook to run the optimizer step."""
118 closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure)
--> 119 return optimizer.step(closure=closure, **kwargs)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/torch/optim/optimizer.py:140, in Optimizer._hook_for_profile.<locals>.profile_hook_step.<locals>.wrapper(*args, **kwargs)
138 profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
139 with torch.autograd.profiler.record_function(profile_name):
--> 140 out = func(*args, **kwargs)
141 obj._optimizer_step_code()
142 return out
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/torch/optim/optimizer.py:23, in _use_grad_for_differentiable.<locals>._use_grad(self, *args, **kwargs)
21 try:
22 torch.set_grad_enabled(self.defaults['differentiable'])
---> 23 ret = func(self, *args, **kwargs)
24 finally:
25 torch.set_grad_enabled(prev_grad)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/torch/optim/adam.py:183, in Adam.step(self, closure, grad_scaler)
181 if closure is not None:
182 with torch.enable_grad():
--> 183 loss = closure()
185 for group in self.param_groups:
186 params_with_grad = []
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:105, in PrecisionPlugin._wrap_closure(self, model, optimizer, optimizer_idx, closure)
92 def _wrap_closure(
93 self,
94 model: "pl.LightningModule",
(...)
97 closure: Callable[[], Any],
98 ) -> Any:
99 """This double-closure allows makes sure the ``closure`` is executed before the
100 ``on_before_optimizer_step`` hook is called.
101
102 The closure (generally) runs ``backward`` so this allows inspecting gradients in this hook. This structure is
103 consistent with the ``PrecisionPlugin`` subclasses that cannot pass ``optimizer.step(closure)`` directly.
104 """
--> 105 closure_result = closure()
106 self._after_closure(model, optimizer, optimizer_idx)
107 return closure_result
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:149, in Closure.__call__(self, *args, **kwargs)
148 def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]:
--> 149 self._result = self.closure(*args, **kwargs)
150 return self._result.loss
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:135, in Closure.closure(self, *args, **kwargs)
134 def closure(self, *args: Any, **kwargs: Any) -> ClosureResult:
--> 135 step_output = self._step_fn()
137 if step_output.closure_loss is None:
138 self.warning_cache.warn("`training_step` returned `None`. If this was on purpose, ignore this warning...")
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:419, in OptimizerLoop._training_step(self, kwargs)
410 """Performs the actual train step with the tied hooks.
411
412 Args:
(...)
416 A ``ClosureResult`` containing the training step output.
417 """
418 # manually capture logged metrics
--> 419 training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values())
420 self.trainer.strategy.post_training_step()
422 model_output = self.trainer._call_lightning_module_hook("training_step_end", training_step_output)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1494, in Trainer._call_strategy_hook(self, hook_name, *args, **kwargs)
1491 return
1493 with self.profiler.profile(f"[Strategy]{self.strategy.__class__.__name__}.{hook_name}"):
-> 1494 output = fn(*args, **kwargs)
1496 # restore current_fx when nested context
1497 pl_module._current_fx_name = prev_fx_name
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py:378, in Strategy.training_step(self, *args, **kwargs)
376 with self.precision_plugin.train_step_context():
377 assert isinstance(self.model, TrainingStep)
--> 378 return self.model.training_step(*args, **kwargs)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/model.py:306, in Scyan.training_step(self, data, _)
304 """PyTorch lightning `training_step` implementation (i.e. returning the loss). See [ScyanModule][scyan.module.ScyanModule] for more details."""
305 use_temp = self.current_epoch % self.hparams.modulo_temp > 0
--> 306 loss = self.module.kl(*data, use_temp)
308 self.log("loss", loss, on_epoch=True, on_step=True)
310 return loss
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/module/scyan_module.py:194, in ScyanModule.kl(self, x, covariates, use_temp)
178 def kl(
179 self,
180 x: Tensor,
181 covariates: Tensor,
182 use_temp: bool,
183 ) -> Tuple[Tensor, Tensor]:
184 """Compute the module loss for one mini-batch.
185
186 Args:
(...)
192 The KL loss term.
193 """
--> 194 log_probs, ldj_sum, _ = self.compute_probabilities(x, covariates, use_temp)
196 return -(torch.logsumexp(log_probs, dim=1) + ldj_sum).mean()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/module/scyan_module.py:174, in ScyanModule.compute_probabilities(self, x, covariates, use_temp)
166 u, _, ldj_sum = self(x, covariates)
168 log_pi = (
169 self.log_pi_temperature(-self.hparams.temperature)
170 if use_temp
171 else self.log_pi
172 )
--> 174 log_probs = self.prior.log_prob(u) + log_pi # size N x P
176 return log_probs, ldj_sum, u
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/module/distribution.py:122, in PriorDistribution.log_prob(self, u)
113 def log_prob(self, u: Tensor) -> Tensor:
114 """Log probability per population.
115
116 Args:
(...)
120 Log probabilities tensor of size $(B, P)$.
121 """
--> 122 diff = self.difference_to_modes(u) # size B x P x M
124 return self.prior_h.log_prob(diff) + self.na_constant_term
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/module/distribution.py:92, in PriorDistribution.difference_to_modes(self, u)
83 def difference_to_modes(self, u: Tensor) -> Tensor:
84 """Difference between the latent variable $U$ and all the modes (one mode per population).
85
86 Args:
(...)
90 Tensor of size $(B, P, M)$ representing differences to all modes.
91 """
---> 92 diff = u[:, None, :] - self.modes
94 diff[:, self.rho_mask] = torch.clamp(
95 diff[:, self.rho_mask].abs() - self.uniform_law_radius, min=0
96 ) # Handling NA values
98 return diff
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
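
The mismatch is raised in PriorDistribution.difference_to_modes, where the latent tensor u (produced on cuda:0) is subtracted from self.modes, which apparently still lives on the CPU. Below is a minimal debugging sketch, not part of scyan or pytorch-lightning: it assumes `model` is the Scyan instance from the failing cell, and that the culprit is a plain tensor attribute, which nn.Module.to()/.cuda() does not move (only registered parameters and buffers follow the module). The helper name find_cpu_tensors is made up for illustration.

    import torch

    def find_cpu_tensors(module: torch.nn.Module) -> None:
        # Hypothetical helper: move the module to the GPU, then report any
        # bare tensor attributes that stayed behind on the CPU. Registered
        # parameters/buffers are moved by .cuda(); plain attributes are not.
        module = module.cuda()
        for name, sub in module.named_modules():
            for attr, value in vars(sub).items():
                if isinstance(value, torch.Tensor) and not value.is_cuda:
                    print(f"{name or '<root>'}.{attr} is still on {value.device}")

    # find_cpu_tensors(model)  # or model.module, the ScyanModule shown in the summary above

If the immediate goal is just to get training running, passing a CPU trainer, e.g. model.fit(trainer=pl.Trainer(accelerator="cpu")), should side-step the mixed-device error at the cost of speed; whether the offending tensor can simply be registered as a buffer upstream is a question for the scyan maintainers.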