GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[INFO] (scyan.model) Training scyan with the following hyperparameters:
"batch_key": None
"batch_size": 8192
"hidden_size": 16
"lr": 0.0005
"max_samples": 200000
"modulo_temp": 3
"n_hidden_layers": 6
"n_layers": 7
"prior_std": 0.3
"temperature": 0.5
"warm_up": (0.35, 4)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
---------------------------------------
0 | module | ScyanModule | 29.6 K
---------------------------------------
29.6 K Trainable params
0 Non-trainable params
29.6 K Total params
0.118 Total estimated model params size (MB)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[5], line 1
----> 1 model.fit(trainer=pl.Trainer(accelerator='gpu', devices=1))
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/model.py:551, in Scyan.fit(self, max_epochs, min_delta, patience, num_workers, log_every_n_steps, callbacks, logger, enable_checkpointing, trainer, **trainer_args)
541 log_every_n_steps = min(log_every_n_steps, len(self.x) // self._batch_size)
542 trainer = pl.Trainer(
543 max_epochs=max_epochs,
544 callbacks=[esc] + (callbacks or []),
(...)
548 **trainer_args,
549 )
--> 551 trainer.fit(self)
553 self._is_fitted = True
554 log.info("Successfully ended training.")
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:608, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
606 model = self._maybe_unwrap_optimized(model)
607 self.strategy._lightning_module = model
--> 608 call._call_and_handle_interrupt(
609 self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
610 )
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:38, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
36 return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
37 else:
---> 38 return trainer_fn(*args, **kwargs)
40 except _TunerExitException:
41 trainer._call_teardown_hook()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:650, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
643 ckpt_path = ckpt_path or self.resume_from_checkpoint
644 self._ckpt_path = self._checkpoint_connector._set_ckpt_path(
645 self.state.fn,
646 ckpt_path, # type: ignore[arg-type]
647 model_provided=True,
648 model_connected=self.lightning_module is not None,
649 )
--> 650 self._run(model, ckpt_path=self.ckpt_path)
652 assert self.state.stopped
653 self.training = False
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1112, in Trainer._run(self, model, ckpt_path)
1108 self._checkpoint_connector.restore_training_state()
1110 self._checkpoint_connector.resume_end()
-> 1112 results = self._run_stage()
1114 log.detail(f"{self.__class__.__name__}: trainer tearing down")
1115 self._teardown()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1191, in Trainer._run_stage(self)
1189 if self.predicting:
1190 return self._run_predict()
-> 1191 self._run_train()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1214, in Trainer._run_train(self)
1211 self.fit_loop.trainer = self
1213 with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1214 self.fit_loop.run()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs)
197 try:
198 self.on_advance_start(*args, **kwargs)
--> 199 self.advance(*args, **kwargs)
200 self.on_advance_end()
201 self._restarting = False
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:267, in FitLoop.advance(self)
265 self._data_fetcher.setup(dataloader, batch_to_device=batch_to_device)
266 with self.trainer.profiler.profile("run_training_epoch"):
--> 267 self._outputs = self.epoch_loop.run(self._data_fetcher)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs)
197 try:
198 self.on_advance_start(*args, **kwargs)
--> 199 self.advance(*args, **kwargs)
200 self.on_advance_end()
201 self._restarting = False
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py:213, in TrainingEpochLoop.advance(self, data_fetcher)
210 self.batch_progress.increment_started()
212 with self.trainer.profiler.profile("run_training_batch"):
--> 213 batch_output = self.batch_loop.run(kwargs)
215 self.batch_progress.increment_processed()
217 # update non-plateau LR schedulers
218 # update epoch-interval ones only when we are at the end of training epoch
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs)
197 try:
198 self.on_advance_start(*args, **kwargs)
--> 199 self.advance(*args, **kwargs)
200 self.on_advance_end()
201 self._restarting = False
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py:88, in TrainingBatchLoop.advance(self, kwargs)
84 if self.trainer.lightning_module.automatic_optimization:
85 optimizers = _get_active_optimizers(
86 self.trainer.optimizers, self.trainer.optimizer_frequencies, kwargs.get("batch_idx", 0)
87 )
---> 88 outputs = self.optimizer_loop.run(optimizers, kwargs)
89 else:
90 outputs = self.manual_loop.run(kwargs)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/loop.py:199, in Loop.run(self, *args, **kwargs)
197 try:
198 self.on_advance_start(*args, **kwargs)
--> 199 self.advance(*args, **kwargs)
200 self.on_advance_end()
201 self._restarting = False
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:202, in OptimizerLoop.advance(self, optimizers, kwargs)
199 def advance(self, optimizers: List[Tuple[int, Optimizer]], kwargs: OrderedDict) -> None:
200 kwargs = self._build_kwargs(kwargs, self.optimizer_idx, self._hiddens)
--> 202 result = self._run_optimization(kwargs, self._optimizers[self.optim_progress.optimizer_position])
203 if result.loss is not None:
204 # automatic optimization assumes a loss needs to be returned for extras to be considered as the batch
205 # would be skipped otherwise
206 self._outputs[self.optimizer_idx] = result.asdict()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:249, in OptimizerLoop._run_optimization(self, kwargs, optimizer)
241 closure()
243 # ------------------------------
244 # BACKWARD PASS
245 # ------------------------------
246 # gradient update with accumulated gradients
247 else:
248 # the `batch_idx` is optional with inter-batch parallelism
--> 249 self._optimizer_step(optimizer, opt_idx, kwargs.get("batch_idx", 0), closure)
251 result = closure.consume_result()
253 if result.loss is not None:
254 # if no result, user decided to skip optimization
255 # otherwise update running loss + reset accumulated loss
256 # TODO: find proper way to handle updating running loss
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:370, in OptimizerLoop._optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
362 rank_zero_deprecation(
363 "The NVIDIA/apex AMP implementation has been deprecated upstream. Consequently, its integration inside"
364 " PyTorch Lightning has been deprecated in v1.9.0 and will be removed in v2.0.0."
(...)
367 " return True."
368 )
369 kwargs["using_native_amp"] = isinstance(self.trainer.precision_plugin, MixedPrecisionPlugin)
--> 370 self.trainer._call_lightning_module_hook(
371 "optimizer_step",
372 self.trainer.current_epoch,
373 batch_idx,
374 optimizer,
375 opt_idx,
376 train_step_and_backward_closure,
377 on_tpu=isinstance(self.trainer.accelerator, TPUAccelerator),
378 **kwargs, # type: ignore[arg-type]
379 using_lbfgs=is_lbfgs,
380 )
382 if not should_accumulate:
383 self.optim_progress.optimizer.step.increment_completed()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1356, in Trainer._call_lightning_module_hook(self, hook_name, pl_module, *args, **kwargs)
1353 pl_module._current_fx_name = hook_name
1355 with self.profiler.profile(f"[LightningModule]{pl_module.__class__.__name__}.{hook_name}"):
-> 1356 output = fn(*args, **kwargs)
1358 # restore current_fx when nested context
1359 pl_module._current_fx_name = prev_fx_name
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/core/module.py:1754, in LightningModule.optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_lbfgs)
1675 def optimizer_step(
1676 self,
1677 epoch: int,
(...)
1683 using_lbfgs: bool = False,
1684 ) -> None:
1685 r"""
1686 Override this method to adjust the default way the :class:`~pytorch_lightning.trainer.trainer.Trainer` calls
1687 each optimizer.
(...)
1752
1753 """
-> 1754 optimizer.step(closure=optimizer_closure)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py:169, in LightningOptimizer.step(self, closure, **kwargs)
166 raise MisconfigurationException("When `optimizer.step(closure)` is called, the closure should be callable")
168 assert self._strategy is not None
--> 169 step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
171 self._on_after_step()
173 return step_output
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py:234, in Strategy.optimizer_step(self, optimizer, opt_idx, closure, model, **kwargs)
232 # TODO(fabric): remove assertion once strategy's optimizer_step typing is fixed
233 assert isinstance(model, pl.LightningModule)
--> 234 return self.precision_plugin.optimizer_step(
235 optimizer, model=model, optimizer_idx=opt_idx, closure=closure, **kwargs
236 )
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:119, in PrecisionPlugin.optimizer_step(self, optimizer, model, optimizer_idx, closure, **kwargs)
117 """Hook to run the optimizer step."""
118 closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure)
--> 119 return optimizer.step(closure=closure, **kwargs)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/torch/optim/optimizer.py:140, in Optimizer._hook_for_profile.<locals>.profile_hook_step.<locals>.wrapper(*args, **kwargs)
138 profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
139 with torch.autograd.profiler.record_function(profile_name):
--> 140 out = func(*args, **kwargs)
141 obj._optimizer_step_code()
142 return out
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/torch/optim/optimizer.py:23, in _use_grad_for_differentiable.<locals>._use_grad(self, *args, **kwargs)
21 try:
22 torch.set_grad_enabled(self.defaults['differentiable'])
---> 23 ret = func(self, *args, **kwargs)
24 finally:
25 torch.set_grad_enabled(prev_grad)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/torch/optim/adam.py:183, in Adam.step(self, closure, grad_scaler)
181 if closure is not None:
182 with torch.enable_grad():
--> 183 loss = closure()
185 for group in self.param_groups:
186 params_with_grad = []
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:105, in PrecisionPlugin._wrap_closure(self, model, optimizer, optimizer_idx, closure)
92 def _wrap_closure(
93 self,
94 model: "pl.LightningModule",
(...)
97 closure: Callable[[], Any],
98 ) -> Any:
99 """This double-closure allows makes sure the ``closure`` is executed before the
100 ``on_before_optimizer_step`` hook is called.
101
102 The closure (generally) runs ``backward`` so this allows inspecting gradients in this hook. This structure is
103 consistent with the ``PrecisionPlugin`` subclasses that cannot pass ``optimizer.step(closure)`` directly.
104 """
--> 105 closure_result = closure()
106 self._after_closure(model, optimizer, optimizer_idx)
107 return closure_result
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:149, in Closure.__call__(self, *args, **kwargs)
148 def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]:
--> 149 self._result = self.closure(*args, **kwargs)
150 return self._result.loss
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:135, in Closure.closure(self, *args, **kwargs)
134 def closure(self, *args: Any, **kwargs: Any) -> ClosureResult:
--> 135 step_output = self._step_fn()
137 if step_output.closure_loss is None:
138 self.warning_cache.warn("`training_step` returned `None`. If this was on purpose, ignore this warning...")
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:419, in OptimizerLoop._training_step(self, kwargs)
410 """Performs the actual train step with the tied hooks.
411
412 Args:
(...)
416 A ``ClosureResult`` containing the training step output.
417 """
418 # manually capture logged metrics
--> 419 training_step_output = self.trainer._call_strategy_hook("training_step", *kwargs.values())
420 self.trainer.strategy.post_training_step()
422 model_output = self.trainer._call_lightning_module_hook("training_step_end", training_step_output)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1494, in Trainer._call_strategy_hook(self, hook_name, *args, **kwargs)
1491 return
1493 with self.profiler.profile(f"[Strategy]{self.strategy.__class__.__name__}.{hook_name}"):
-> 1494 output = fn(*args, **kwargs)
1496 # restore current_fx when nested context
1497 pl_module._current_fx_name = prev_fx_name
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py:378, in Strategy.training_step(self, *args, **kwargs)
376 with self.precision_plugin.train_step_context():
377 assert isinstance(self.model, TrainingStep)
--> 378 return self.model.training_step(*args, **kwargs)
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/model.py:306, in Scyan.training_step(self, data, _)
304 """PyTorch lightning `training_step` implementation (i.e. returning the loss). See [ScyanModule][scyan.module.ScyanModule] for more details."""
305 use_temp = self.current_epoch % self.hparams.modulo_temp > 0
--> 306 loss = self.module.kl(*data, use_temp)
308 self.log("loss", loss, on_epoch=True, on_step=True)
310 return loss
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/module/scyan_module.py:194, in ScyanModule.kl(self, x, covariates, use_temp)
178 def kl(
179 self,
180 x: Tensor,
181 covariates: Tensor,
182 use_temp: bool,
183 ) -> Tuple[Tensor, Tensor]:
184 """Compute the module loss for one mini-batch.
185
186 Args:
(...)
192 The KL loss term.
193 """
--> 194 log_probs, ldj_sum, _ = self.compute_probabilities(x, covariates, use_temp)
196 return -(torch.logsumexp(log_probs, dim=1) + ldj_sum).mean()
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/module/scyan_module.py:174, in ScyanModule.compute_probabilities(self, x, covariates, use_temp)
166 u, _, ldj_sum = self(x, covariates)
168 log_pi = (
169 self.log_pi_temperature(-self.hparams.temperature)
170 if use_temp
171 else self.log_pi
172 )
--> 174 log_probs = self.prior.log_prob(u) + log_pi # size N x P
176 return log_probs, ldj_sum, u
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/module/distribution.py:122, in PriorDistribution.log_prob(self, u)
113 def log_prob(self, u: Tensor) -> Tensor:
114 """Log probability per population.
115
116 Args:
(...)
120 Log probabilities tensor of size $(B, P)$.
121 """
--> 122 diff = self.difference_to_modes(u) # size B x P x M
124 return self.prior_h.log_prob(diff) + self.na_constant_term
File ~/venvs/P3.9.18.scyan/lib/python3.9/site-packages/scyan/module/distribution.py:92, in PriorDistribution.difference_to_modes(self, u)
83 def difference_to_modes(self, u: Tensor) -> Tensor:
84 """Difference between the latent variable $U$ and all the modes (one mode per population).
85
86 Args:
(...)
90 Tensor of size $(B, P, M)$ representing differences to all modes.
91 """
---> 92 diff = u[:, None, :] - self.modes
94 diff[:, self.rho_mask] = torch.clamp(
95 diff[:, self.rho_mask].abs() - self.uniform_law_radius, min=0
96 ) # Handling NA values
98 return diff
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
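
The mismatch is raised in PriorDistribution.difference_to_modes, where the latent tensor u (produced on cuda:0) is subtracted from self.modes, which apparently still lives on the CPU. Below is a minimal debugging sketch, not part of scyan or pytorch-lightning: it assumes `model` is the Scyan instance from the failing cell, and that the culprit is a plain tensor attribute, which nn.Module.to()/.cuda() does not move (only registered parameters and buffers follow the module). The helper name find_cpu_tensors is made up for illustration.

    import torch

    def find_cpu_tensors(module: torch.nn.Module) -> None:
        # Hypothetical helper: move the module to the GPU, then report any
        # bare tensor attributes that stayed behind on the CPU. Registered
        # parameters/buffers are moved by .cuda(); plain attributes are not.
        module = module.cuda()
        for name, sub in module.named_modules():
            for attr, value in vars(sub).items():
                if isinstance(value, torch.Tensor) and not value.is_cuda:
                    print(f"{name or '<root>'}.{attr} is still on {value.device}")

    # find_cpu_tensors(model)  # or model.module, the ScyanModule shown in the summary above

If the immediate goal is just to get training running, passing a CPU trainer, e.g. model.fit(trainer=pl.Trainer(accelerator="cpu")), should side-step the mixed-device error at the cost of speed; whether the offending tensor can simply be registered as a buffer upstream is a question for the scyan maintainers.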