Giter Club home page Giter Club logo

bigpls's Introduction

bigPLS

bigPLS, PLS models and their extension for big data in R

Frédéric Bertrand and Myriam Maumy-Bertrand

Lifecycle: stable Project Status: Active – The project has reached a stable, usable state and is being actively developed. R-CMD-check Codecov test coverage CRAN status CRAN RStudio mirror downloads GitHub Repo stars DOI

The goal of bigPLS is to provide Cox models in a high-dimensional setting in R.

Support for parallel computation and GPU is being developed.

This website and these examples were created by F. Bertrand and M. Maumy-Bertrand.

Installation

You can install the released version of bigPLS from CRAN with:

install.packages("bigPLS")

You can install the development version of bigPLS from github with:

devtools::install_github("fbertran/bigPLS")

Example

Allelotyping real dataset

The dataset

# Load the allelotyping example data shipped with plsRcox.
library(plsRcox)
data(micro.censure)
# Survival time (years) and censoring indicator, split into a
# training set (first 80 patients) and a test set (remaining 37).
Y_train_micro <- micro.censure$survyear[1:80]
C_train_micro <- micro.censure$DC[1:80]
Y_test_micro <- micro.censure$survyear[81:117]
C_test_micro <- micro.censure$DC[81:117]

# Completed/imputed covariate matrix for the same patients.
data(Xmicro.censure_compl_imp)
# Coerce every column to numeric, then keep the 80 training rows.
X_train_micro <- apply((as.matrix(Xmicro.censure_compl_imp)),FUN="as.numeric",MARGIN=2)[1:80,]
X_train_micro_df <- data.frame(X_train_micro)

# Keep the original (untransformed) covariates as well, for the
# model-matrix / formula interfaces used below.
X_train_micro_orig <- Xmicro.censure_compl_imp[1:80,]
X_train_micro_orig_df <- data.frame(X_train_micro_orig)

Compute deviance residuals with some options.

# Deviance residuals of the (null) Cox model on the training set,
# with a diagnostic plot.
head(computeDR(Y_train_micro,C_train_micro,plot=TRUE))

Simulated data

Generate dataset

set.seed(4669)
library(bigPLS)
# Two binary covariates for 10^4 observations.
x_sim <- matrix(sample(0:1, size = 20000, replace = TRUE), ncol = 2)
# Simulate Cox survival data (Weibull baseline with parameters
# lambda and rho) with censoring rate cens.rate.
dCox_sim <- dataCox(10^4, lambda = 3, rho = 2, x_sim,
beta = c(1,3), cens.rate = 5)
# NOTE(review): this data() call loads the packaged dCox_sim dataset and
# so overwrites the simulation just produced — confirm it is intentional.
data(dCox_sim)

Compute deviance residuals with some options.

# Deviance residuals for the simulated data, with a diagnostic plot.
with(dCox_sim,head(computeDR(time,status,plot=TRUE)))

Model Matrix

# Formula interface with model_matrix=TRUE: the design matrix is built
# from the original (possibly factor) covariates in dataXplan.
coxgpls(~.,Y_train_micro,C_train_micro,ncomp=6,trace=TRUE,model_matrix=TRUE,dataXplan = X_train_micro_orig_df,ind.block.x=c(3,10,20))

coxgpls

# Group PLS Cox model: matrix, formula, and formula-with-data interfaces.
# ind.block.x gives the boundaries of the covariate groups.
(cox_gpls_fit=coxgpls(X_train_micro,Y_train_micro,C_train_micro,ncomp=6,ind.block.x=c(3,10,15)))

(cox_gpls_fit2=coxgpls(~X_train_micro,Y_train_micro,C_train_micro,ncomp=6,ind.block.x=c(3,10,15)))

(cox_gpls_fit3=coxgpls(~.,Y_train_micro,C_train_micro,ncomp=6,
dataXplan=X_train_micro_df,ind.block.x=c(3,10,15)))

# Clean up fitted objects.
rm(cox_gpls_fit,cox_gpls_fit2,cox_gpls_fit3)

cv.coxgpls

# Cross-validate the number of components (up to nt=10);
# seed fixed for reproducibility of the CV folds.
set.seed(123456)
(cv.coxgpls.res=cv.coxgpls(list(x=X_train_micro,time=Y_train_micro, status=C_train_micro),nt=10,ind.block.x=c(3,10,15)))

coxgplsDR

# Group PLS on deviance residuals (gplsDR): same three interfaces.
(cox_gplsDR_fit=coxgplsDR(X_train_micro,Y_train_micro,C_train_micro,ncomp=6,ind.block.x=c(3,10,15)))

(cox_gplsDR_fit2=coxgplsDR(~X_train_micro,Y_train_micro,C_train_micro,ncomp=6,ind.block.x=c(3,10,15)))

(cox_gplsDR_fit3=coxgplsDR(~.,Y_train_micro,C_train_micro,ncomp=6,
dataXplan=X_train_micro_df,ind.block.x=c(3,10,15)))

# Clean up fitted objects.
rm(cox_gplsDR_fit,cox_gplsDR_fit2,cox_gplsDR_fit3)

cv.coxgplsDR

# Cross-validate the coxgplsDR model over up to 10 components.
set.seed(123456)

# Result name fixed to match the function being cross-validated
# (was cv.coxsplsDR.res, an apparent copy-paste slip).
(cv.coxgplsDR.res=cv.coxgplsDR(list(x=X_train_micro,time=Y_train_micro, status=C_train_micro),nt=10,ind.block.x=c(3,10,15)))

coxDKgplsDR

# Direct-kernel group PLSDR Cox model, with cross-validation
# (validation="CV") inside the kernel fit; same three interfaces.
(cox_DKsplsDR_fit=coxDKgplsDR(X_train_micro,Y_train_micro,C_train_micro,ncomp=6, validation="CV",ind.block.x=c(3,10,15),verbose=TRUE))

(cox_DKsplsDR_fit=coxDKgplsDR(~X_train_micro,Y_train_micro,C_train_micro,ncomp=6, validation="CV",ind.block.x=c(3,10,15)))

(cox_DKsplsDR_fit=coxDKgplsDR(~.,Y_train_micro,C_train_micro,ncomp=6,
validation="CV",dataXplan=data.frame(X_train_micro),ind.block.x=c(3,10,15)))

# Clean up the fitted object (each call above reused the same name).
rm(cox_DKsplsDR_fit)

cv.coxDKgplsDR

# Cross-validate the direct-kernel gplsDR model over up to 10 components.
set.seed(123456)

(cv.coxDKgplsDR.res=cv.coxDKgplsDR(list(x=X_train_micro,time=Y_train_micro, status=C_train_micro),nt=10,ind.block.x=c(3,10,15)))

coxsgpls

# Sparse group PLS Cox model; alpha.x sets the per-component sparsity
# mixing parameter (one value per component, here ncomp=6).
(cox_sgpls_fit=coxsgpls(X_train_micro,Y_train_micro,C_train_micro,ncomp=6,ind.block.x=c(3,10,15), alpha.x = rep(0.95, 6)))

(cox_sgpls_fit2=coxsgpls(~X_train_micro,Y_train_micro,C_train_micro,ncomp=6,ind.block.x=c(3,10,15), alpha.x = rep(0.95, 6)))

(cox_sgpls_fit3=coxsgpls(~.,Y_train_micro,C_train_micro,ncomp=6,
dataXplan=X_train_micro_df,ind.block.x=c(3,10,15), alpha.x = rep(0.95, 6)))

# Clean up fitted objects.
rm(cox_sgpls_fit,cox_sgpls_fit2,cox_sgpls_fit3)

cv.coxsgpls

# Cross-validate coxsgpls; alpha.x now has nt=10 entries (one per
# candidate component).
set.seed(123456)
(cv.coxsgpls.res=cv.coxsgpls(list(x=X_train_micro,time=Y_train_micro, status=C_train_micro),nt=10,ind.block.x=c(3,10,15), alpha.x = rep(0.95, 10)))

coxsgplsDR

# Sparse group PLS on deviance residuals; same three interfaces.
(cox_sgplsDR_fit=coxsgplsDR(X_train_micro,Y_train_micro,C_train_micro,ncomp=6,ind.block.x=c(3,10,15), alpha.x = rep(0.95, 10)))

(cox_sgplsDR_fit2=coxsgplsDR(~X_train_micro,Y_train_micro,C_train_micro,ncomp=6,ind.block.x=c(3,10,15), alpha.x = rep(0.95, 10)))

(cox_sgplsDR_fit3=coxsgplsDR(~.,Y_train_micro,C_train_micro,ncomp=6,
                           dataXplan=X_train_micro_df,ind.block.x=c(3,10,15), alpha.x = rep(0.95, 10)))

# Clean up fitted objects.
rm(cox_sgplsDR_fit,cox_sgplsDR_fit2,cox_sgplsDR_fit3)

cv.coxsgplsDR

# Cross-validate coxsgplsDR over up to 10 components.
set.seed(4669)

(cv.coxsgplsDR.res=cv.coxsgplsDR(list(x=X_train_micro,time=Y_train_micro, status=C_train_micro),nt=10,ind.block.x=c(3,10,15), alpha.x = rep(0.95, 10)))

coxDKsgplsDR

# Direct-kernel sparse group PLSDR Cox model, with CV inside the fit.
(cox_DKsgplsDR_fit=coxDKsgplsDR(X_train_micro,Y_train_micro,C_train_micro,ncomp=6, validation="CV",ind.block.x=c(3,10,15), alpha.x = rep(0.95, 10),verbose=TRUE))

(cox_DKsgplsDR_fit=coxDKsgplsDR(~X_train_micro,Y_train_micro,C_train_micro,ncomp=6, validation="CV",ind.block.x=c(3,10,15), alpha.x = rep(0.95, 10)))

(cox_DKsgplsDR_fit=coxDKsgplsDR(~.,Y_train_micro,C_train_micro,ncomp=6,
                              validation="CV",dataXplan=data.frame(X_train_micro),ind.block.x=c(3,10,15), alpha.x = rep(0.95, 10)))

# Clean up the fitted object (each call above reused the same name).
rm(cox_DKsgplsDR_fit)

cv.coxDKsgplsDR

# Cross-validate the direct-kernel SPARSE group PLSDR model.
set.seed(123456)

# Fixed: this section follows coxDKsgplsDR and passes alpha.x, so the
# sparse variant cv.coxDKsgplsDR is the intended call (the non-sparse
# cv.coxDKgplsDR was already cross-validated earlier without alpha.x).
(cv.coxDKsgplsDR.res=cv.coxDKsgplsDR(list(x=X_train_micro,time=Y_train_micro, status=C_train_micro),nt=10,ind.block.x=c(3,10,15), alpha.x = rep(0.95, 10)))

coxspls_sgpls

# sPLS (mixOmics-style) Cox model keeping 5 variables per component.
(cox_coxspls_sgpls_fit=coxspls_sgpls(X_train_micro,Y_train_micro,C_train_micro,ncomp=6,keepX = rep(5, 6)))

# Formula interface. Fixed: the argument is keepX (was mistyped
# ind.keepX), consistent with the other two calls in this section and
# with the cv.coxspls_sgpls call below.
(cox_coxspls_sgpls_fit2=coxspls_sgpls(~X_train_micro,Y_train_micro,C_train_micro,ncomp=6,keepX = rep(5, 6)))

# Formula-with-data interface.
(cox_coxspls_sgpls_fit3=coxspls_sgpls(~.,Y_train_micro,C_train_micro,ncomp=6,
dataXplan=X_train_micro_df,keepX = rep(5, 6)))

# Clean up fitted objects.
rm(cox_coxspls_sgpls_fit,cox_coxspls_sgpls_fit2,cox_coxspls_sgpls_fit3)

cv.coxspls_sgpls

# Cross-validate coxspls_sgpls; keepX now has nt=10 entries.
set.seed(123456)
(cv.coxspls_sgpls.res=cv.coxspls_sgpls(list(x=X_train_micro,time=Y_train_micro, status=C_train_micro),nt=10, keepX = rep(5, 10)))

coxspls_sgplsDR

# sPLS on deviance residuals; same three interfaces.
(cox_spls_sgplsDR_fit=coxspls_sgplsDR(X_train_micro,Y_train_micro,C_train_micro,ncomp=6,ind.block.x=c(3,10,15)))

(cox_spls_sgplsDR_fit2=coxspls_sgplsDR(~X_train_micro,Y_train_micro,C_train_micro,ncomp=6,ind.block.x=c(3,10,15)))

(cox_spls_sgplsDR_fit3=coxspls_sgplsDR(~.,Y_train_micro,C_train_micro,ncomp=6,
                           dataXplan=X_train_micro_df,ind.block.x=c(3,10,15)))

# Clean up fitted objects.
rm(cox_spls_sgplsDR_fit,cox_spls_sgplsDR_fit2,cox_spls_sgplsDR_fit3)

cv.coxspls_sgplsDR

# Cross-validate coxspls_sgplsDR over up to 10 components.
set.seed(123456)

(cv.coxspls_sgplsDR.res=cv.coxspls_sgplsDR(list(x=X_train_micro,time=Y_train_micro, status=C_train_micro),nt=10,ind.block.x=c(3,10,15)))

coxDKspls_sgplsDR

# Direct-kernel sPLSDR Cox model, with CV inside the fit.
(cox_DKspls_sgplsDR_fit=coxDKspls_sgplsDR(X_train_micro,Y_train_micro,C_train_micro,ncomp=6, validation="CV",ind.block.x=c(3,10,15),verbose=TRUE))

(cox_DKspls_sgplsDR_fit=coxDKspls_sgplsDR(~X_train_micro,Y_train_micro,C_train_micro,ncomp=6, validation="CV",ind.block.x=c(3,10,15)))

(cox_DKspls_sgplsDR_fit=coxDKspls_sgplsDR(~.,Y_train_micro,C_train_micro,ncomp=6,
                              validation="CV",dataXplan=data.frame(X_train_micro),ind.block.x=c(3,10,15)))

# Clean up the fitted object (each call above reused the same name).
rm(cox_DKspls_sgplsDR_fit)

cv.coxDKspls_sgplsDR

# Cross-validate the direct-kernel sPLSDR model over up to 10 components.
set.seed(123456)

(cv.coxDKspls_sgplsDR.res=cv.coxDKspls_sgplsDR(list(x=X_train_micro,time=Y_train_micro, status=C_train_micro),nt=10,ind.block.x=c(3,10,15)))
# Compute column means/sds of the survival design, in parallel on 2 cores.
data(survData, package="bigSurvSGD")
resultsBigscale <- bigscale(formula=Surv(time, status)~.,data=survData, parallel.flag=TRUE, num.cores=2)
resultsBigscale

# Simulated survival data to be read off the memory
data(survData) # a dataset with 1000 observations (rows) and 10 features (columns)
# Write the data to a temp csv so it can be read chunk-by-chunk off-memory.
write.csv(survData, file.path(tempdir(), "bigSurvData.csv"), row.names = FALSE) 
dataPath <- file.path(tempdir(), "bigSurvData.csv") # path to where data is
# bigmemory.flag=TRUE: stream the file instead of loading it into RAM.
resultsBigscaleOffMemory <- bigscale(formula=Surv(time, status)~., data=dataPath, 
bigmemory.flag=TRUE, parallel.flag=TRUE, num.cores=2)
resultsBigscaleOffMemory


# Simulated sparse survival data
data(sparseSurvData,package="bigSurvSGD") # a sparse data with 100 observations (rows) and 150 features (columns)
# Scaling also works on the sparse representation.
resultsBigscaleSparse <- bigscale(formula=Surv(time, status)~.,data=sparseSurvData, parallel.flag=TRUE, num.cores=2)
resultsBigscaleSparse



# Simulated survival data - just estimation and no confidence interval
data(survData) # a dataset with 1000 observations (rows) and 10 features (columns)
# inference.method="none" skips confidence intervals (much faster);
# the precomputed means/sds from bigscale are passed in for standardization.
resultsBig <- bigSurvSGD::bigSurvSGD(formula=Surv(time, status)~.,data=survData, inference.method="none",
parallel.flag=TRUE, num.cores=2, features.mean = resultsBigscale$features.mean, features.sd = resultsBigscale$features.sd)
(resultsBig$coef)

# Simulated survival data to be read off the memory
data(survData) # a dataset with 1000 observations (rows) and 10 features (columns)
# Write the data to a temp csv so it can be read chunk-by-chunk off-memory.
write.csv(survData, file.path(tempdir(), "bigSurvData.csv"), row.names = FALSE) 
dataPath <- file.path(tempdir(), "bigSurvData.csv") # path to where data is
# Fit from the file path with bigmemory backing; reuses the bigscale stats.
resultsBigOffMemory <- bigSurvSGD::bigSurvSGD(formula=Surv(time, status)~., data=dataPath, 
bigmemory.flag=TRUE, parallel.flag=TRUE, num.cores=2, features.mean = resultsBigscale$features.mean, features.sd = resultsBigscale$features.sd)  #much faster without tests, inference.method="none")
(resultsBigOffMemory$coef)


# Simulated sparse survival data
data(sparseSurvData) # a sparse data with 100 observations (rows) and 150 features (columns)
# Elastic-net penalised fit (mixing alpha, penalty lambda), reusing the
# scaling statistics computed by bigscale above.
resultsBigSparse <- bigSurvSGD::bigSurvSGD(formula=Surv(time, status)~.,data=sparseSurvData, 
alpha=0.9, lambda=0.1, features.mean = resultsBigscale$features.mean, features.sd = resultsBigscale$features.sd)
(resultsBigSparse$coef)
# Disabled steps that originally produced the NA-containing test file:
#data(survData, package = "bigSurvSGD")
#survData[2,3] <- NA
#write.csv(survData, "~/Documents/GitHub/bigPLS/bigSurvData.csv", row.names = FALSE) 
# NOTE(review): machine-specific path — only works on the authors' setup.
datapath0_NA = path.expand("~/Documents/GitHub/bigPLS/add_data/bigSurvData_NA.csv")

library(bigPLS)
# Disabled: off-memory scaling of the NA-containing file.
if(FALSE){
resultsBigscale <- bigscale(formula=Surv(time, status)~., data=datapath0_NA,bigmemory.flag=TRUE, parallel.flag=TRUE, num.cores=2)
resultsBigscale
}

# First PLS step -> compute weights

# Read only the header rows to get covariate names (drop time and status).
ind.col=1
name.col.all <-(colnames(read.csv(datapath0_NA,nrows=2,header=TRUE))[-c(1,2)])
name.col0 = name.col.all[ind.col]
# Partial fit for a single covariate column.
# NOTE(review): resultsBigscale is only assigned inside the if(FALSE)
# block above — this call needs it already present in the workspace; confirm.
partialbigSurvSGDv0(datapath = datapath0_NA, resBigscale=resultsBigscale, name.col=name.col0)

# Need to filter for missing values pairwise

# Partial fits for every covariate, without and with bigmemory backing.
simplify2array(lapply(name.col.all,partialbigSurvSGDv0, datapath=datapath0_NA, resBigscale=resultsBigscale, bigmemory.flag=FALSE))
simplify2array(lapply(name.col.all,partialbigSurvSGDv0, datapath=datapath0_NA, resBigscale=resultsBigscale, bigmemory.flag=TRUE))

#install.packages("bigalgebra")
#library("bigalgebra")

# NOTE(review): machine-specific path — only works on the authors' setup.
datapath0 = path.expand("~/Documents/GitHub/bigPLS/add_data/bigSurvData.csv")

library(bigPLS)

# NOTE(review): debug() left enabled here — it drops into the browser on
# every bigplsRcox fit below; remove before publishing this example.
debug(bigPLS:::bigplsRcoxmodel.default)



# Fit the big-data PLS Cox model directly from the csv file path.
bigPLS::bigplsRcox(formula=Surv(time, status)~.,data=datapath0,verbose=TRUE)

# Same fit, with an explicit file-backed big.matrix (backing + descriptor files).
bigPLS::bigplsRcox(formula=Surv(time, status)~.,data=datapath0, backingfile="temp_plsRcox_file2.bin", backingpath=path.expand("~/Documents/GitHub/bigPLS/add_data/"),  descriptorfile="temp_plsRcox_file2.desc",verbose=TRUE)

# Re-attach an existing file-backed matrix via its descriptor (data=NULL).
bigPLS::bigplsRcox(formula=Surv(time, status)~.,data=NULL,descriptorfile="temp_plsRcox_file.desc", backingpath=path.expand("~/Documents/GitHub/bigPLS/add_data/"),verbose=TRUE)

# Reference (removing constant columns from a big.matrix):
# https://stackoverflow.com/questions/49959260/removing-columns-from-big-matrix-which-have-only-one-value

# Simulated survival data - just estimation and no confidence interval
data(survData) # a dataset with 1000 observations (rows) and 10 features (columns)


# NOTE(review): duplicate of the earlier bigSurvSGD estimation example;
# relies on resultsBigscale computed further above.
resultsBig <- bigSurvSGD::bigSurvSGD(formula=Surv(time, status)~.,data=survData, inference.method="none",
parallel.flag=TRUE, num.cores=2, features.mean = resultsBigscale$features.mean, features.sd = resultsBigscale$features.sd)
(resultsBig$coef)
#install.packages("bigstatsr")
library(bigstatsr)
library(bigassertr)
library(bigparallelr)

# Attach the example Filebacked Big Matrix shipped with bigstatsr.
X <- big_attachExtdata()

# No scaling
big_noscale <- big_scale(center = FALSE, scale = FALSE)
class(big_noscale) # big_scale returns a new function
str(big_noscale(X))
big_noscale2 <- big_scale(center = FALSE)
str(big_noscale2(X)) # you can't scale without centering

# Centering
big_center <- big_scale(scale = FALSE)
str(big_center(X))
str(big_scale()(X))

# Dimensions and row/column index helpers for FBMs.
dim(X)
rows_along(X)
cols_along(X)

seq_range(c(3, 10))

# Re-attach a fresh copy of the example matrix.
X <- big_attachExtdata()

# Run the partial survival-SGD step over column blocks on 2 cores,
# row-binding the per-block results.
test <- big_parallelize(X, p.FUN = partialbigSurvSGD, p.combine = 'rbind', ncores = 2)

TODO

bigPLS

PLS regression (linear) or generalized linear regression (glm) for big data

Cox

plsRcox

regular PLS

glm PLS

imputePLS

Final fit of the model.

big data Cox (via SGD)

bigSurvSGD

SGD cox

coxphSGD

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.