Sorry for bothering again!
I've found that the CAL dataset in the MetaGxBreast package differs from the data published at https://www.ebi.ac.uk/arrayexpress/experiments/E-TABM-158/
I noticed that the concordance index values of the gene signatures available in genefu were far from satisfactory on this dataset. After some investigation I found that there is a mismatch between the clinical and expression data of the original dataset and those in MetaGx.
Below is some code to reproduce these findings:
# Reproduce the mismatch between the MetaGxBreast "CAL" expression set and the
# processed ArrayExpress submission E-TABM-158.
library(MetaGxBreast)
library(ArrayExpress)

# ---- MetaGxBreast side ----
esets2 <- loadBreastEsets(loadString = c("CAL", "MSK"))
CAL <- esets2$esets[["CAL"]]

# ---- ArrayExpress side ----
accession <- "E-TABM-158"
# Single place to change the download directory (was hard-coded twice).
mtab_dir <- "/home/mguerrero/Genetic_alg/Data_sets/MTAB"
MTAB <- getAE(accession, path = mtab_dir, type = "processed")
# The result of getAE() is replaced by an explicit description of the files in
# `mtab_dir`, so the rest of the script uses exactly these file names.
MTAB <- list(
  path = mtab_dir,
  rawFiles = NULL,
  rawArchive = NULL,
  processedFiles = "breastTumorExpression.txt",
  processedArchive = "E-TABM-158.processed.1.zip",
  sdrf = "E-TABM-158.sdrf.txt",
  idf = "E-TABM-158.idf.txt",
  adf = "A-AFFY-76.adf.txt"
)

expr_file <- file.path(MTAB$path, MTAB$processedFiles)
sdrf_file <- file.path(MTAB$path, MTAB$sdrf)

# The first line of the processed file supplies the column names; only that
# line is read here. The table itself is read skipping the first two lines.
MTABnames <- strsplit(readLines(expr_file, n = 1L), "\t")[[1]]
MTABset <- read.table(
  expr_file,
  sep = "\t",
  skip = 2,
  col.names = MTABnames,
  row.names = 1
)
sdrf <- read.table(
  sdrf_file,
  sep = "\t",
  header = TRUE,
  row.names = 1,
  comment.char = ""
)

# All the colnames of the CAL expression set are present in the
# "Array.Data.File" column of the MTAB sdrf object.
# The dot is escaped so that only a literal ".CEL" (any case) is stripped.
sdrf$genefu.name <- gsub(
  "\\.CEL",
  "",
  paste("CAL", sdrf$Array.Data.File, sep = "_"),
  ignore.case = TRUE
)
all(colnames(exprs(CAL)) %in% sdrf$genefu.name)
#TRUE

# Nevertheless, the MTAB expression matrix does not have all the samples
# available in the clinical metadata, and its colnames correspond to the
# Scan.Name column of the sdrf object.
ncol(MTABset)
#118
nrow(sdrf)
#130
all(colnames(MTABset) %in% sdrf$Scan.Name)
#TRUE

# Mapping the colnames of the CAL expression set to their Scan.Name and
# comparing with the colnames of MTAB shows that they do not match completely,
# which would mean that the expression matrix of the CAL MetaGx dataset is
# mislabeled.
ScanNameEset <- sdrf[match(colnames(exprs(CAL)), sdrf$genefu.name), "Scan.Name"]
table(colnames(MTABset) %in% ScanNameEset)

# Finally, it is important to notice that pData from the CAL eset does not
# match the data in the MTAB sdrf file either.
# Reorder/subset sdrf so that its rows line up with the CAL samples.
sdrf <- sdrf[match(colnames(exprs(CAL)), sdrf$genefu.name), ]
identical(colnames(exprs(CAL)), sdrf$genefu.name)
#TRUE
table(pData(CAL)$er, sdrf$Characteristics..EstrogenReceptorStatus.)
cor(
  pData(CAL)$age_at_initial_pathologic_diagnosis,
  as.numeric(as.character(sdrf$Characteristics..age.at.diagnosis.)),
  use = "pairwise.complete.obs"
)
#-0.08980216
Hope you understand what I did!
Thanks again for all your work and effort in bringing all this data closer to the users; it has been really useful!
Best!
Martin
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.5 LTS
Matrix products: default
BLAS: /usr/lib/libblas/libblas.so.3.6.0
LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats4 grid parallel stats graphics grDevices utils datasets methods
[10] base
other attached packages:
[1] ArrayExpress_1.42.0 bindrcpp_0.2.2 GEOquery_2.50.5 MetaGxBreast_1.2.0
[5] ExperimentHub_1.8.0 AnnotationHub_2.14.2 impute_1.56.0 lattice_0.20-38
[9] AnnotationDbi_1.44.0 IRanges_2.16.0 S4Vectors_0.20.1 illuminaio_0.24.0
[13] genefu_2.14.0 AIMS_1.14.1 Biobase_2.42.0 BiocGenerics_0.28.0
[17] e1071_1.7-0 iC10_1.4.2 iC10TrainingData_1.3.1 pamr_1.55
[21] biomaRt_2.38.0 limma_3.38.3 mclust_5.4.2 survcomp_1.32.0
[25] prodlim_2018.04.18 gplots_3.0.1 cba_0.2-19 proxy_0.4-22
[29] doParallel_1.0.14 iterators_1.0.10 foreach_1.4.4 gpuR_2.0.0
[33] survival_2.43-3 cluster_2.0.7-1
loaded via a namespace (and not attached):
[1] amap_0.8-16 assertive.base_0.0-7 class_7.3-15
[4] XVector_0.22.0 GenomicRanges_1.34.0 base64_2.0
[7] affyio_1.52.0 assertive.sets_0.0-3 bit64_0.9-7
[10] interactiveDisplayBase_1.20.0 xml2_1.2.0 oligoClasses_1.44.0
[13] assertive.data.uk_0.0-2 codetools_0.2-16 splines_3.5.2
[16] knitr_1.21 SuppDists_1.1-9.4 assertive_0.3-5
[19] assertive.data.us_0.0-2 shiny_1.2.0 BiocManager_1.30.4
[22] readr_1.3.1 compiler_3.5.2 httr_1.4.0
[25] assertthat_0.2.0 Matrix_1.2-15 later_0.7.5
[28] htmltools_0.3.6 prettyunits_1.0.2 tools_3.5.2
[31] GenomeInfoDbData_1.2.0 glue_1.3.0 affxparser_1.54.0
[34] dplyr_0.7.8 Rcpp_1.0.0 Biostrings_2.50.2
[37] preprocessCore_1.44.0 gdata_2.18.0 assertive.files_0.0-2
[40] assertive.datetimes_0.0-2 assertive.models_0.0-2 xfun_0.4
[43] stringr_1.3.1 mime_0.6 gtools_3.8.1
[46] XML_3.98-1.16 zlibbioc_1.28.0 hms_0.4.2
[49] promises_1.0.1 SummarizedExperiment_1.12.0 assertive.matrices_0.0-2
[52] assertive.strings_0.0-3 oligo_1.46.0 curl_3.2
[55] yaml_2.2.0 memoise_1.1.0 stringi_1.2.4
[58] RSQLite_2.1.1 rmeta_3.0 caTools_1.17.1.1
[61] BiocParallel_1.16.5 lava_1.6.4 GenomeInfoDb_1.18.1
[64] matrixStats_0.54.0 rlang_0.3.1 pkgconfig_2.0.2
[67] bitops_1.0-6 assertive.data_0.0-3 purrr_0.2.5
[70] bindr_0.1.1 assertive.properties_0.0-4 survivalROC_1.0.3
[73] bit_1.1-14 tidyselect_0.2.5 assertive.code_0.0-3
[76] magrittr_1.5 R6_2.3.0 bootstrap_2017.2
[79] DelayedArray_0.8.0 DBI_1.0.0 pillar_1.3.1
[82] assertive.numbers_0.0-2 RCurl_1.95-4.11 tibble_2.0.0
[85] crayon_1.3.4 assertive.types_0.0-3 KernSmooth_2.23-15
[88] progress_1.2.0 blob_1.1.1 digest_0.6.18
[91] xtable_1.8-3 ff_2.2-14 tidyr_0.8.2
[94] httpuv_1.4.5.1 openssl_1.1 assertive.reflection_0.0-4