#' N-of-1-pathways: unveiling personal deregulated mechanisms from a single paired sample #' #' \tabular{ll}{ #' Package: \tab Nof1Pathways\cr #' Type: \tab Package\cr #' Version: \tab 1.0\cr #' Date: \tab 2014-10-08\cr #' License: \tab GPL (>= 2)\cr #' LazyLoad: \tab yes\cr #' } #' #' Transform single paired gene expression data sample (microarray or RNA-seq) into a list of deregulated pathways #' #' \code{\link{n.of.1.pathways}} is the main application entry point. #' #' @name Nof1Pathwayss-package #' @aliases Nof1Pathways #' @docType package #' @title N-of-1-pathways: unveiling personal deregulated mechanisms from a single paired sample #' @author Lussier Lab \email{research@gardeux-vincent.eu} #' @references #' @keywords package #' @seealso \code{\link{n.of.1.pathways}} #' #' #' N-of-1-pathways module #2 (computes pvalues according to the chosen model between two paired samples, for a given pathway) #' #' @param pathway Name (or Id) of the pathway to analyze #' @param model The name of the model to use ('wilcoxon', 'distance' or 'enrichment') #' @param pathways.set A vector containing the list of HGNC gene names that constitutes the given 'pathway' #' @param row.genes.names List HGNC gene names matching the genes.1 and genes.2 rows expression values #' @param genes.1 A vector (paired with genes.2) containing all gene expressions #' @param genes.2 A vector (paired with genes.1) containing all gene expressions #' @param maxGenes Maximum size of the pathways allowed #' @param minGenes Minimum size of the pathways allowed #' @param is.exact A boolean stating if the computed p-value should be exact or not #' @param nb.repeat Number of permutation resampling for the 'distance' model #' @param genes.deg List of Deregulated genes (for the 'enrichment' model) #' @return A vector containing Wilcoxon p-value, W+, W-, number of genes originally in the pathway, and number of genes actually found in the pathway (given genes.1 and genes.2 genes names) #' compute.scores.pathway <- function(pathway, model, pathways.set, row.genes.names, genes.1, genes.2, maxGenes, minGenes, is.exact, nb.repeat, genes.deg) { genes <- unique(pathways.set[pathways.set$path_id == pathway, ]$symbol) # Genes contained in each pathway nbGenes <- length(which(genes %in% row.genes.names)) if(nbGenes >= minGenes & nbGenes <= maxGenes) # Consider only those pathways { if(model == "mixenrich") { obj = fisher.test(matrix(c(length(which(genes %in% genes.deg)), nbGenes, length(genes.deg), length(unique(row.genes.names))),nrow=2)) return(c(obj$p.value, 0, 0, length(genes), nbGenes)) } v1 <- c() v2 <- c() for(j in 1:length(genes)) { index <- which(row.genes.names==genes[j]) if(length(index) > 0) { v1 <- c(v1, max(genes.1[index])) # Take the max of all the gene with same Id v2 <- c(v2, max(genes.2[index])) # Take the max of all the gene with same Id } } if(model == "wilcoxon") { obj <- try(wilcox.test(v1, v2, paired=TRUE, exact=is.exact), silent=TRUE) if (is(obj, "try-error")) return(c(1, NA, NA, length(genes), length(v1))) return(c(obj$p.value, obj$statistic, wilcox.test(v2, v1, paired=TRUE, exact=is.exact)$statistic, length(genes), length(v1))) } if(model == "vectorial") { fold.change = genes.2 - genes.1 fc = v2 - v1 distance.original = dist(rbind(fc,rep(0,length(fc))))[1] count.null = 0 for(i in 1:nb.repeat) { fc.rand = sample(fold.change,length(fc)) distance.rand = dist(rbind(fc.rand,rep(0,length(fc.rand))))[1] if(distance.rand>distance.original) count.null = count.null + 1 } return(c(count.null/nb.repeat, distance.original, NA, length(genes), length(v1))) } if(model == "mahdist") { n = length(v1) distance.original = compute.mean.vert.dist(v1, v2) count.null = 0 for (i in 1:nb.repeat) { indexes = sample(1:n,n,replace=T) distance.rand = compute.mean.vert.dist(v1[indexes], v2[indexes]) if(distance.rand >= 0) count.null = count.null + 1 } gamma <- count.null/nb.repeat return(c(min(gamma,1-gamma), distance.original, NA, length(genes), length(v1))) } } return(c(NA,NA,NA,NA,NA)) } #' Mahalobis Distance calculation for the mahdist model #' #' @param normal A vector (paired with "case") containing normal gene expressions #' @param case A vector (paired with "normal") containing case gene expressions #' @return The mean of all the genes' mahalobis distance #' compute.mean.vert.dist <- function(normal, case) { mean(sqrt((var(normal)/((var(normal)*var(case))-cov(normal,case)^2)))*(case-normal)) } #' N-of-1-pathways module #1 (call the module #2 for each pathway found in the pathway list) #' #' @param pathways.set A vector containing the list of HGNC gene names that constitutes the given 'pathway' #' @param genes.1 A vector (paired with genes.2) containing all gene expressions #' @param genes.2 A vector (paired with genes.1) containing all gene expressions #' @param row.genes.names List HGNC gene names matching the genes.1 and genes.2 rows expression values #' @param model The name of the model to use ('wilcoxon', 'distance' or 'enrichment') #' @param maxGenes Maximum size of the pathways allowed #' @param minGenes Minimum size of the pathways allowed #' @param parallelize A boolean stating if the computation should be parallelized or not, using snowfall package (default = FALSE) #' @param data.description A data.frame containing two columns: path_id and description. Can be NULL or loaded from files in ./Genesets/ (default = NULL) #' @param is.exact A boolean stating if the computed p-value should be exact or not #' @param nb.repeat Number of permutation resampling for the 'distance' model (Default = 1000) #' @param p.dereg.cutoff Probability cutoff for the 'enrichment' model (Default = 0.05) #' @return A vector containing p-value, W+, W-, number of genes originally in the pathway, and number of genes actually found in the pathway (given genes.1 and genes.2 genes names) #' compute.scores.module <- function(pathways.set, genes.1, genes.2, row.genes.names, model, maxGenes, minGenes, parallelize=FALSE, data.description=NULL, is.exact, nb.repeat, p.dereg.cutoff) { genes.deg = NULL if(model == "mixenrich") { require(mixtools) abs.logFC = abs(genes.2-genes.1) # find the absolute log fold change mix.model = normalmixEM(abs.logFC,lambda=.5,mu=c(.5,3),sigma=c(1,1)) # function to find the probability of each gene belonging to deregulated class f = function(x,mix) { (mix$lambda[1]*(dnorm(x,mean=mix$mu[1],sd=mix$sigma[1])))/(mix$lambda[1]*(dnorm(x,mean=mix$mu[1],sd=mix$sigma[1]))+mix$lambda[2]*dnorm(x,mean=mix$mu[2],sd=mix$sigma[2])) } p.dereg=f(abs.logFC,mix.model) # probability of every gene genes.deg = unique(row.genes.names[p.dereg<=p.dereg.cutoff]) # find the genes that we strongly believe to be deregulated cat(paste("Found",length(genes.deg),"deregulated genes. ( Confidence =",p.dereg.cutoff,")\n")) } pathways <- as.character(unique(pathways.set$path_id)) res = c() if(parallelize) { sfExport("pathways") sfExport("pathways.set") sfExport("minGenes") sfExport("maxGenes") sfExport("model") sfExport("nb.repeat") sfExport("genes.deg") res = sfSapply(pathways, function(x) compute.scores.pathway(x, model, pathways.set, row.genes.names, genes.1, genes.2, maxGenes, minGenes, is.exact, nb.repeat, genes.deg)) } if(!parallelize) res = sapply(pathways, function(x) compute.scores.pathway(x, model, pathways.set, row.genes.names, genes.1, genes.2, maxGenes, minGenes, is.exact, nb.repeat, genes.deg)) res.pvalue <- res[1,][names(which(!is.na(res[2,])))] res.pvalue <- sort(res.pvalue) if(model == "mixenrich") { if(is.null(data.description)) return(data.frame(p.value=res.pvalue, initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)])) return(data.frame(p.value=res.pvalue, description=as.character(data.description[match(names(res.pvalue), data.description$path_id),]$description), initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)])) } if(model == "wilcoxon") { ranks.pos <- res[2,][names(res.pvalue)] ranks.neg <- res[3,][names(res.pvalue)] up_down <- sapply(ranks.pos - ranks.neg, function(x) if(x < 0) return("+") else return("-")) if(is.null(data.description)) return(data.frame(p.value=res.pvalue, ranks.pos=ranks.pos, ranks.neg=ranks.neg, up_down=up_down, initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)])) return(data.frame(p.value=res.pvalue, description=as.character(data.description[match(names(res.pvalue), data.description$path_id),]$description), ranks.pos=ranks.pos, ranks.neg=ranks.neg, up_down=up_down, initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)])) } if(model == "vectorial" | model == "mahdist") { if(is.null(data.description)) return(data.frame(p.value=res.pvalue, distance=res[2,][names(res.pvalue)], initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)])) return(data.frame(p.value=res.pvalue, description=as.character(data.description[match(names(res.pvalue), data.description$path_id),]$description), distance=res[2,][names(res.pvalue)], initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)])) } } #' N-of-1-pathways main function (computes the p-values between two paired samples, for a given list of pathways) #' #' @param row.genes.names List HGNC gene names matching the genes.1 and genes.2 rows expression values #' @param genes.1 A vector (paired with genes.2) containing all gene expressions #' @param genes.2 A vector (paired with genes.1) containing all gene expressions #' @param model The name of the model to use ('wilcoxon', 'vectorial', 'mixenrich', or 'mahdist') #' @param maxSizePathway Maximum size of the pathways allowed (default = 500) #' @param minSizePathway Minimum size of the pathways allowed (default = 15) #' @param parallelize A boolean stating if the computation should be parallelized or not, using snowfall package (default = FALSE) #' @param pathways.set The list of pathways to work with (Can be loaded from files in ./Genesets/) #' @param data.description A data.frame containing two columns: path_id and description. Can be NULL or loaded from files in ./Genesets/ (default = NULL) #' @param is.exact A boolean stating if the computed p-value should be exact or not #' @param seed.value Seed for the random generator (used for the 'distance' and 'enrichment' models only; Defaut = 123) #' @param nb.repeat Number of permutation resampling for the 'distance' model (Default = 1000) #' @param p.dereg.cutoff Probability cutoff for the 'enrichment' model (Default = 0.05) #' @return A data.frame containing the list of deregulated pathways, sorted by p-value n.of.1.pathways <- function(row.genes.names, model, genes.1, genes.2, maxSizePathway=500, minSizePathway=15, parallelize=FALSE, pathways.set, data.description=NULL, is.exact=FALSE, seed.value=123, nb.repeat=1000, p.dereg.cutoff=0.05) { set.seed(seed.value) if(!class(genes.1)=="numeric" | !class(genes.2)=="numeric" | !class(row.genes.names)=="character") cat("Error: genes.1 and genes.2 must be \"numeric\" vectors. row.genes.names must be \"character\" vector. \n Try using d[[\"colname\"]] instead of { d[\"colname\"] or d$colname } if you are extracting columns from a dataframe.\n") else { cat("Note: The data is supposed to be log-transformed. If not, you should transform it before running N-of-1-pathways.\n\n") cat("Running N-of-1-pathways framework with model =", model, "\n") if(parallelize) { require(parallel) # Used for detecting number of available cores require(snowfall) # Used for parallelizing apply functions # Initialize parallel calculation sfInit(parallel=TRUE, cpus=detectCores()) sfExport("compute.scores.module") sfExport("compute.scores.pathway") sfExport("compute.mean.vert.dist") } # Filtering cat("Filtering out unkown genes / NA values...\n") indexes <- which(row.genes.names!="---" & !is.na(row.genes.names) & !is.na(genes.1) & !is.na(genes.2)) # Filter unkown genes / NA values genes.1 <- genes.1[indexes] genes.2 <- genes.2[indexes] row.genes.names <- toupper(row.genes.names[indexes]) if(parallelize) { sfExport("row.genes.names") sfExport("genes.1") sfExport("genes.2") } # Compute the scores for each Pathway cat("Computing the scores for each pathway...\n") res <- compute.scores.module(pathways.set, genes.1, genes.2, row.genes.names, model, maxGenes=maxSizePathway, minGenes=minSizePathway, parallelize=parallelize, data.description=data.description, is.exact=is.exact, nb.repeat=nb.repeat, p.dereg.cutoff=p.dereg.cutoff) cat("Computation Done!\n") if(parallelize) sfStop() return(res) } }