#' N-of-1-pathways: unveiling personal deregulated mechanisms from a single paired sample
#'
#' \tabular{ll}{
#' Package: \tab Nof1Pathways\cr
#' Type: \tab Package\cr
#' Version: \tab 1.0\cr
#' Date: \tab 2014-10-08\cr
#' License: \tab GPL (>= 2)\cr
#' LazyLoad: \tab yes\cr
#' }
#'
#' Transform single paired gene expression data sample (microarray or RNA-seq) into a list of deregulated pathways
#'
#' \code{\link{n.of.1.pathways}} is the main application entry point.
#'
#' @name Nof1Pathwayss-package
#' @aliases Nof1Pathways
#' @docType package
#' @title N-of-1-pathways: unveiling personal deregulated mechanisms from a single paired sample
#' @author Lussier Lab \email{research@gardeux-vincent.eu}
#' @references
#' @keywords package
#' @seealso \code{\link{n.of.1.pathways}}
#'

#'
#' N-of-1-pathways module #2 (computes pvalues according to the chosen model between two paired samples, for a given pathway)
#'
#' @param pathway Name (or Id) of the pathway to analyze
#' @param model The name of the model to use ('wilcoxon', 'distance' or 'enrichment')
#' @param pathways.set A vector containing the list of HGNC gene names that constitutes the given 'pathway'
#' @param row.genes.names List HGNC gene names matching the genes.1 and genes.2 rows expression values
#' @param genes.1 A vector (paired with genes.2) containing all gene expressions
#' @param genes.2 A vector (paired with genes.1) containing all gene expressions
#' @param maxGenes Maximum size of the pathways allowed
#' @param minGenes Minimum size of the pathways allowed
#' @param is.exact A boolean stating if the computed p-value should be exact or not
#' @param nb.repeat Number of permutation resampling for the 'distance' model
#' @param genes.deg List of Deregulated genes (for the 'enrichment' model)
#' @return A vector containing Wilcoxon p-value, W+, W-, number of genes originally in the pathway, and number of genes actually found in the pathway (given genes.1 and genes.2 genes names)
#' 
compute.scores.pathway <- function(pathway, model, pathways.set, row.genes.names, genes.1, genes.2, maxGenes, minGenes, is.exact, nb.repeat, genes.deg)
{
	genes <- unique(pathways.set[pathways.set$path_id == pathway, ]$symbol) # Genes contained in each pathway
	nbGenes <- length(which(genes %in% row.genes.names))
	if(nbGenes >= minGenes & nbGenes <= maxGenes) # Consider only those pathways
	{
		if(model == "mixenrich") 
		{
			obj = fisher.test(matrix(c(length(which(genes %in% genes.deg)), nbGenes, length(genes.deg), length(unique(row.genes.names))),nrow=2))
			return(c(obj$p.value, 0, 0, length(genes), nbGenes))
		}
		v1 <- c()
		v2 <- c()
		for(j in 1:length(genes))
		{
			index <- which(row.genes.names==genes[j])
			if(length(index) > 0)
			{
				v1 <- c(v1, max(genes.1[index])) # Take the max of all the gene with same Id
				v2 <- c(v2, max(genes.2[index])) # Take the max of all the gene with same Id
			}
		}
		if(model == "wilcoxon") 
		{
			obj <- try(wilcox.test(v1, v2, paired=TRUE, exact=is.exact), silent=TRUE)
			if (is(obj, "try-error")) return(c(1, NA, NA, length(genes), length(v1)))
			return(c(obj$p.value, obj$statistic, wilcox.test(v2, v1, paired=TRUE, exact=is.exact)$statistic, length(genes), length(v1)))
		}
		if(model == "vectorial")
		{
			fold.change = genes.2 - genes.1
			fc = v2 - v1
			distance.original = dist(rbind(fc,rep(0,length(fc))))[1]
			count.null = 0
			for(i in 1:nb.repeat)
			{
				fc.rand = sample(fold.change,length(fc))
				distance.rand = dist(rbind(fc.rand,rep(0,length(fc.rand))))[1]
				if(distance.rand>distance.original) count.null = count.null + 1
			}
			return(c(count.null/nb.repeat, distance.original, NA, length(genes), length(v1)))
		}
		if(model == "mahdist")
		{
      n = length(v1)
		  distance.original = compute.mean.vert.dist(v1, v2)
		  count.null = 0
		  for (i in 1:nb.repeat)
      {
		      indexes = sample(1:n,n,replace=T)
		      distance.rand = compute.mean.vert.dist(v1[indexes], v2[indexes])
          if(distance.rand >= 0) count.null = count.null + 1
		  }
		  gamma <- count.null/nb.repeat
		  return(c(min(gamma,1-gamma), distance.original, NA, length(genes), length(v1)))
		}
	}
	return(c(NA,NA,NA,NA,NA))
}

#' Mahalobis Distance calculation for the mahdist model
#'
#' @param normal A vector (paired with "case") containing normal gene expressions
#' @param case A vector (paired with "normal") containing case gene expressions
#' @return The mean of all the genes' mahalobis distance
#' 
compute.mean.vert.dist <- function(normal, case)
{
  mean(sqrt((var(normal)/((var(normal)*var(case))-cov(normal,case)^2)))*(case-normal))
}

#' N-of-1-pathways module #1 (call the module #2 for each pathway found in the pathway list)
#'
#' @param pathways.set A vector containing the list of HGNC gene names that constitutes the given 'pathway'
#' @param genes.1 A vector (paired with genes.2) containing all gene expressions
#' @param genes.2 A vector (paired with genes.1) containing all gene expressions
#' @param row.genes.names List HGNC gene names matching the genes.1 and genes.2 rows expression values
#' @param model The name of the model to use ('wilcoxon', 'distance' or 'enrichment')
#' @param maxGenes Maximum size of the pathways allowed
#' @param minGenes Minimum size of the pathways allowed
#' @param parallelize A boolean stating if the computation should be parallelized or not, using snowfall package (default = FALSE)
#' @param data.description A data.frame containing two columns: path_id and description. Can be NULL or loaded from files in ./Genesets/ (default = NULL)
#' @param is.exact A boolean stating if the computed p-value should be exact or not
#' @param nb.repeat Number of permutation resampling for the 'distance' model (Default = 1000)
#' @param p.dereg.cutoff Probability cutoff for the 'enrichment' model (Default = 0.05)
#' @return A vector containing p-value, W+, W-, number of genes originally in the pathway, and number of genes actually found in the pathway (given genes.1 and genes.2 genes names)
#' 
compute.scores.module <- function(pathways.set, genes.1, genes.2, row.genes.names, model, maxGenes, minGenes, parallelize=FALSE, data.description=NULL, is.exact, nb.repeat, p.dereg.cutoff)
{
	genes.deg = NULL
	if(model == "mixenrich")
	{
		require(mixtools)
		abs.logFC = abs(genes.2-genes.1) # find the absolute log fold change
		mix.model = normalmixEM(abs.logFC,lambda=.5,mu=c(.5,3),sigma=c(1,1))
		# function to find the probability of each gene belonging to deregulated class
		f = function(x,mix)
		{
			(mix$lambda[1]*(dnorm(x,mean=mix$mu[1],sd=mix$sigma[1])))/(mix$lambda[1]*(dnorm(x,mean=mix$mu[1],sd=mix$sigma[1]))+mix$lambda[2]*dnorm(x,mean=mix$mu[2],sd=mix$sigma[2]))
		}
		p.dereg=f(abs.logFC,mix.model) # probability of every gene
		genes.deg = unique(row.genes.names[p.dereg<=p.dereg.cutoff]) # find the genes that we strongly believe to be deregulated
		cat(paste("Found",length(genes.deg),"deregulated genes. ( Confidence =",p.dereg.cutoff,")\n"))
	}
	pathways <- as.character(unique(pathways.set$path_id))
	res = c()
	if(parallelize) 
	{
		sfExport("pathways")
		sfExport("pathways.set")
		sfExport("minGenes")
		sfExport("maxGenes")
		sfExport("model")
		sfExport("nb.repeat")
		sfExport("genes.deg")
		res = sfSapply(pathways, function(x) compute.scores.pathway(x, model, pathways.set, row.genes.names, genes.1, genes.2, maxGenes, minGenes, is.exact, nb.repeat, genes.deg))
	}
	if(!parallelize) res = sapply(pathways, function(x) compute.scores.pathway(x, model, pathways.set, row.genes.names, genes.1, genes.2, maxGenes, minGenes, is.exact, nb.repeat, genes.deg))	
	res.pvalue <- res[1,][names(which(!is.na(res[2,])))]
	res.pvalue <- sort(res.pvalue)
	if(model == "mixenrich")
	{
		if(is.null(data.description)) return(data.frame(p.value=res.pvalue, initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)]))
		return(data.frame(p.value=res.pvalue, description=as.character(data.description[match(names(res.pvalue), data.description$path_id),]$description), initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)]))
	}
	if(model == "wilcoxon")
	{
		ranks.pos <- res[2,][names(res.pvalue)]
		ranks.neg <- res[3,][names(res.pvalue)]
		up_down <- sapply(ranks.pos - ranks.neg, function(x) if(x < 0) return("+") else return("-"))
		if(is.null(data.description)) return(data.frame(p.value=res.pvalue, ranks.pos=ranks.pos, ranks.neg=ranks.neg, up_down=up_down, initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)]))
		return(data.frame(p.value=res.pvalue, description=as.character(data.description[match(names(res.pvalue), data.description$path_id),]$description), ranks.pos=ranks.pos, ranks.neg=ranks.neg, up_down=up_down, initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)]))
	}
	if(model == "vectorial" | model == "mahdist")
	{
		if(is.null(data.description)) return(data.frame(p.value=res.pvalue, distance=res[2,][names(res.pvalue)], initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)]))
		return(data.frame(p.value=res.pvalue, description=as.character(data.description[match(names(res.pvalue), data.description$path_id),]$description), distance=res[2,][names(res.pvalue)], initialSize=res[4,][names(res.pvalue)], computedSize=res[5,][names(res.pvalue)]))
	}
}

#' N-of-1-pathways main function (computes the p-values between two paired samples, for a given list of pathways)
#'
#' @param row.genes.names List HGNC gene names matching the genes.1 and genes.2 rows expression values
#' @param genes.1 A vector (paired with genes.2) containing all gene expressions
#' @param genes.2 A vector (paired with genes.1) containing all gene expressions
#' @param model The name of the model to use ('wilcoxon', 'vectorial', 'mixenrich', or 'mahdist')
#' @param maxSizePathway Maximum size of the pathways allowed (default = 500)
#' @param minSizePathway Minimum size of the pathways allowed (default = 15)
#' @param parallelize A boolean stating if the computation should be parallelized or not, using snowfall package (default = FALSE)
#' @param pathways.set The list of pathways to work with (Can be loaded from files in ./Genesets/)
#' @param data.description A data.frame containing two columns: path_id and description. Can be NULL or loaded from files in ./Genesets/ (default = NULL)
#' @param is.exact A boolean stating if the computed p-value should be exact or not
#' @param seed.value Seed for the random generator (used for the 'distance' and 'enrichment' models only; Defaut = 123)
#' @param nb.repeat Number of permutation resampling for the 'distance' model (Default = 1000)
#' @param p.dereg.cutoff Probability cutoff for the 'enrichment' model (Default = 0.05)
#' @return A data.frame containing the list of deregulated pathways, sorted by p-value
n.of.1.pathways <- function(row.genes.names, model, genes.1, genes.2, maxSizePathway=500, minSizePathway=15, parallelize=FALSE, pathways.set, data.description=NULL, is.exact=FALSE, seed.value=123, nb.repeat=1000, p.dereg.cutoff=0.05)
{
  set.seed(seed.value)
	if(!class(genes.1)=="numeric" | !class(genes.2)=="numeric" | !class(row.genes.names)=="character") cat("Error: genes.1 and genes.2 must be \"numeric\" vectors. row.genes.names must be \"character\" vector. \n Try using d[[\"colname\"]] instead of { d[\"colname\"] or d$colname } if you are extracting columns from a dataframe.\n")
	else
	{
		cat("Note: The data is supposed to be log-transformed. If not, you should transform it before running N-of-1-pathways.\n\n")
		cat("Running N-of-1-pathways framework with model =", model, "\n")
		if(parallelize)
		{
			require(parallel) # Used for detecting number of available cores
			require(snowfall) # Used for parallelizing apply functions
			# Initialize parallel calculation
			sfInit(parallel=TRUE, cpus=detectCores())
			sfExport("compute.scores.module")
			sfExport("compute.scores.pathway")
			sfExport("compute.mean.vert.dist")
		}
		# Filtering
		cat("Filtering out unkown genes / NA values...\n")
		indexes <- which(row.genes.names!="---" & !is.na(row.genes.names) & !is.na(genes.1) & !is.na(genes.2)) # Filter unkown genes / NA values
		genes.1 <- genes.1[indexes]
		genes.2 <- genes.2[indexes]
		row.genes.names <- toupper(row.genes.names[indexes])
		if(parallelize)
		{
			sfExport("row.genes.names")
			sfExport("genes.1")
			sfExport("genes.2")
		}
		# Compute the scores for each Pathway
		cat("Computing the scores for each pathway...\n")
		res <- compute.scores.module(pathways.set, genes.1, genes.2, row.genes.names, model, maxGenes=maxSizePathway, minGenes=minSizePathway, parallelize=parallelize, data.description=data.description, is.exact=is.exact, nb.repeat=nb.repeat, p.dereg.cutoff=p.dereg.cutoff)
		cat("Computation Done!\n")
		if(parallelize) sfStop()
		return(res)
	}
}