#####################################
###
### preprocess_yeastData.R
### Aug 24th, 2018
### Author: Samir Rachid Zaim
###
#####################################

### The yeast data set contains 48 replicates each of
### wild type (WT) and the Snf2 mutant.
### The counts are stored as one bam-derived output
### file per replicate, so all of the files must be
### read and aggregated.

### Names (not full paths) of the per-replicate count files for each
### condition. list.files() returns them in alphabetical order, and that
### order is what later determines which file becomes replicate 1, 2, ...
Snf2.fileList <- list.files(
    path = '~/Dropbox/TBC-2018-single-subject-methods-accuracies/data/dat/Snf2'
)
WT.fileList <- list.files(
    path = '~/Dropbox/TBC-2018-single-subject-methods-accuracies/data/dat/WT'
)


### read in the replicate count files
###
### The wild-type and Snf2 files are processed in exactly the same way, so
### the shared logic lives in one helper instead of two copy-pasted sections.

## Read one condition's replicate files into a single wide count table.
##
## Arguments:
##   data.dir   directory that holds the per-replicate count files
##   files      file names inside data.dir, one per replicate; column k of
##              the result is taken from files[k]
##   prefix     column-name prefix, e.g. 'WT_Replicate'
##   n.summary  number of trailing rows in each file that hold summary
##              statistics rather than genes; these rows are dropped
##
## Returns a data.table with one count column per replicate, named
## '<prefix> <k>' (with a space, as before), followed by a GeneSymbol column
## taken from the first column of the first file.
##
## Stops with an error when no files are given, when a file does not have
## exactly two columns (identifier, count), or when the files disagree on
## their number of rows.
read_replicates <- function(data.dir, files, prefix, n.summary = 5L) {
  if (length(files) == 0L) {
    stop("no replicate files supplied for ", data.dir, call. = FALSE)
  }

  # Full paths are used so the working directory never has to be changed.
  tables <- lapply(file.path(data.dir, files), data.table::fread)

  n.cols <- vapply(tables, ncol, integer(1))
  n.rows <- vapply(tables, nrow, integer(1))
  if (any(n.cols != 2L)) {
    stop("every replicate file must have exactly two columns: ",
         paste(files[n.cols != 2L], collapse = ", "), call. = FALSE)
  }
  if (length(unique(n.rows)) != 1L) {
    stop("replicate files in ", data.dir, " differ in their number of rows",
         call. = FALSE)
  }
  if (n.rows[1] <= n.summary) {
    stop("replicate files in ", data.dir, " contain no gene rows",
         call. = FALSE)
  }

  # the last n.summary rows are summary stats on the data, not genes
  keep <- seq_len(n.rows[1] - n.summary)

  # Second column of every file = that replicate's counts.
  counts <- lapply(tables, function(tab) tab[[2]][keep])
  mat <- data.table::as.data.table(counts)
  data.table::setnames(mat, paste(prefix, seq_along(files)))

  # Gene identifiers are taken from the first file only.
  # NOTE(review): this assumes every file lists the genes in the same order;
  # the files themselves are not compared here.
  mat$GeneSymbol <- tables[[1]][[1]][keep]
  mat
}


### read in wild-type (WT) replicates
WT.mat <- read_replicates(
  '~/Dropbox/TBC-2018-single-subject-methods-accuracies/data/dat/WT',
  WT.fileList,
  'WT_Replicate'
)

write.csv(WT.mat, '~/Dropbox/TBC-2018-single-subject-methods-accuracies/data/full_wt_dataset.csv')


### read in Snf2 mutant replicates
Snf2.mat <- read_replicates(
  '~/Dropbox/TBC-2018-single-subject-methods-accuracies/data/dat/Snf2',
  Snf2.fileList,
  'Snf2_Replicate'
)

write.csv(Snf2.mat, '~/Dropbox/TBC-2018-single-subject-methods-accuracies/data/full_Snf2_dataset.csv')

### combine both datasets

## Sort each condition by gene symbol so that the rows line up before the
## two tables are bound column-wise.
## Keep order() written directly inside the brackets: data.table may replace
## order() in `[...]` with its own internal sort, so computing the index
## separately with base order() could yield a different row order.
wt.sorted <- WT.mat[order(WT.mat$GeneSymbol)]
snf2.sorted <- Snf2.mat[order(Snf2.mat$GeneSymbol)]

## cbind() pairs rows purely by position, so refuse to continue if the two
## conditions do not list exactly the same genes in the same order.
if (!identical(as.character(wt.sorted$GeneSymbol),
               as.character(snf2.sorted$GeneSymbol))) {
  stop("WT and Snf2 tables do not contain the same gene symbols; ",
       "they cannot be combined column-wise", call. = FALSE)
}

## data.frame() makes the column names syntactic and unique, so
## 'WT_Replicate 1' becomes 'WT_Replicate.1' and the second GeneSymbol
## column becomes 'GeneSymbol.1'.
full.mat <- data.frame(cbind(wt.sorted, snf2.sorted))
rownames(full.mat) <- full.mat$GeneSymbol

## Drop both gene-symbol columns; the symbols now live in the row names.
## A logical mask (rather than negative grep() indices) cannot accidentally
## drop every column when nothing matches.
full.mat <- full.mat[, !grepl('GeneSymbol', names(full.mat)), drop = FALSE]

### remove bad replicates (based on their study)

## exclude.lst is read without a header; only its first column (V1) is used.
exclude.list <- data.table::fread('~/Dropbox/TBC-2018-single-subject-methods-accuracies/data/exclude.lst', header=FALSE)
exclude.list <- exclude.list$V1

## Turn each entry into the column-name form used by full.mat:
## strip everything from '_MID' onwards, then rewrite '_rep' as '_Replicate.'.
exclude.list <- gsub('_MID[a-z,0-9,_,.,A-Z]*','', exclude.list)
exclude.list <- gsub('_rep','_Replicate.', exclude.list)

## NOTE(review): "Snf2_Replicate.6" is added by hand, presumably because the
## corresponding entry in exclude.lst does not translate to a matching column
## name - confirm against the file.
drop.cols <- c(exclude.list, "Snf2_Replicate.6")

## Names that match no column were previously ignored without any sign;
## report them so a naming mismatch does not silently keep a bad replicate.
unmatched <- setdiff(drop.cols, names(full.mat))
if (length(unmatched) > 0) {
  warning("exclusion entries that match no column in full.mat: ",
          paste(unmatched, collapse = ", "), call. = FALSE)
}

full.mat <- full.mat[, !names(full.mat) %in% drop.cols, drop = FALSE]

## NOTE(review): row names are not written, and the GeneSymbol columns were
## removed earlier, so the output file carries no gene identifiers - confirm
## that downstream code expects this before changing it.
write.csv(full.mat, '~/Dropbox/TBC-2018-single-subject-methods-accuracies/data/full_combined_dataset.csv', row.names=FALSE)