#####################################
### preprocess_yeastData.R
### Aug 24th, 2018
### Author: Samir Rachid Zaim
#####################################

### Yeast contains 48 replicates of wild type (WT) and of the Snf2
### mutation. The data are divided into individual bam output files with
### counts for either condition, so they must all be read and aggregated.
###
### Outputs (all written below `data_dir`):
###   full_wt_dataset.csv       - WT counts, one column per replicate
###   full_Snf2_dataset.csv     - Snf2 counts, one column per replicate
###   full_combined_dataset.csv - both conditions, bad replicates removed

library(data.table)

data_dir <- "~/Dropbox/TBC-2018-single-subject-methods-accuracies/data"
wt_dir <- file.path(data_dir, "dat", "WT")
snf2_dir <- file.path(data_dir, "dat", "Snf2")

n_replicates <- 48L    # replicates expected per condition
n_raw_rows <- 7131L    # rows expected in every count file
n_summary_rows <- 5L   # the last 5 rows are summary stats on the data

# Read and aggregate the per-replicate count files of one condition.
#
# files:     full paths of the count files. Assumes each file holds exactly
#            two columns (gene symbol, count) - TODO confirm against the data.
# label:     column-name prefix, e.g. "WT_Replicate".
# n_reps:    number of replicates to keep (the first `n_reps` files).
# n_rows:    number of rows every file is expected to have.
# n_summary: number of trailing summary rows to discard.
#
# Returns a data.table with `n_reps` count columns named "<label> <i>" plus a
# trailing GeneSymbol column taken from the first file.
#
# NOTE(review): replicate number i is the i-th file in the order given, which
# for dir() output is alphabetical. If the file names embed replicate numbers
# without zero padding, position i may not be replicate i - verify, because
# the exclusion list below refers to replicates by number.
read_replicates <- function(files, label,
                            n_reps = n_replicates,
                            n_rows = n_raw_rows,
                            n_summary = n_summary_rows) {
  if (length(files) < n_reps) {
    stop("expected at least ", n_reps, " count files for '", label,
         "' but found ", length(files), call. = FALSE)
  }

  # Read everything first and bind once instead of growing a table in a loop.
  combined <- do.call(cbind, lapply(files, fread))
  if (nrow(combined) != n_rows) {
    stop("expected ", n_rows, " rows per count file for '", label,
         "' but found ", nrow(combined), call. = FALSE)
  }

  combined <- combined[seq_len(n_rows - n_summary)]
  gene_symbols <- combined[[1L]]

  # Columns alternate (gene, count) per file; keep the count column of each of
  # the first `n_reps` files.
  counts <- combined[, seq(2L, 2L * n_reps, by = 2L), with = FALSE]

  # paste() deliberately leaves a space in the name ("WT_Replicate 1"); that
  # is what the per-condition CSV headers contain, and data.frame() later
  # turns it into "WT_Replicate.1", which the exclusion list relies on.
  colnames(counts) <- paste(label, seq_len(n_reps))
  counts$GeneSymbol <- gene_symbols
  counts
}

WT.fileList <- dir(wt_dir)
Snf2.fileList <- dir(snf2_dir)

### read in wild-type replicates
# Files are addressed by full path so the working directory is left untouched.
WT.mat <- read_replicates(file.path(wt_dir, WT.fileList), "WT_Replicate")
write.csv(WT.mat, file.path(data_dir, "full_wt_dataset.csv"))

### read in Snf2 mutant replicates
Snf2.mat <- read_replicates(file.path(snf2_dir, Snf2.fileList),
                            "Snf2_Replicate")
write.csv(Snf2.mat, file.path(data_dir, "full_Snf2_dataset.csv"))

### combine both datasets
# Both tables are sorted by gene symbol and then bound column-wise, which is
# only valid when they contain exactly the same genes.
if (!identical(sort(WT.mat$GeneSymbol), sort(Snf2.mat$GeneSymbol))) {
  stop("WT and Snf2 gene symbols differ; cannot combine by position",
       call. = FALSE)
}

# data.frame() sanitises the names: "WT_Replicate 1" -> "WT_Replicate.1" and
# the second GeneSymbol column -> "GeneSymbol.1".
full.mat <- data.frame(cbind(
  WT.mat[order(WT.mat$GeneSymbol)],
  Snf2.mat[order(Snf2.mat$GeneSymbol)]
))
rownames(full.mat) <- full.mat$GeneSymbol
full.mat <- full.mat[, !grepl("GeneSymbol", names(full.mat)), drop = FALSE]

### remove bad replicates (based on their study)
exclude.list <- fread(file.path(data_dir, "exclude.lst"), header = FALSE)
exclude.list <- exclude.list$V1
# Strip everything from "_MID" onwards and rewrite "_rep" as "_Replicate." so
# the entries take the same form as the sanitised column names.
exclude.list <- gsub("_MID[a-z,0-9,_,.,A-Z]*", "", exclude.list)
exclude.list <- gsub("_rep", "_Replicate.", exclude.list)

# "Snf2_Replicate.6" is dropped in addition to the listed replicates.
drop_cols <- c(exclude.list, "Snf2_Replicate.6")
full.mat <- full.mat[, !names(full.mat) %in% drop_cols, drop = FALSE]

# NOTE(review): gene symbols live only in the row names at this point, so
# row.names = FALSE writes the combined table without any gene identifiers.
# Kept as-is because downstream readers of this file are not visible here -
# confirm whether that is intended.
write.csv(full.mat, file.path(data_dir, "full_combined_dataset.csv"),
          row.names = FALSE)