library(tidyverse)
# Working directory ----
# Relative output files written later in the script (save(), write_csv())
# resolve against this directory.
setwd("/work/tbLincRnas/pipeline/DE/edgeR/")

# Salmon output directory ----
dir <- "/work/tbLincRnas/pipeline/salmon/output"
list.files(dir)

# Build the path to the quant.sf file of every sample ----
# The sample sheet is expected to carry a `samples` column holding the
# per-sample sub-directory names.
sample_sheet <- file.path(dir, "samples_iPSCs_TFB.txt")
samples <- read.table(sample_sheet, header = TRUE)
head(samples)

files <- file.path(dir, samples$samples, "quant.sf")
files
# names(files) <- paste0("samples", 1:18)
head(files)
# Sanity check: TRUE only when every quant.sf path exists.
all(file.exists(files))

# Transcript-to-gene table derived from noncoding.gtf ----
library(readr)
csv <- "/work/tbLincRnas/pipeline/gtf_to_fasta/lncRNA"
tx2gene_path <- file.path(csv, "tx2gene.noncoding.csv")
tx2gene <- read_tsv(tx2gene_path)
head(tx2gene)

# Import the Salmon quantifications with tximport ----
# tximport() was called without the package ever being attached, so a
# top-to-bottom run stopped with "could not find function". Attach it here.
library(tximport)
# NOTE(review): rjson is kept from the original script; presumably tximport
# needs a JSON reader for Salmon's metadata -- confirm whether it is still
# required with the installed tximport version.
library(rjson)

# Summarise transcript-level estimates to gene level using tx2gene.
txi <- tximport(files, type = "salmon", tx2gene = tx2gene)
names(txi)
head(txi$counts)
head(txi$abundance)
# save(txi$counts, file = "tximport_salmon.RData")

# Install and attach edgeR ----
# Sourcing biocLite.R defines biocLite(), which is also called further down
# the script to install the annotation packages, so this line must run first.
# NOTE(review): biocLite is the legacy Bioconductor installer; recent
# Bioconductor releases use BiocManager::install() instead -- confirm which
# one matches the R version this script is run with before changing it.
source("https://bioconductor.org/biocLite.R")
# Installs edgeR on every run of the script.
biocLite("edgeR")
library(edgeR)

# iPSC vs TFB ----
# Build the DGEList from the gene-level counts returned by tximport.
# Group labels: the first 12 libraries are group "1", the last 6 group "2".
cts <- txi$counts
groupSamples <- rep(c("1", "2"), times = c(12, 6))
dgList <- DGEList(counts = cts, group = groupSamples)

# Show the column names before replacing them with the sample identifiers.
colnames(dgList)
colnames(dgList) <- c(
  paste0("TFB", seq(267, 289, by = 2)),
  paste0("ZN", c(21, 22, 27, 28, 29, 30))
)
dgList$samples
head(dgList$counts)

# Filtering ----
# Keep a gene when its CPM exceeds 1 in at least two libraries.
above_cpm <- cpm(dgList) > 1
keep <- rowSums(above_cpm) >= 2
keep
# Library sizes are recomputed from the retained genes.
dgList <- dgList[keep, , keep.lib.sizes = FALSE]
dgList$samples

# TMM normalisation factors ----
dgList <- calcNormFactors(dgList, method = "TMM")
summary(dgList)

# Data exploration ----
# MDS plot; the first 12 libraries are drawn in colour 1, the last 6 in 2.
plotMDS(dgList, col = c(rep(1, 12), rep(2, 6)), gene.selection = "pairwise")


# Sample types ----
# The levels are fixed explicitly. When model.matrix() converts a character
# vector itself, the level order comes from sorting the labels, and the
# relative order of "TFB" and "iPS" depends on the session locale (a C locale
# puts upper case first). Pinning "iPS" as the reference guarantees that
# coefficient 2 is always TFB relative to iPS, which is how the downstream
# "up-regulated in TFB" counts are interpreted.
samplesType <- factor(
  c(rep("TFB", 12), rep("iPS", 6)),
  levels = c("iPS", "TFB")
)
samplesType

# Design matrix: intercept (iPS) + samplesTypeTFB.
design <- model.matrix(~samplesType)




# Dispersion estimation ----
# Common, then trended, then tagwise GLM dispersions, all against the same
# design matrix; each step stores its estimate in the DGEList it returns.
dgList <- dgList %>%
  estimateGLMCommonDisp(design = design) %>%
  estimateGLMTrendedDisp(design = design) %>%
  estimateGLMTagwiseDisp(design = design)

# Biological coefficient of variation plot.
plotBCV(dgList)

# Differential expression - quasi-likelihood F-test ----
# Fit the QL model and test the second design coefficient.
fit <- glmQLFit(dgList, design = design)
qlf <- glmQLFTest(fit, coef = 2)

# Top-ranked tables kept for later use.
qlf_top100 <- topTags(qlf, n = 100)
qlf_top30 <- topTags(qlf, n = 30)

# Top 100 ordered by log fold change, BH-adjusted.
teste <- topTags(qlf, n = 100, adjust.method = "BH", sort.by = "logFC")
teste
# Top 10 by log fold change among genes passing the 0.001 cutoff.
topTags(qlf, n = 10, sort.by = "logFC", p.value = 0.001)

# Full result table.
qlf$table

save(qlf_top100, file = "top100_qlf_TFB_IPS.RData")

# CPM values of the ten genes with the smallest p-values.
o <- order(qlf$table$PValue)
top10_idx <- o[seq_len(10)]
cpm(dgList[top10_idx, ])


# decideTests - QLF ----
# Classify every gene as down (-1), not significant (0) or up (1) at a 0.001
# cutoff. The argument is spelled out in full (`p.value`) instead of relying
# on partial matching of `p`.
# NOTE(review): decideTests adjusts p-values by default (BH, per its
# documentation), so this count is expected to differ from the raw
# `PValue < 0.001` count printed at the end of this section -- confirm which
# threshold is the intended one.
deGenesQLF <- decideTests(qlf, p.value = 0.001)
# Keep the identifiers of all genes flagged in either direction.
deGenesQLF <- rownames(qlf)[as.logical(deGenesQLF)]
deGenesQLF
# Total number of DE lncRNAs - 3960
length(deGenesQLF)


# Number of genes with an unadjusted p-value below 0.001.
qlf$table %>% filter(PValue < 0.001) %>% nrow()

# Creating the gene universe ----
qlf_table <- qlf$table
head(qlf_table)

# Add the gene identifier (taken from the row names) and a DE flag based on
# the unadjusted p-value, then move GeneID to the front. Columns are selected
# by name so the layout does not depend on how many statistics columns
# qlf$table carries.
qlf_table <- qlf_table %>%
  mutate(GeneID = rownames(qlf_table), de = PValue < 0.001)
head(qlf_table)
qlf_table <- qlf_table %>% select(GeneID, everything())

# Preview only: the result is printed, not stored. p.adjust() is called with
# its default method here.
qlf_table %>% mutate(PValue.adjusted = p.adjust(PValue))

# The original script attempted a left_join() against `grch38_id` at this
# point. That object is only created further down the script and the result
# was never stored, so the call could only fail; it has been removed. The
# Entrez mapping of qlf_table is done after `grch38_id` exists.

# Up / down sets: unadjusted p < 0.001 and |logFC| > 2.
qlf_table %>% filter(PValue < 0.001)
qlf_down <- qlf_table %>% filter(PValue < 0.001) %>% filter(logFC < -2)
qlf_up <- qlf_table %>% filter(PValue < 0.001) %>% filter(logFC > 2)

nrow(qlf_up) # 952 up-regulated in TFB
nrow(qlf_down) # 1042 down-regulated in TFB

qlf_table %>% filter(logFC > 2) # 1116
qlf_table %>% filter(logFC < -2) # 1083

# Strip everything from the first "." onwards from the gene identifiers
# (vectorised; also safe when the table has zero rows).
qlf_table$GeneID <- str_split_fixed(qlf_table$GeneID, "\\.", n = 2)[, 1]
qlf_table

# Named factor over all tested genes: 1 = DE, 0 = not DE.
gene_universe <- factor(as.numeric(qlf_table$de))
names(gene_universe) <- qlf_table$GeneID
head(gene_universe)


# Plots ----
# Smear plot of the QL test result with the genes in deGenesQLF highlighted.
plotSmear(qlf, de.tags = deGenesQLF)
# Horizontal guide lines at -1 and 1 on the plot just drawn.
abline(h=c(-1,1), col=2)



# Mean-difference plot of the same test result.
plotMD(qlf, main = "DE lncRNAs - iPSC vs TFB")
abline(h=c(-1,1), col="black")

# Heatmap - TODO: still need to select only the genes of interest; as written
# every gene retained in dgList is passed to heatmap().
logcpm <- cpm(dgList, prior.count=2, log=TRUE)
heatmap(logcpm)
# GO analyses ----
# biocLite() is defined by the source() call in the edgeR section above.
biocLite("org.Hs.eg.db")
biocLite("stephenturner/annotables")
library(annotables)
annotables::grch38

# DE gene identifiers as a one-column data frame.
deGenesQLF_dataframe <- data.frame(
  GeneID = deGenesQLF,
  stringsAsFactors = FALSE
)

# Ensembl gene -> Entrez lookup built from the human (grch38) table only.
# The original mixed in `grcm38$entrez` (the mouse table) and then renamed a
# multi-column frame with too few column names; both columns now come from
# grch38 and are named at construction.
grch38_ensgene <- data.frame(
  GeneID = grch38$ensgene,
  stringsAsFactors = FALSE
)
grch38_id <- data.frame(
  GeneID = grch38$ensgene,
  EntrezID = grch38$entrez,
  stringsAsFactors = FALSE
)
head(grch38_id)

# One-column GeneID frames for the up- and down-regulated sets ----
# qlf_up / qlf_down are filtered before the identifiers in qlf_table are
# trimmed, so they still carry whatever follows the first "." in the row
# names. The lookup table and the enrichment calls below use the bare
# identifier, so the suffix is removed here (a no-op for IDs without a ".").
deGenesQLFUP <- data.frame(
  GeneID = str_split_fixed(as.character(qlf_up$GeneID), "\\.", n = 2)[, 1],
  stringsAsFactors = FALSE
)
head(deGenesQLFUP)

deGenesQLFDown <- data.frame(
  GeneID = str_split_fixed(as.character(qlf_down$GeneID), "\\.", n = 2)[, 1],
  stringsAsFactors = FALSE
)
class(deGenesQLFDown)

# Scratch copy of the down-regulated set (the empty tibble that used to be
# assigned first was overwritten immediately and has been dropped).
teste <- deGenesQLFDown
head(as_tibble(teste))

# DE gene identifiers with everything after the first "." removed ----
# Vectorised replacement for the former loop that grew a tibble one row at a
# time; it also behaves correctly when there are no DE genes.
de_gene_ids <- as.character(deGenesQLF_dataframe[[1]])
lista <- tibble(
  GeneID = str_split_fixed(de_gene_ids, "\\.", n = 2)[, 1]
)
lista[2, ]

# Attach Entrez identifiers; genes without a match get NA.
entrez_deGenesQLF <- left_join(lista, grch38_id, by = "GeneID")
entrez_deGenesQLF

# First column: gene IDs; second column: Entrez IDs (both stay tibbles).
iPSvsTFB_geneID_list <- entrez_deGenesQLF[, 1]
iPSvsTFB_entrez_list <- entrez_deGenesQLF[, 2]
iPSvsTFB_geneID_list

write_csv(iPSvsTFB_geneID_list, "iPS_TFB_geneID_DE.csv")

# Export the down-regulated gene identifiers.
deGenesQLFDown_list <- as.data.frame(deGenesQLFDown$GeneID)
write_csv(deGenesQLFDown_list, "iPS_TFB_geneID_downDE")

# GO over-representation on the QL test result ----
# The original passed as.character(qlf_table), i.e. a deparsed data frame,
# which is not a gene list. The fitted test object is passed instead, with one
# Entrez ID per tested gene, which is also what produces the Up/Down columns
# that topGO(sort = "up") asks for.
qlf_ensembl <- str_split_fixed(rownames(qlf), "\\.", n = 2)[, 1]
qlf_entrez <- grch38_id$EntrezID[match(qlf_ensembl, grch38_id$GeneID)]
# NOTE(review): genes without an Entrez ID are dropped before testing so no
# NA identifiers reach goana -- confirm this is the intended universe.
has_entrez <- !is.na(qlf_entrez)
go <- goana(
  qlf[has_entrez, ],
  geneid = as.character(qlf_entrez[has_entrez]),
  species = "Hs"
) # , trend = TRUE, plot = TRUE)
go
topGO(go, sort = "up")

# KEGG over-representation for the down-regulated genes ----
# The Entrez IDs are looked up here because the joined table the original
# referred to is only built later in the script. Anything after the first "."
# in the gene ID is removed before matching, and unmatched genes are dropped.
down_ensembl <- str_split_fixed(
  as.character(deGenesQLFDown$GeneID), "\\.", n = 2
)[, 1]
down_entrez <- grch38_id$EntrezID[match(down_ensembl, grch38_id$GeneID)]
down_entrez <- unique(down_entrez[!is.na(down_entrez)])
keg <- kegga(as.character(down_entrez), species = "Hs")
topKEGG(keg)
keg

#GO and KEGG analyses with clusterProfiler
library(clusterProfiler)
library(AnnotationHub)
library(org.Hs.eg.db)

# iPSvcTFB_list.df <- bitr(iPSvsTFB_geneID_list, fromType = "ENTREZID",
                # toType = c("ENSEMBL", "SYMBOL"),
                # OrgDb = org.Hs.eg.db)
# Show the Entrez identifiers of the DE genes (both views of the same column).
iPSvsTFB_entrez_list
entrez_deGenesQLF$EntrezID

# GO enrichment (molecular function) of the down-regulated genes against all
# tested genes, using Ensembl identifiers as keys.
down_gene_ids <- as.vector(deGenesQLFDown$GeneID)
tested_gene_ids <- as.vector(qlf_table$GeneID)
ego <- enrichGO(
  gene = down_gene_ids,
  # keytype = "ENTREZID",
  universe = tested_gene_ids,
  keytype = "ENSEMBL",
  OrgDb = "org.Hs.eg.db",
  ont = "MF",
  pAdjustMethod = "BH",
  qvalueCutoff = 0.05
)

ego
head(ego)

# Dot plot of up to 30 categories, then the enrichment map.
dotplot(ego, showCategory = 30)
enrichMap(ego, vertex.label.cex = 1.1, layout = igraph::layout.kamada.kawai)


# Trying KEGG for the down-regulated genes ----
# Attach Entrez identifiers to the down-regulated set and to the full table.
entrez_deGenesQLFDown <- deGenesQLFDown %>%
  left_join(grch38_id, by = "GeneID")
head(entrez_deGenesQLFDown)
qlf_table_entrezlist <- qlf_table %>%
  left_join(grch38_id, by = "GeneID")

# KEGG enrichment on the Entrez identifiers of the down-regulated genes.
down_entrez_ids <- as.vector(entrez_deGenesQLFDown$EntrezID)
keg_CP <- enrichKEGG(
  gene = down_entrez_ids,
  keyType = "kegg",
  # universe = as.vector(qlf_table_entrezlist$EntrezID),
  pAdjustMethod = "BH",
  qvalueCutoff = 0.05
)

keg_CP

