📦 Load Required Packages

library(limma)
library(edgeR)
library(ggplot2)
library(pheatmap)
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(DOSE)
library(biomaRt)
library(EnhancedVolcano)
library(msigdbr)

📂 Load and Prepare Data

setwd("C:/Users/temit/Desktop/My_GitHub/ID_bioinformatics/1. GSE152418")

counts <- read.delim(gzfile("GSE152418_p20047_Study1_RawCounts.txt.gz"), 
                     row.names = 1, check.names = FALSE)
colnames(counts) <- gsub("\\.", "-", colnames(counts))

sample_ids <- colnames(counts)
group <- c(rep("COVID-19", 17), rep("Convalescent", 17))
pheno <- data.frame(sample_id = sample_ids, group = factor(group))
rownames(pheno) <- pheno$sample_id

counts <- counts[, rownames(pheno)]

📉 Library Size Check

dge <- DGEList(counts = counts, group = pheno$group)
keep <- filterByExpr(dge)
dge <- dge[keep,, keep.lib.sizes = FALSE]

barplot(dge$samples$lib.size / 1e6, names.arg = colnames(dge), 
        las = 2, cex.names = 0.7, col = "skyblue",
        main = "Library Sizes", ylab = "Million Reads")

🔄 Normalization & QC

dge <- calcNormFactors(dge)
logCPM <- cpm(dge, log = TRUE)

boxplot(logCPM, las = 2, col = "lightgreen",
        main = "Log2 CPM After Normalization",
        ylab = "Log2 Counts per Million")

plotMDS(dge, col = as.numeric(dge$samples$group))
legend("topright", legend = levels(dge$samples$group), 
       col = 1:length(levels(dge$samples$group)), pch = 16)

🧪 Differential Expression Analysis

design <- model.matrix(~group, data = pheno)
dge <- estimateDisp(dge, design)
fit <- glmQLFit(dge, design)
qlf <- glmQLFTest(fit)

plotQLDisp(fit)

results <- topTags(qlf, n = Inf)
results_df <- as.data.frame(results)
summary(decideTests(qlf))
##        groupCOVID-19
## Down            2753
## NotSig          9681
## Up              3240
# Preview top 10 results
top_degs <- head(results_df, 10)
kable(top_degs, caption = "Top 10 Differentially Expressed Genes")
Top 10 Differentially Expressed Genes
logFC logCPM F PValue FDR
ENSG00000167900 3.131654 4.894398 211.5524 0 0
ENSG00000166851 3.792626 3.580563 190.9849 0 0
ENSG00000171848 3.742333 5.764055 180.1725 0 0
ENSG00000111206 3.001165 2.552239 181.0904 0 0
ENSG00000175063 3.483723 3.194802 175.5504 0 0
ENSG00000178999 3.135028 2.944081 171.3765 0 0
ENSG00000088325 3.085792 3.861850 170.7789 0 0
ENSG00000123485 3.150995 2.495765 169.7789 0 0
ENSG00000154277 5.017562 1.001783 116.8335 0 0
ENSG00000145386 3.665545 3.538345 166.1403 0 0

🌋 Volcano Plot

EnhancedVolcano(
  results_df,
  lab = rownames(results_df),           # gene names as labels
  x = 'logFC',                          # column with log2 fold changes
  y = 'PValue',                         # column with p-values
  pCutoff = 0.05,                       # FDR threshold (for y-axis significance)
  FCcutoff = 1,                         # Fold-change threshold
  title = "Volcano Plot of DEGs",
  subtitle = "FDR < 0.05 and |logFC| > 1",
  xlab = "log2 Fold Change",
  ylab = "-log10(p-value)",
  pointSize = 2.0,
  labSize = 3.5,
  colAlpha = 0.8,
  legendPosition = 'right',
  legendLabSize = 12,
  legendIconSize = 4.0,
  drawConnectors = TRUE,               # Draw lines connecting labels to points
  widthConnectors = 0.5
)

🔥 Heatmap of Top DEGs

group <- factor(dge$samples$group)
top_genes <- rownames(results)[1:50]
mat <- logCPM[top_genes, ]

annotation_col <- data.frame(Group = group)
rownames(annotation_col) <- colnames(mat)

pheatmap(mat,
         scale = "row",
         show_rownames = FALSE,
         annotation_col = annotation_col,
         main = "Top 50 DEGs Heatmap")

📊 GO & KEGG Enrichment

sig_genes <- subset(results_df, FDR < 0.05 & abs(logFC) > 1)
gene_symbols <- rownames(sig_genes)

tryCatch({
  gene_entrez <- bitr(gene_symbols, 
                      fromType = "ENSEMBL", 
                      toType = "ENTREZID", 
                      OrgDb = org.Hs.eg.db)

  go_enrich <- enrichGO(gene = gene_entrez$ENTREZID,
                        OrgDb = org.Hs.eg.db,
                        keyType = "ENTREZID",
                        ont = "BP", pvalueCutoff = 0.05, qvalueCutoff = 0.2)

  barplot(go_enrich, showCategory = 15, title = "GO: Biological Process")
  dotplot(go_enrich, showCategory = 15, title = "GO Enrichment")

  kegg_enrich <- enrichKEGG(gene = gene_entrez$ENTREZID,
                            organism = 'hsa', pvalueCutoff = 0.05)

  barplot(kegg_enrich, showCategory = 15, title = "KEGG Pathways")
  dotplot(kegg_enrich, showCategory = 15, title = "KEGG Dotplot")
}, error = function(e) message("Enrichment step failed: ", e$message))

🧠 Enrichment Map

tryCatch({
  go_enrich_sim <- pairwise_termsim(go_enrich)
  emapplot(go_enrich_sim, showCategory = 30, layout = "kk")
  cnetplot(go_enrich, showCategory = 5)
}, error = function(e) message("Enrichment map step failed: ", e$message))

🎯 Gene Set Enrichment Analysis (GSEA)

gene_list <- results_df$logFC
names(gene_list) <- rownames(results_df)
gene_list <- sort(gene_list, decreasing = TRUE)

tryCatch({
  mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
  gene_conversion <- getBM(filters = "ensembl_gene_id",
                           attributes = c("ensembl_gene_id", "entrezgene_id"),
                           values = names(gene_list), mart = mart)

  gene_conversion <- gene_conversion[!is.na(gene_conversion$entrezgene_id), ]
  gene_conversion <- gene_conversion[!duplicated(gene_conversion$ensembl_gene_id), ]

  gene_list_filtered <- gene_list[gene_conversion$ensembl_gene_id]
  names(gene_list_filtered) <- gene_conversion$entrezgene_id
  gene_list_filtered <- gene_list_filtered[!duplicated(names(gene_list_filtered))]
  gene_list_filtered <- sort(gene_list_filtered, decreasing = TRUE)

  gsea_go <- gseGO(geneList = gene_list_filtered,
                   OrgDb = org.Hs.eg.db,
                   keyType = "ENTREZID",
                   ont = "BP", pvalueCutoff = 0.05)

  dotplot(gsea_go, showCategory = 20, title = "GSEA: GO Biological Processes")
  emapplot(pairwise_termsim(gsea_go), showCategory = 10, layout = "kk")
  ridgeplot(gsea_go) + labs(title = "GSEA Score Distribution")
  gseaplot2(gsea_go, geneSetID = "GO:0002449", title = "Lymphocyte Mediated Immunity")
}, error = function(e) message("GSEA step failed: ", e$message))

🧬 GSEA with Immune Signatures

tryCatch({
  msigdb_immune <- msigdbr(species = "Homo sapiens", category = "C7")
  msigdb_list <- msigdb_immune[, c("gs_name", "entrez_gene")]

  gsea_immune <- GSEA(geneList = gene_list_filtered,
                      TERM2GENE = msigdb_list,
                      pvalueCutoff = 0.05)

  dotplot(gsea_immune, showCategory = 20) + ggtitle("GSEA: Immune Signatures")
  ridgeplot(gsea_immune) + ggtitle("Enrichment Score Distribution")
  emapplot(pairwise_termsim(gsea_immune), showCategory = 20, layout = "kk")
}, error = function(e) message("Immune GSEA failed: ", e$message))

💾 Save Results

write.csv(results_df, "DEGs_GSE152418.csv")
write.csv(as.data.frame(go_enrich), "GO_enrichment_results.csv")
write.csv(as.data.frame(kegg_enrich), "KEGG_enrichment_results.csv")