📦 Load Required Packages
library(limma)
library(edgeR)
library(ggplot2)
library(pheatmap)
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(DOSE)
library(biomaRt)
library(EnhancedVolcano)
library(msigdbr)
📂 Load and Prepare Data
setwd("C:/Users/temit/Desktop/My_GitHub/ID_bioinformatics/1. GSE152418")
counts <- read.delim(gzfile("GSE152418_p20047_Study1_RawCounts.txt.gz"),
row.names = 1, check.names = FALSE)
colnames(counts) <- gsub("\\.", "-", colnames(counts))
sample_ids <- colnames(counts)
group <- c(rep("COVID-19", 17), rep("Convalescent", 17))
pheno <- data.frame(sample_id = sample_ids, group = factor(group))
rownames(pheno) <- pheno$sample_id
counts <- counts[, rownames(pheno)]
📉 Library Size Check
dge <- DGEList(counts = counts, group = pheno$group)
keep <- filterByExpr(dge)
dge <- dge[keep,, keep.lib.sizes = FALSE]
barplot(dge$samples$lib.size / 1e6, names.arg = colnames(dge),
las = 2, cex.names = 0.7, col = "skyblue",
main = "Library Sizes", ylab = "Million Reads")

🔄 Normalization & QC
dge <- calcNormFactors(dge)
logCPM <- cpm(dge, log = TRUE)
boxplot(logCPM, las = 2, col = "lightgreen",
main = "Log2 CPM After Normalization",
ylab = "Log2 Counts per Million")

plotMDS(dge, col = as.numeric(dge$samples$group))
legend("topright", legend = levels(dge$samples$group),
col = 1:length(levels(dge$samples$group)), pch = 16)

🧪 Differential Expression Analysis
design <- model.matrix(~group, data = pheno)
dge <- estimateDisp(dge, design)
fit <- glmQLFit(dge, design)
qlf <- glmQLFTest(fit)
plotQLDisp(fit)

results <- topTags(qlf, n = Inf)
results_df <- as.data.frame(results)
summary(decideTests(qlf))
## groupCOVID-19
## Down 2753
## NotSig 9681
## Up 3240
# Preview top 10 results
top_degs <- head(results_df, 10)
kable(top_degs, caption = "Top 10 Differentially Expressed Genes")
Top 10 Differentially Expressed Genes
ENSG00000167900 |
3.131654 |
4.894398 |
211.5524 |
0 |
0 |
ENSG00000166851 |
3.792626 |
3.580563 |
190.9849 |
0 |
0 |
ENSG00000171848 |
3.742333 |
5.764055 |
180.1725 |
0 |
0 |
ENSG00000111206 |
3.001165 |
2.552239 |
181.0904 |
0 |
0 |
ENSG00000175063 |
3.483723 |
3.194802 |
175.5504 |
0 |
0 |
ENSG00000178999 |
3.135028 |
2.944081 |
171.3765 |
0 |
0 |
ENSG00000088325 |
3.085792 |
3.861850 |
170.7789 |
0 |
0 |
ENSG00000123485 |
3.150995 |
2.495765 |
169.7789 |
0 |
0 |
ENSG00000154277 |
5.017562 |
1.001783 |
116.8335 |
0 |
0 |
ENSG00000145386 |
3.665545 |
3.538345 |
166.1403 |
0 |
0 |
🌋 Volcano Plot
EnhancedVolcano(
results_df,
lab = rownames(results_df), # gene names as labels
x = 'logFC', # column with log2 fold changes
y = 'PValue', # column with p-values
pCutoff = 0.05, # FDR threshold (for y-axis significance)
FCcutoff = 1, # Fold-change threshold
title = "Volcano Plot of DEGs",
subtitle = "FDR < 0.05 and |logFC| > 1",
xlab = "log2 Fold Change",
ylab = "-log10(p-value)",
pointSize = 2.0,
labSize = 3.5,
colAlpha = 0.8,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 4.0,
drawConnectors = TRUE, # Draw lines connecting labels to points
widthConnectors = 0.5
)

🔥 Heatmap of Top DEGs
group <- factor(dge$samples$group)
top_genes <- rownames(results)[1:50]
mat <- logCPM[top_genes, ]
annotation_col <- data.frame(Group = group)
rownames(annotation_col) <- colnames(mat)
pheatmap(mat,
scale = "row",
show_rownames = FALSE,
annotation_col = annotation_col,
main = "Top 50 DEGs Heatmap")

📊 GO & KEGG Enrichment
sig_genes <- subset(results_df, FDR < 0.05 & abs(logFC) > 1)
gene_symbols <- rownames(sig_genes)
tryCatch({
gene_entrez <- bitr(gene_symbols,
fromType = "ENSEMBL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)
go_enrich <- enrichGO(gene = gene_entrez$ENTREZID,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP", pvalueCutoff = 0.05, qvalueCutoff = 0.2)
barplot(go_enrich, showCategory = 15, title = "GO: Biological Process")
dotplot(go_enrich, showCategory = 15, title = "GO Enrichment")
kegg_enrich <- enrichKEGG(gene = gene_entrez$ENTREZID,
organism = 'hsa', pvalueCutoff = 0.05)
barplot(kegg_enrich, showCategory = 15, title = "KEGG Pathways")
dotplot(kegg_enrich, showCategory = 15, title = "KEGG Dotplot")
}, error = function(e) message("Enrichment step failed: ", e$message))

🧠Enrichment Map
tryCatch({
go_enrich_sim <- pairwise_termsim(go_enrich)
emapplot(go_enrich_sim, showCategory = 30, layout = "kk")
cnetplot(go_enrich, showCategory = 5)
}, error = function(e) message("Enrichment map step failed: ", e$message))

🎯 Gene Set Enrichment Analysis (GSEA)
gene_list <- results_df$logFC
names(gene_list) <- rownames(results_df)
gene_list <- sort(gene_list, decreasing = TRUE)
tryCatch({
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
gene_conversion <- getBM(filters = "ensembl_gene_id",
attributes = c("ensembl_gene_id", "entrezgene_id"),
values = names(gene_list), mart = mart)
gene_conversion <- gene_conversion[!is.na(gene_conversion$entrezgene_id), ]
gene_conversion <- gene_conversion[!duplicated(gene_conversion$ensembl_gene_id), ]
gene_list_filtered <- gene_list[gene_conversion$ensembl_gene_id]
names(gene_list_filtered) <- gene_conversion$entrezgene_id
gene_list_filtered <- gene_list_filtered[!duplicated(names(gene_list_filtered))]
gene_list_filtered <- sort(gene_list_filtered, decreasing = TRUE)
gsea_go <- gseGO(geneList = gene_list_filtered,
OrgDb = org.Hs.eg.db,
keyType = "ENTREZID",
ont = "BP", pvalueCutoff = 0.05)
dotplot(gsea_go, showCategory = 20, title = "GSEA: GO Biological Processes")
emapplot(pairwise_termsim(gsea_go), showCategory = 10, layout = "kk")
ridgeplot(gsea_go) + labs(title = "GSEA Score Distribution")
gseaplot2(gsea_go, geneSetID = "GO:0002449", title = "Lymphocyte Mediated Immunity")
}, error = function(e) message("GSEA step failed: ", e$message))
🧬 GSEA with Immune Signatures
tryCatch({
msigdb_immune <- msigdbr(species = "Homo sapiens", category = "C7")
msigdb_list <- msigdb_immune[, c("gs_name", "entrez_gene")]
gsea_immune <- GSEA(geneList = gene_list_filtered,
TERM2GENE = msigdb_list,
pvalueCutoff = 0.05)
dotplot(gsea_immune, showCategory = 20) + ggtitle("GSEA: Immune Signatures")
ridgeplot(gsea_immune) + ggtitle("Enrichment Score Distribution")
emapplot(pairwise_termsim(gsea_immune), showCategory = 20, layout = "kk")
}, error = function(e) message("Immune GSEA failed: ", e$message))
💾 Save Results
write.csv(results_df, "DEGs_GSE152418.csv")
write.csv(as.data.frame(go_enrich), "GO_enrichment_results.csv")
write.csv(as.data.frame(kegg_enrich), "KEGG_enrichment_results.csv")