library(Seurat)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(SeuratData)


############################################################
# 01_download_example_data.R
############################################################
# This script downloads two small public 10x PBMC datasets for integration demos.
# (Update URLs if needed; see the 10x Datasets page.)

library(R.utils)

# Example URLs (10x PBMC 3k and 4k filtered matrices)
pbmc3k_url <- "https://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz"
# If the pbmc4k URL changes, visit the 10x Datasets page and copy the filtered matrix link
pbmc4k_url <- "https://cf.10xgenomics.com/samples/cell-exp/2.1.0/pbmc4k/pbmc4k_filtered_gene_bc_matrices.tar.gz"


dir.create("data", showWarnings = FALSE)
setwd("data")

fn <- basename(pbmc4k_url)
if (!file.exists(fn)) download.file(pbmc4k_url, destfile = fn, mode = "wb")
# Extract into its own folder to avoid name clashes
sample_id <- sub("_filtered_gene_bc_matrices.tar.gz$", "", fn)
outdir <- file.path(sample_id)
if (!dir.exists(outdir)) dir.create(outdir)
R.utils::gunzip(fn, destname=sub(".tar.gz$",".tar",fn), overwrite=TRUE, remove=FALSE)
utils::untar(sub(".gz$","", fn), exdir = outdir)

fn <- basename(pbmc3k_url)
if (!file.exists(fn)) download.file(pbmc3k_url, destfile = fn, mode = "wb")
# Extract into its own folder to avoid name clashes
sample_id <- sub("_filtered_gene_bc_matrices.tar.gz$", "", fn)
outdir <- file.path(sample_id)
if (!dir.exists(outdir)) dir.create(outdir)
R.utils::gunzip(fn, destname=sub(".tar.gz$",".tar",fn), overwrite=TRUE, remove=FALSE)
utils::untar(sub(".gz$","", fn), exdir = outdir)

setwd("..")

############################################################
# 02_import_and_qc.R
############################################################
# Read 10x matrices, basic QC, mitochondrial/ribosomal metrics, filtering
mtx <- Read10X(data.dir = "data/pbmc3k/filtered_gene_bc_matrices/hg19/")
pbmc3k <- CreateSeuratObject(counts = mtx, project = "pbmc3k", min.cells = 3, min.features = 200)
# QC metrics
pbmc3k[["percent.mt"]] <- PercentageFeatureSet(pbmc3k, pattern = "^MT-")
#pbmc3k[["percent.ribo"]] <- PercentageFeatureSet(pbmc4k, pattern = "^RPL|^RPS")

mtx <- Read10X(data.dir = "data/pbmc4k/filtered_gene_bc_matrices/GRCh38")
pbmc4k <- CreateSeuratObject(counts = mtx, project = "pbmc4k", min.cells = 3, min.features = 200)
# QC metrics
pbmc4k[["percent.mt"]] <- PercentageFeatureSet(pbmc4k, pattern = "^MT-")
#pbmc4k[["percent.ribo"]] <- PercentageFeatureSet(pbmc4k, pattern = "^RPL|^RPS")

# Quick QC plots (saved to ./figs)
VlnPlot(pbmc3k, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
VlnPlot(pbmc4k, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

pbmc3k
pbmc4k

dir.create("figs", showWarnings = FALSE)
pdf("figs/02_QC_vlnplots.pdf", width=10, height=6)
print(VlnPlot(pbmc3k, features = c("nFeature_RNA","nCount_RNA","percent.mt"), ncol = 3))
print(VlnPlot(pbmc4k, features = c("nFeature_RNA","nCount_RNA","percent.mt"), ncol = 3))
dev.off()

# Save individual objects pre-QC
saveRDS(pbmc3k, file = "pbmc3k_raw.rds")
saveRDS(pbmc4k, file = "pbmc4k_raw.rds")
pbmc3k <- readRDS("pbmc3k_raw.rds")
pbmc4k <- readRDS("pbmc4k_raw.rds")

# QC filtering thresholds (adjust for your data)
pbmc3k <- subset(pbmc3k, subset = nFeature_RNA >= 200 & nFeature_RNA <= 6000 &
                   nCount_RNA >= 500 & percent.mt < 10)
pbmc4k <- subset(pbmc4k, subset = nFeature_RNA >= 200 & nFeature_RNA <= 6000 &
           nCount_RNA >= 500 & percent.mt < 10)
pbmc3k
pbmc4k

############################################################
# 03_doublet_removal.R
############################################################
# Use scDblFinder (Bioconductor). Note: works on SingleCellExperiment; we convert.

library(scDblFinder)
library(SingleCellExperiment)

rm_doublets <- function(seu){
  sce <- as.SingleCellExperiment(seu)
  sce <- scDblFinder(sce)
  seu$doublet <- factor(sce$scDblFinder.class)
  seu <- subset(seu, subset = doublet == "singlet")
  seu
}

pbmc3k <- rm_doublets(pbmc3k)
pbmc4k <- rm_doublets(pbmc4k)

pbmc.combined <- merge(pbmc3k, y = pbmc4k, add.cell.ids = c("3K", "4K"), project = "PBMC7K")

### SCTranscform
#library(sctransform)
#remotes::install_version("matrixStats", version="1.1.0")
# store mitochondrial percentage in object meta data
#pbmc3k <- PercentageFeatureSet(pbmc3k, pattern = "^MT-", col.name = "percent.mt")
#pbmc3k <- SCTransform(pbmc3k, vars.to.regress = "percent.mt", verbose = FALSE)
#pbmc4k <- PercentageFeatureSet(pbmc4k, pattern = "^MT-", col.name = "percent.mt")
#pbmc4k <- SCTransform(pbmc4k, vars.to.regress = "percent.mt", verbose = FALSE)

#saveRDS(pbmc3k, file = "pbmc3k_sct.rds")
#saveRDS(pbmc4k, file = "pbmc4k_sct.rds")
#pbmc3k <- readRDS("pbmc3k_sct.rds")
#pbmc4k <- readRDS("pbmc4k_sct.rds")

# Dimensionality reduction & clustering on integrated data
pbmc.combined <- NormalizeData(pbmc.combined)
pbmc.combined <- FindVariableFeatures(pbmc.combined)
pbmc.combined <- ScaleData(pbmc.combined)
pbmc.combined <- RunPCA(pbmc.combined)

ElbowPlot(pbmc.combined, ndims = 30)

pbmc.combined <- FindNeighbors(pbmc.combined, dims = 1:20)
pbmc.combined <- FindClusters(pbmc.combined, resolution = 0.3, cluster.name = "unintegrated_clusters")
pbmc.combined <- RunUMAP(pbmc.combined, dims = 1:20, reduction.name = "umap.unintegrated")

DimPlot(pbmc.combined , label = TRUE, reduction = "umap.unintegrated", group.by = "unintegrated_clusters")
DimPlot(pbmc.combined , label = TRUE, reduction = "umap.unintegrated", group.by = "orig.ident")

saveRDS(pbmc.combined, file = "pbmc_combined_UMAP.rds")
pbmc.combined <- readRDS("pbmc_combined_UMAP.rds")

pdf("figs/04_unintegrated_umap_by_sample_cluster.pdf", width=9, height=5)
print(DimPlot(pbmc.combined, reduction = "umap.unintegrated", label=TRUE, pt.size=0.3))
dev.off()

pbmc.combined <- IntegrateLayers(
  object = pbmc.combined, method = CCAIntegration,
  orig.reduction = "pca", new.reduction = "integrated.cca",
  verbose = FALSE
)

pbmc.combined <- IntegrateLayers(
  object = pbmc.combined, method = HarmonyIntegration,
  orig.reduction = "pca", new.reduction = "harmony",
  verbose = FALSE
)

pbmc.combined <- FindNeighbors(pbmc.combined, reduction = "integrated.cca", dims = 1:20)
pbmc.combined <- FindClusters(pbmc.combined, resolution = 0.3, cluster.name = "cca_clusters")
pbmc.combined <- RunUMAP(pbmc.combined, reduction = "integrated.cca", dims = 1:20, reduction.name = "umap.cca")
DimPlot(pbmc.combined , label = TRUE, reduction = "umap.cca", group.by = "cca_clusters", label.size = 8) + NoLegend()
DimPlot(pbmc.combined , label = TRUE, reduction = "umap.cca", group.by = "orig.ident")

pbmc.combined <- FindNeighbors(pbmc.combined, reduction = "harmony", dims = 1:20)
pbmc.combined <- FindClusters(pbmc.combined, resolution = 0.3, cluster.name = "harmony_clusters")
pbmc.combined <- RunUMAP(pbmc.combined, reduction = "harmony", dims = 1:20, reduction.name = "umap.harmony")
DimPlot(pbmc.combined , label = TRUE, reduction = "umap.harmony", group.by = "harmony_clusters", label.size = 8) + NoLegend()
DimPlot(pbmc.combined , label = TRUE, reduction = "umap.harmony", group.by = "orig.ident")


############################################################
# 05_markers_and_annotation.R
############################################################
# Find marker genes per cluster; annotate with SingleR + celldex (HPCA)

# find markers for every cluster compared to all remaining cells, report only the positive
# ones
Idents(pbmc.combined) <- "cca_clusters"
markers.pbmc.combined <- FindAllMarkers(pbmc.combined, only.pos = TRUE, logfc.threshold = 0.25)
markers.pbmc.combined %>%
  group_by(cluster) %>%
  dplyr::filter(avg_log2FC > 1)

write_csv(markers.pbmc.combined, "markers_by_cca_cluster.csv")

FeaturePlot(pbmc.combined, features = c("CD14", "CCR7", "IL7R", "CCL5", "CD79A", "KLRF1"), reduction = "umap.cca", ncol = 3)
VlnPlot(pbmc.combined, features = c("CD14", "CCR7", "IL7R", "CCL5", "CD79A", "KLRF1"), ncol = 3)

markers.pbmc.combined %>%
  group_by(cluster) %>%
  dplyr::filter(avg_log2FC > 1) %>%
  slice_head(n = 10) %>%
  ungroup() -> top10
DoHeatmap(pbmc.combined, features = top10$gene) + NoLegend()

# SingleR annotation
library(SingleR)
library(celldex)

ref <- celldex::HumanPrimaryCellAtlasData()
sce <- as.SingleCellExperiment(pbmc.combined)
pr <- SingleR(test = sce, ref = ref, labels = ref$label.main)
pbmc.combined$SingleR_main <- pr$labels
DimPlot(pbmc.combined, group.by = "SingleR_main", reduction = "umap.cca")
DimPlot(pbmc.combined, group.by = "SingleR_main", split.by = "SingleR_main", reduction = "umap.cca")
DimPlot(pbmc.combined, group.by = "SingleR_main", reduction = "umap.cca")

pbmc.combined <- RenameIdents(pbmc.combined, '0' = "Monocyte", '1' = "T_cell", '2' = "T_cell", 
                                             '3' = "T_cell", '4' = "B_cell", '5' = "T_cell", 
                                             '6' = "NK_cell", '7' = "Monocyte", '8' = "Monocyte", 
                                             '9' = "Monocyte", '10' = "Monocyte")
pbmc.combined$celltype <- pbmc.combined@active.ident

DimPlot(pbmc.combined, group.by = "celltype", reduction = "umap.cca", label = T, label.size = 8) + NoLegend()

# Save annotated object and quick plots
saveRDS(pbmc.combined, file = "pbmc_combined_annotated.rds")

############################################################
# 06_trajectory_monocle3.R
############################################################
library(monocle3)
library(SeuratWrappers)

pbmc.combined <-JoinLayers(pbmc.combined)

# converting seurat object to celldataset object for Monocle3
DefaultAssay(pbmc.combined) <- "RNA"
stopifnot("umap.cca" %in% names(pbmc.combined@reductions))
cds.pbmc <- as.cell_data_set(pbmc.combined)

# Assign UMAP coordinates
umap_mat <- Embeddings(pbmc.combined, reduction = "umap.cca")
stopifnot(nrow(umap_mat) == ncol(cds.pbmc))
reducedDims(cds.pbmc)$UMAP <- umap_mat
cds.pbmc <- cluster_cells(cds.pbmc, reduction_method = "UMAP")     # uses reducedDims(cds.pbmc)$UMAP


# get cell metadata
head(colData(cds.pbmc))

# get feature/gene metadata
fData(cds.pbmc)
rownames(fData(cds.pbmc))[1:10]
fData(cds.pbmc)$gene_short_name <- rownames(fData(cds.pbmc))
head(fData(cds.pbmc))

# get counts
head(counts(cds.pbmc))

## Retrieve clustering information from Surat object
# Assign partitions
recreate.partitions <- c(rep(1, length(cds.pbmc@colData@rownames)))
names(recreate.partitions) <- cds.pbmc@colData@rownames
recreate.partitions <- as.factor(recreate.partitions)
recreate.partitions

cds.pbmc@clusters@listData[["UMAP"]][["partitions"]] <- recreate.partitions

# Assign cluster information
list.cluster <- pbmc.combined@active.ident
cds.pbmc@clusters@listData[["UMAP"]][["clusters"]] <- list.cluster

#cds.pbmc@int_colData@listData[["reducedDims"]]@listData[["UMAP"]] <- pbmc.combined@reductions$umap@cell.embeddings

# Plot
cluster.before.traj <- plot_cells(cds.pbmc, color_cells_by = "cluster", label_groups_by_cluster = F, 
                                  group_label_size = 5) + theme(legend.position = "right")
cluster.before.traj

# Learn Trajectory
#cds.pbmc <- learn_graph(cds.pbmc, use_partition = F)
cds.pbmc <- learn_graph(cds.pbmc, use_partition = T)

plot_cells(cds.pbmc, color_cells_by = "celltype", label_groups_by_cluster = F,
           label_branch_points = T, label_roots = T, label_leaves = F,
           group_label_size = 5)

plot_cells(cds.pbmc, color_cells_by = "celltype", label_groups_by_cluster = F,
           label_branch_points = F, label_roots = F, label_leaves = F,
           group_label_size = 5)

# Order cells in Pseudotime
#cds.pbmc <- order_cells(cds.pbmc, reduction_method = "UMAP", root_cells = colnames(cds[, clusters(cds) == 'Ventricle_1']))
cds.pbmc <- order_cells(cds.pbmc, reduction_method = "UMAP")

plot_cells(cds.pbmc, color_cells_by = "pseudotime", label_groups_by_cluster = T,
           label_branch_points = F, label_roots = T, label_leaves = F)

cds.pbmc.pseudotime <- pseudotime(cds.pbmc)
data.pseudo <- as.data.frame(colData(cds.pbmc))

#ggplot(data.pseudo, aes(cds.pbmc.pseudotime, seurat_clusters, fill = seurat_clusters)) + geom_boxplot()
#ggplot(data.pseudo, aes(cds.pbmc.pseudotime, celltype, fill = celltype)) + geom_boxplot()

ggplot(data.pseudo, aes(cds.pbmc.pseudotime, reorder(celltype, cds.pbmc.pseudotime), fill = celltype)) + geom_boxplot()

colData(cds.pbmc)$assigned_pseudotime <- as.numeric(pseudotime(cds.pbmc))
colData(cds.pbmc)$assigned_pseudotime_point <- as.integer(pseudotime(cds.pbmc))

saveRDS(cds.pbmc, "cds_pbmc_pseudotime.rds")
cds.pbmc <- readRDS("cds_pbmc_pseudotime.rds")

# make pseudotime point top 3 marker genes in dotplot
marker_pseudotime <- top_markers(cds.pbmc, group_cells_by="assigned_pseudotime_point", reference_cells=1000, cores=8)

## export all pseudotime point marker gene list
#marker_test_res <- top_markers(cds, group_cells_by="assigned_pseudotime_point", reference_cells=1000, cores=8)
top_markers <- marker_pseudotime %>% filter(fraction_expressing >= 0.10) %>% group_by(cell_group)
write.csv(top_markers,file="marker_gene_for_pseudotime_point.csv")


# Add pseudotime values into the seuratobject
pbmc.combined$pseudotime <- pseudotime(cds.pbmc)
FeaturePlot(pbmc.combined, features = "pseudotime", cols = c("lightgrey", "darkred"))


############################################################
# 07_cellchat_interactions.R
############################################################
# Cell–cell communication analysis using CellChat and Seurat clusters/annotations
library(CellChat)
library(patchwork)

# Use gene expression data and meta info
data.input <- GetAssayData(pbmc.combined, assay = "RNA", slot = "data")
meta <- data.frame(labels = pbmc.combined$celltype, row.names = colnames(pbmc.combined))


cellchat <- createCellChat(object = data.input)
cellchat <- addMeta(cellchat, meta = meta)
cellchat <- setIdent(cellchat, ident.use = "labels")
cellchat@DB <- CellChatDB.human


cellchat <- subsetData(cellchat) # subset signaling genes
cellchat <- identifyOverExpressedGenes(cellchat)
cellchat <- identifyOverExpressedInteractions(cellchat)
cellchat <- projectData(cellchat, PPI.human)


cellchat <- computeCommunProb(cellchat)
cellchat <- filterCommunication(cellchat, min.cells = 10)
cellchat <- computeCommunProbPathway(cellchat)
cellchat <- aggregateNet(cellchat)

netVisual_circle(cellchat@net$count, vertex.weight = as.numeric(table(cellchat@idents)), weight.scale = TRUE, label.edge=FALSE)
netVisual_heatmap(cellchat)

saveRDS(cellchat, file = "pbmc_combined_cellchat.rds")

pdf("figs/07_cellchat_overview.pdf", width=11, height=7)
print(netVisual_circle(cellchat@net$count, vertex.weight = as.numeric(table(cellchat@idents)), weight.scale = TRUE, label.edge=FALSE))
print(netVisual_heatmap(cellchat))
dev.off()


