# Read in EVE results
EVEresTbl <- read_rds("data/runEVE/EVEresTbl.RDS")
# Read in shift gene sets
nShiftedTblComplete <- readRDS("data/TissueAtlasAnalysis/nShiftedTblComplete.RDS") %>%
as_tibble()
# load CORUM protein-complex data
if( !file.exists("data/proteinComplexs/corumWithRes.RDS")){
OGtbl <- read_tsv("input-data/from_ortho_pipeline/OGtbl.tsv", col_types = cols())
dir.create("data/proteinComplexs")
download.file("http://mips.helmholtz-muenchen.de/corum/download/coreComplexes.txt.zip",
"data/proteinComplexs/coreComplexes.txt.zip")
corum <- read_tsv("data/proteinComplexs/coreComplexes.txt.zip")
# need to convert ensembl gene IDs to NCBI gene IDs
EG2NCBI <-
biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene_id"),
mart = biomaRt::useMart("ensembl",dataset = "hsapiens_gene_ensembl")) %>%
as_tibble() %>%
mutate( entrezgene_id = as.character(entrezgene_id) )
# Assign complex subunits to corresponding orthogroups
corumWithRes <-
corum %>%
filter(Organism=="Human") %>%
select( ComplexID,ComplexName, subunits=`subunits(Entrez IDs)`) %>%
mutate( subunits = strsplit(subunits,";")) %>% unnest() %>%
left_join(EG2NCBI, by=c("subunits"="entrezgene_id")) %>%
left_join(select(OGtbl, geneID, OG, N0), by=c("ensembl_gene_id"="geneID")) %>%
left_join(select(OGtbl, N0, N3) %>% filter(!is.na(N3)) %>% distinct(), by="N0") %>%
left_join(select(EVEresTbl, N3,N9,dup_type,sigDir, dupSigDir), by="N3")
saveRDS(corumWithRes,"data/proteinComplexs/corumWithRes.RDS")
}
corumWithRes <- readRDS("data/proteinComplexs/corumWithRes.RDS")
Overrepresentation of complexes among duplicates vs single
EVEresTbl %>%
select(N3,dup_type) %>% distinct() %>%
mutate(isInComplex = N3 %in% corumWithRes$N3) -> tmp
pval <-
table(tmp$isInComplex,tmp$dup_type) %>%
fisher.test(alternative = "less") %>%
with(p.value)
tmp %>%
group_by(dup_type) %>%
summarise( n=n(), propInComplex=mean(isInComplex), nInComplex=sum(isInComplex)) %>%
ggplot() +
geom_col(aes(y=propInComplex, x=dup_type),width = 0.25, fill="black") +
annotate("text",label=sprintf("P = %0.2e",pval),x=1.5,y=0.25) +
ylab("Proportion of orthogroups in complexes") +
theme_classic() + theme(axis.title.x = element_blank())
## `summarise()` ungrouping output (override with `.groups` argument)
Proportion of duplicates per complex
compute_complex_proportions <- function (data, label) {
data2 <-
data %>%
group_by(ComplexName) %>%
summarise(NoGenes = n(), DupProp = sum(dup_type == "duplicate"), .groups = 'drop') %>%
mutate(DupProp = DupProp/NoGenes) %>%
mutate( Data = label) %>%
mutate(SinProp = 1-DupProp)
pure_singletons <- sum(data2$SinProp == 1)
pure_duplicates <- sum(data2$DupProp == 1)
return (list(pure_singletons, pure_duplicates, data2))
}
data <-
corumWithRes %>%
drop_na() %>% # drop orthologs not in EVE results
select(ComplexName,N3,dup_type) %>%
distinct() %>% # only keep N3
group_by(ComplexName) %>%
filter( n() > 1) %>% # remove complexes with only one N3
ungroup()
cat("Number of complexes:", length(unique(data$ComplexName)))
## Number of complexes: 1450
cat("Total propotion of duplicates in complexes:", format(sum(data$dup_type == "duplicate")/nrow(data), digits = 2))
## Total propotion of duplicates in complexes: 0.71
# Real data
res <- compute_complex_proportions(data, "Observed")
res[[3]] %>%
ggplot(aes(x = DupProp)) +
geom_histogram(aes(),
breaks=seq(0,1,0.05),
col="firebrick",
fill="firebrick",
alpha=.2) +
#geom_density() +
xlab("Proportion of duplicates in individual complexes") +
ylab("Number of complexes") +
theme_bw()
# Randomize!
set.seed(seed=42)
no_rand <- 10000
pure_singletons_rand <- c(res[[1]])
pure_duplicates_rand <- c(res[[2]])
for (i in 2:(no_rand+1)) {
data_rand <- data
data_rand$dup_type <- sample(data_rand$dup_type, nrow(data_rand), replace = FALSE)
res_rand <- compute_complex_proportions(data_rand, "Randomized")
pure_singletons_rand[i] <- res_rand[[1]]
pure_duplicates_rand[i] <- res_rand[[2]]
}
p_pure_singletons <- sum(pure_singletons_rand >= res[[1]])/(no_rand+1)
cat ("Pure singleton complexes: ", res[[1]], " (p = ", format(p_pure_singletons,digits = 3), ")", sep = "")
## Pure singleton complexes: 77 (p = 0.0183)
p_pure_duplicates <- sum(pure_duplicates_rand >= res[[2]])/(no_rand+1)
cat ("Pure duplicate complexes: ", res[[2]], " (p = ", format(p_pure_duplicates,digits = 3), ")", sep = "")
## Pure duplicate complexes: 563 (p = 1e-04)
tibble( pure_duplicates_rand) %>%
ggplot( ) + geom_histogram(aes(x=pure_duplicates_rand),bins = 100) +
geom_vline( data = tibble( x = res[[2]]), mapping = aes(xintercept=x), color="red")
tibble( pure_singletons_rand) %>%
ggplot( ) + geom_histogram(aes(x=pure_singletons_rand),bins = 100) +
geom_vline( data = tibble( x = res[[1]]), mapping = aes(xintercept=x), color="red")
Overrepresentation of complexes among different shift categories
countsInComplex <- EVEresTbl %>%
mutate(isInComplex = ifelse(N3 %in% corumWithRes$N3, TRUE, FALSE)) %>%
mutate(sigDir = ifelse(dupSigDir == "", "no shift", dupSigDir)) %>%
select(N3, sigDir, dup_type, isInComplex) %>%
distinct() %>%
count(sigDir, dup_type, isInComplex) %>%
group_by(dup_type, isInComplex) %>%
mutate(total = sum(n), prop = n / sum(n),
updownClass = sub(x = sigDir,"(up|down|both).*","\\1")) %>%
ungroup() %>%
bind_rows(
EVEresTbl %>%
mutate(isInComplex = ifelse(N3 %in% corumWithRes$N3,TRUE,FALSE)) %>%
select(N3, isInComplex, dup_type) %>%
distinct() %>%
count(isInComplex,dup_type) %>%
group_by(dup_type) %>%
mutate(total = sum(n), prop = n / sum(n),
updownClass = "total", sigDir = "total") %>%
filter(isInComplex == TRUE) %>%
ungroup()
)
# Plot
countsInComplex %>%
filter(sigDir != "no shift") %>%
mutate(sigDir = factor(sigDir, levels = c("total", "up", "upup", "down", "downdown", "both")),
isInComplex = factor(ifelse(isInComplex, "inComplex", "notInComplex"), levels = c("inComplex", "notInComplex"))) %>%
ggplot(aes(x = sigDir, y = prop, alpha = isInComplex, fill = sigDir)) +
geom_col(position = "dodge") +
facet_grid(.~dup_type, scales = "free", space = "free") +
scale_fill_manual(values = c(both="#993094", down="#298ABD", downdown="#1B4F8B",
up="#F62D31", upup="#8B1B1A", "no shift"="grey30", total="black")) +
scale_alpha_manual(values = c("inComplex"=1,"notInComplex"=0.5)) +
guides(fill = "none", alpha = "none") +
ylim(0, 0.25) +
ylab("Proportion of orthogroups") +
theme_classic() +
theme(axis.title.x = element_blank(),
strip.background = element_blank(),
strip.text = element_text(size = 8, colour = "black"),
text = element_text(size = 8, colour = "black"),
axis.text = element_text(size = 8, colour = "black"),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1),
axis.line = element_line(size = 0.5, colour = "black"))
# ggsave("Figures/PPIbarplot.pdf", height = 3, width = 3.5, units = "in")
ComplexCounts <- corumWithRes %>%
select(ComplexID, ComplexName, N3, dup_type, dupSigDir) %>%
distinct() %>%
filter(!is.na(dup_type)) %>%
select(ComplexName, dup_type) %>%
group_by(ComplexName) %>%
count(dup_type) %>%
spread(dup_type, n) %>%
mutate(duplicate = ifelse(is.na(duplicate), 0, duplicate),
single = ifelse(is.na(single), 0, single))
dupSingleComplexes <- ComplexCounts %>%
filter(single > 0, duplicate > 0) %>%
.$ComplexName
singleInComplexWithDup <- corumWithRes %>%
filter(ComplexName %in% dupSingleComplexes) %>%
.$N3
fisherTestTable <- tibble()
for(complexSet in c("all", "single and ohnolog")) {
if(complexSet == "single and ohnolog") {
complexes <- singleInComplexWithDup
} else {
complexes <- corumWithRes$N3
}
for(dup in c("single", "duplicate")) {
for(dir in c("up", "upup", "down", "downdown", "both")) {
totalSet <- EVEresTbl %>%
mutate(isInComplex = N3 %in% complexes) %>%
select(N3, isInComplex, dup_type, dupSigDir) %>% distinct() %>%
filter(dup_type == dup)
shiftSet <- totalSet %>%
filter(dupSigDir == dir)
if(nrow(totalSet) > 0 & nrow(shiftSet) > 0) {
testTable <- tibble(
"Complex type" = complexSet,
"Orthogroup type" = ifelse(dup == "single", "singleton", "ohnolog"),
"Expression shift category" = sub("upup", "up + up", sub("downdown", "down + down", dir)),
"N total in complex" = sum(totalSet$isInComplex),
"N total not in complex" = sum(!totalSet$isInComplex),
"N shifted in complex" = sum(shiftSet$isInComplex),
"N shifted not in complex" = sum(!shiftSet$isInComplex)
)
testTable$`P-value` = round(fisher.test(matrix(c(
testTable$`N shifted in complex`,
testTable$`N shifted not in complex`,
testTable$`N total in complex`,
testTable$`N total not in complex`
), ncol = 2), alternative = "greater")$p.value,4)
fisherTestTable <- fisherTestTable %>%
bind_rows(testTable)
}
}
}
}
fisherTestTable
## # A tibble: 14 x 8
## `Complex type` `Orthogroup typ… `Expression shi… `N total in com…
## <chr> <chr> <chr> <int>
## 1 all singleton up 633
## 2 all singleton down 633
## 3 all ohnolog up 1403
## 4 all ohnolog up + up 1403
## 5 all ohnolog down 1403
## 6 all ohnolog down + down 1403
## 7 all ohnolog both 1403
## 8 single and oh… singleton up 524
## 9 single and oh… singleton down 524
## 10 single and oh… ohnolog up 943
## 11 single and oh… ohnolog up + up 943
## 12 single and oh… ohnolog down 943
## 13 single and oh… ohnolog down + down 943
## 14 single and oh… ohnolog both 943
## # … with 4 more variables: `N total not in complex` <int>, `N shifted in
## # complex` <int>, `N shifted not in complex` <int>, `P-value` <dbl>
data <- EVEresTbl %>%
mutate(isInComplex = N3 %in% corumWithRes$N3) %>%
select(N3, isInComplex, dup_type, dupSigDir) %>% distinct()
table(data$isInComplex, data$dup_type)
##
## duplicate single
## FALSE 4996 2832
## TRUE 1403 633
# duplicate single
# FALSE 5226 2832
# TRUE 1463 633
fisher.test(matrix(c(1463, 5226, 633, 2832), ncol = 2), alternative = "greater")
##
## Fisher's Exact Test for Count Data
##
## data: matrix(c(1463, 5226, 633, 2832), ncol = 2)
## p-value = 1.034e-05
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 1.146578 Inf
## sample estimates:
## odds ratio
## 1.252434
# p-value = 1.034e-05