# Read in EVE results
EVEresTbl <- read_rds("data/runEVE/EVEresTbl.RDS")

# Read in shift gene sets
nShiftedTblComplete <- readRDS("data/TissueAtlasAnalysis/nShiftedTblComplete.RDS") %>%
  as_tibble()

# load CORUM protein-complex data
if( !file.exists("data/proteinComplexs/corumWithRes.RDS")){

  OGtbl <- read_tsv("input-data/from_ortho_pipeline/OGtbl.tsv", col_types = cols())
    
  dir.create("data/proteinComplexs")
  
  
  download.file("http://mips.helmholtz-muenchen.de/corum/download/coreComplexes.txt.zip",
                "data/proteinComplexs/coreComplexes.txt.zip")
  corum <- read_tsv("data/proteinComplexs/coreComplexes.txt.zip")

  # need to convert ensembl gene IDs to NCBI gene IDs
  EG2NCBI <-
    biomaRt::getBM(attributes = c("ensembl_gene_id", "entrezgene_id"),
          mart = biomaRt::useMart("ensembl",dataset = "hsapiens_gene_ensembl")) %>% 
    as_tibble() %>% 
    mutate( entrezgene_id = as.character(entrezgene_id) )
  
  # Assign complex subunits to corresponding orthogroups
  corumWithRes <- 
    corum %>% 
    filter(Organism=="Human") %>% 
    select( ComplexID,ComplexName, subunits=`subunits(Entrez IDs)`) %>% 
    mutate( subunits = strsplit(subunits,";")) %>% unnest() %>% 
    left_join(EG2NCBI, by=c("subunits"="entrezgene_id")) %>% 
    left_join(select(OGtbl, geneID, OG, N0), by=c("ensembl_gene_id"="geneID")) %>% 
    left_join(select(OGtbl, N0, N3) %>% filter(!is.na(N3)) %>% distinct(), by="N0") %>% 
    left_join(select(EVEresTbl, N3,N9,dup_type,sigDir, dupSigDir), by="N3")
  
  
  saveRDS(corumWithRes,"data/proteinComplexs/corumWithRes.RDS")
}
corumWithRes <- readRDS("data/proteinComplexs/corumWithRes.RDS")

Overrepresentation of complexes among duplicates vs single

EVEresTbl %>% 
  select(N3,dup_type) %>% distinct() %>% 
  mutate(isInComplex = N3 %in% corumWithRes$N3) -> tmp

pval <- 
  table(tmp$isInComplex,tmp$dup_type) %>% 
  fisher.test(alternative = "less") %>% 
  with(p.value)

tmp %>% 
  group_by(dup_type) %>% 
  summarise( n=n(), propInComplex=mean(isInComplex), nInComplex=sum(isInComplex)) %>% 
  ggplot() + 
  geom_col(aes(y=propInComplex, x=dup_type),width = 0.25, fill="black") +
  annotate("text",label=sprintf("P = %0.2e",pval),x=1.5,y=0.25) + 
  ylab("Proportion of orthogroups in complexes") +
  theme_classic() + theme(axis.title.x = element_blank())
## `summarise()` ungrouping output (override with `.groups` argument)

Proportion of duplicates per complex

compute_complex_proportions <- function (data, label) {
  data2 <-
    data %>%
    group_by(ComplexName) %>%
    summarise(NoGenes = n(), DupProp = sum(dup_type == "duplicate"), .groups = 'drop') %>%
    mutate(DupProp = DupProp/NoGenes) %>%
    mutate( Data = label) %>%
    mutate(SinProp = 1-DupProp)

  pure_singletons <- sum(data2$SinProp == 1)
  pure_duplicates <- sum(data2$DupProp == 1)

  return (list(pure_singletons, pure_duplicates, data2))
}
data <- 
  corumWithRes %>% 
  drop_na() %>% # drop orthologs not in EVE results
  select(ComplexName,N3,dup_type) %>% 
  distinct() %>% # only keep N3
  group_by(ComplexName) %>% 
  filter( n() > 1) %>% # remove complexes with only one N3
  ungroup()
  

cat("Number of complexes:", length(unique(data$ComplexName)))
## Number of complexes: 1450
cat("Total propotion of duplicates in complexes:", format(sum(data$dup_type == "duplicate")/nrow(data), digits = 2))
## Total propotion of duplicates in complexes: 0.71
# Real data
res <- compute_complex_proportions(data, "Observed")
res[[3]] %>%
  ggplot(aes(x = DupProp)) +
  geom_histogram(aes(), 
                 breaks=seq(0,1,0.05), 
                 col="firebrick", 
                 fill="firebrick", 
                 alpha=.2) + 
  #geom_density() + 
  xlab("Proportion of duplicates in individual complexes") +
  ylab("Number of complexes") +
  theme_bw()

# Randomize!

set.seed(seed=42)

no_rand <- 10000
pure_singletons_rand <- c(res[[1]])
pure_duplicates_rand <- c(res[[2]])
for (i in 2:(no_rand+1)) {
  data_rand <- data
  data_rand$dup_type <- sample(data_rand$dup_type, nrow(data_rand), replace = FALSE)
  res_rand <- compute_complex_proportions(data_rand, "Randomized")
  pure_singletons_rand[i] <- res_rand[[1]]
  pure_duplicates_rand[i] <- res_rand[[2]]
}
p_pure_singletons <- sum(pure_singletons_rand >= res[[1]])/(no_rand+1)
cat ("Pure singleton complexes: ", res[[1]], " (p = ", format(p_pure_singletons,digits = 3), ")", sep = "")
## Pure singleton complexes: 77 (p = 0.0183)
p_pure_duplicates <- sum(pure_duplicates_rand >= res[[2]])/(no_rand+1)
cat ("Pure duplicate complexes: ", res[[2]], " (p = ", format(p_pure_duplicates,digits = 3), ")", sep = "")
## Pure duplicate complexes: 563 (p = 1e-04)
tibble( pure_duplicates_rand) %>% 
  ggplot( ) + geom_histogram(aes(x=pure_duplicates_rand),bins = 100) + 
  geom_vline( data = tibble( x = res[[2]]), mapping = aes(xintercept=x), color="red")

tibble( pure_singletons_rand) %>% 
  ggplot( ) + geom_histogram(aes(x=pure_singletons_rand),bins = 100) + 
  geom_vline( data = tibble( x = res[[1]]), mapping = aes(xintercept=x), color="red")

Overrepresentation of complexes among different shift categories

countsInComplex <- EVEresTbl %>% 
  mutate(isInComplex = ifelse(N3 %in% corumWithRes$N3, TRUE, FALSE)) %>%
  mutate(sigDir = ifelse(dupSigDir == "", "no shift", dupSigDir)) %>% 
  select(N3, sigDir, dup_type, isInComplex) %>% 
  distinct() %>%
  count(sigDir, dup_type, isInComplex) %>%
  group_by(dup_type, isInComplex) %>% 
  mutate(total = sum(n), prop = n / sum(n), 
         updownClass = sub(x = sigDir,"(up|down|both).*","\\1")) %>%
  ungroup()  %>%
  bind_rows(
    EVEresTbl %>% 
      mutate(isInComplex = ifelse(N3 %in% corumWithRes$N3,TRUE,FALSE)) %>% 
      select(N3, isInComplex, dup_type) %>% 
      distinct() %>% 
      count(isInComplex,dup_type) %>%
      group_by(dup_type) %>%
      mutate(total = sum(n), prop = n / sum(n), 
             updownClass = "total", sigDir = "total") %>%
      filter(isInComplex == TRUE) %>%
      ungroup()
  )

# Plot
countsInComplex %>%
  filter(sigDir != "no shift") %>%
  mutate(sigDir = factor(sigDir, levels = c("total", "up", "upup", "down", "downdown", "both")),
         isInComplex = factor(ifelse(isInComplex, "inComplex", "notInComplex"), levels = c("inComplex", "notInComplex"))) %>%
  ggplot(aes(x = sigDir, y = prop,  alpha = isInComplex, fill = sigDir)) + 
  geom_col(position = "dodge") + 
  facet_grid(.~dup_type, scales = "free", space = "free") +
  scale_fill_manual(values = c(both="#993094", down="#298ABD", downdown="#1B4F8B",
                               up="#F62D31", upup="#8B1B1A", "no shift"="grey30", total="black")) +
  scale_alpha_manual(values = c("inComplex"=1,"notInComplex"=0.5)) +
  guides(fill = "none", alpha = "none") +
  ylim(0, 0.25) +
  ylab("Proportion of orthogroups") +
  theme_classic() +
  theme(axis.title.x = element_blank(),
        strip.background = element_blank(),
        strip.text = element_text(size = 8, colour = "black"),
        text = element_text(size = 8, colour = "black"),
        axis.text = element_text(size = 8, colour = "black"),
        axis.ticks.x = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1),
        axis.line = element_line(size = 0.5, colour = "black"))

# ggsave("Figures/PPIbarplot.pdf", height = 3, width = 3.5, units = "in")
ComplexCounts <- corumWithRes %>%
  select(ComplexID, ComplexName, N3, dup_type, dupSigDir) %>%
  distinct() %>%
  filter(!is.na(dup_type)) %>%
  select(ComplexName, dup_type) %>%
  group_by(ComplexName) %>%
  count(dup_type) %>%
  spread(dup_type, n) %>%
  mutate(duplicate = ifelse(is.na(duplicate), 0, duplicate),
         single = ifelse(is.na(single), 0, single))

dupSingleComplexes <- ComplexCounts %>%
  filter(single > 0, duplicate > 0) %>%
  .$ComplexName

singleInComplexWithDup <- corumWithRes %>%
  filter(ComplexName %in% dupSingleComplexes) %>%
  .$N3
fisherTestTable <- tibble()

for(complexSet in c("all", "single and ohnolog")) {
  if(complexSet == "single and ohnolog") {
    complexes <- singleInComplexWithDup
  } else {
    complexes <- corumWithRes$N3
  }
  
  for(dup in c("single", "duplicate")) {
    for(dir in c("up", "upup", "down", "downdown", "both")) {
      
      totalSet <- EVEresTbl %>% 
        mutate(isInComplex = N3 %in% complexes) %>% 
        select(N3, isInComplex, dup_type, dupSigDir) %>% distinct() %>%
        filter(dup_type == dup)
      
      shiftSet <- totalSet %>%
        filter(dupSigDir == dir)
      
      if(nrow(totalSet) > 0 & nrow(shiftSet) > 0) {
        testTable <- tibble(
          "Complex type" = complexSet,
          "Orthogroup type" = ifelse(dup == "single", "singleton", "ohnolog"),
          "Expression shift category" = sub("upup", "up + up", sub("downdown", "down + down", dir)),
          "N total in complex" = sum(totalSet$isInComplex),
          "N total not in complex" = sum(!totalSet$isInComplex),
          "N shifted in complex" = sum(shiftSet$isInComplex),
          "N shifted not in complex" = sum(!shiftSet$isInComplex)
        )
        
        testTable$`P-value` = round(fisher.test(matrix(c(
          testTable$`N shifted in complex`,
          testTable$`N shifted not in complex`,
          testTable$`N total in complex`,
          testTable$`N total not in complex`
        ), ncol = 2), alternative = "greater")$p.value,4)
        
        fisherTestTable <- fisherTestTable %>%
          bind_rows(testTable)
      }
    }
  }
}

fisherTestTable
## # A tibble: 14 x 8
##    `Complex type` `Orthogroup typ… `Expression shi… `N total in com…
##    <chr>          <chr>            <chr>                       <int>
##  1 all            singleton        up                            633
##  2 all            singleton        down                          633
##  3 all            ohnolog          up                           1403
##  4 all            ohnolog          up + up                      1403
##  5 all            ohnolog          down                         1403
##  6 all            ohnolog          down + down                  1403
##  7 all            ohnolog          both                         1403
##  8 single and oh… singleton        up                            524
##  9 single and oh… singleton        down                          524
## 10 single and oh… ohnolog          up                            943
## 11 single and oh… ohnolog          up + up                       943
## 12 single and oh… ohnolog          down                          943
## 13 single and oh… ohnolog          down + down                   943
## 14 single and oh… ohnolog          both                          943
## # … with 4 more variables: `N total not in complex` <int>, `N shifted in
## #   complex` <int>, `N shifted not in complex` <int>, `P-value` <dbl>
data <- EVEresTbl %>% 
        mutate(isInComplex = N3 %in% corumWithRes$N3) %>% 
        select(N3, isInComplex, dup_type, dupSigDir) %>% distinct()

table(data$isInComplex, data$dup_type)
##        
##         duplicate single
##   FALSE      4996   2832
##   TRUE       1403    633
  #       duplicate single
  # FALSE      5226   2832
  # TRUE       1463    633
fisher.test(matrix(c(1463, 5226, 633, 2832), ncol = 2), alternative = "greater")
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(1463, 5226, 633, 2832), ncol = 2)
## p-value = 1.034e-05
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  1.146578      Inf
## sample estimates:
## odds ratio 
##   1.252434
# p-value = 1.034e-05