# load the expression data
combExprMat <- readRDS("data/BSNormalize/combExprMat.RDS")

# load EVE results
EVEresTbl <- readRDS("data/runEVE/EVEresTbl.RDS")

# get kegg pathway mapping (for Eluc genes)
gene.pathway <- getGeneKEGGLinks(species.KEGG = "els")

# Load dN/dS results
dndsRes <- readRDS("input-data/dNdS_aBSREL/dndsRes.RDS")

dndsTbl <- 
  dndsRes %>% 
  # only complete duplicates
  filter(N3 %in% filter(EVEresTbl,dup_type == "duplicate", !N3partial)$N3) %>% 
  filter(N3 != "OG1v0014281.16") %>% # this has only one N9..
  filter( !is.na(N9)) %>% 
  filter( baseline_omega < 2) %>%
  select( N3, N9, baseline_omega, uncorrected_P) %>% 
  left_join(select(EVEresTbl %>% mutate( N9partial = is.na(Okis) | is.na(Omyk) | is.na(Salp) | is.na(Ssal)),
                   N9partial, N3partial,sigDir,dupSigDir,N9), by="N9")


dir.create("data/KEGGanalysis",showWarnings = F)

KEGG analysis

Enrichment for up/down regulated in duplicates/singletons. The background was selected as the total number of tested genes for each duplication type (duplicates/singletons). Includes the partial clades. The Eluc gene was used to represent each OG.

Singleton KEGG enrichment

singleAllEluc <- 
  EVEresTbl %>%
  filter( dup_type=="single") %>% 
  group_by(sigDir) %>% 
  do( Eluc=.$Eluc) %>% 
  with( set_names(Eluc,sub("^$","none",sigDir)))


keggResSingle <- 
  kegga(de = singleAllEluc,universe = unlist(singleAllEluc), species.KEGG = "els") %>% 
  rownames_to_column("KEGG_ID") %>% select(-none,-P.none)

DT::datatable(keggResSingle,rownames = F)
write_tsv(keggResSingle, "data/KEGGanalysis/keggResSingle.tsv")

Duplicate KEGG enrichment

dupAllEluc <- 
  EVEresTbl %>%
  filter( dup_type=="duplicate" ) %>% 
  select(N3,Eluc,dupSigDir) %>% distinct() %>%
  group_by(dupSigDir) %>% 
  do( Eluc=.$Eluc) %>% 
  with( set_names(Eluc,sub("^$","none",dupSigDir)))


keggResDuplicate <- 
  kegga(de = dupAllEluc,universe = unlist(dupAllEluc), species.KEGG = "els") %>% 
  rownames_to_column("KEGG_ID") %>% select(-none,-P.none)

DT::datatable(keggResDuplicate,rownames = F)
write_tsv(keggResDuplicate,"data/KEGGanalysis/keggResDuplicate.tsv")

Save a caombined table with EVE results and KEGG results

EVEwithKeggResDuplicate <- 
  EVEresTbl %>%
  filter(dup_type == "duplicate") %>%
  arrange(pval) %>%
  transmute(N3, Eluc,
            dupSigDir = ifelse(dupSigDir == "", "none", dupSigDir),
            Ssal = paste(Ssal, N3partial, sigDir, pval, sep = ":")) %>%
  group_by(N3) %>%
  mutate(dup = paste0("Ssal", row_number())) %>%
  spread(dup, Ssal) %>%
  separate(Ssal1, into = c("Ssal1.ID", "Ssal1.partial", "Ssal1.sigDir", "Ssal1.pval"), sep = ":") %>%
  separate(Ssal2, into = c("Ssal2.ID", "Ssal2.partial", "Ssal2.sigDir", "Ssal2.pval"), sep = ":") %>%
  mutate(Ssal1.pval = as.double(Ssal1.pval), Ssal2.pval = as.double(Ssal2.pval)) %>%
  ungroup() %>%
  filter(dupSigDir != "none") %>%
  arrange(dupSigDir) %>%
  left_join(gene.pathway,
             by = c("Eluc" = "GeneID")) %>%
  left_join(keggResDuplicate %>%
              group_by(Pathway) %>%
              gather(dupSigDir, Nshift, both:upup) %>%
              group_by(Pathway, dupSigDir) %>%
              gather(dupSigDir2, PathwayPval, P.both:P.upup) %>%
              filter(dupSigDir == sub("P.", "", dupSigDir2)) %>%
              dplyr::select(PathwayID = KEGG_ID, Pathway, N, dupSigDir, Nshift, PathwayPval),
            by = c("PathwayID", "dupSigDir"))

write_tsv(EVEwithKeggResDuplicate, path = "data/KEGGanalysis/EVEwithKeggResDuplicate.tsv")

Duplicates KEGG enrichment plot

dup.data <- keggResDuplicate %>%
  select(Pathway, N, both:upup) %>%
  gather(shift, count, both:upup) %>%
  mutate(prop = count / N) %>%
  left_join(
    keggResDuplicate %>%
      select(Pathway, P.both:P.upup) %>%
      gather(shift, pval, P.both:P.upup) %>%
      mutate(shift = sub("P\\.", "", shift)),
    by = c("Pathway", "shift")
  )

sgl.data <- keggResSingle %>%
  select(Pathway, N, down, up) %>%
  gather(shift, count, down, up) %>%
  mutate(prop = count / N)  %>%
  left_join(
    keggResDuplicate %>%
      select(Pathway, P.down, P.up) %>%
      gather(shift, pval, P.down, P.up) %>%
      mutate(shift = sub("P\\.", "", shift)),
    by = c("Pathway", "shift")
  )

cmb.data <- bind_rows(
  mutate(dup.data, gene_type = "duplicate"),
  mutate(sgl.data, gene_type = "single")
)

PW.clust <- dup.data %>%
  filter(pval < 0.05) %>%
  select(Pathway, shift, pval) %>%
  spread(shift, pval) %>%
  replace_na(as.list(setNames(rep(1, ncol(.)), colnames(.)))) %>%
  data.frame(row.names = 1) %>%
  dist() %>% 
  hclust()
PW.order <- PW.clust$labels[PW.clust$order]
# Move 'metabolic pathways'
# PW.order <- PW.order[c(2:18,1,19:27)]
# check that the Ribosome has an extremely low P-value
stopifnot(filter(dup.data, pval<1e-7)$Pathway == "Ribosome")
riboPval <- signif(min(dup.data$pval),2)

dup.data %>%
  filter(pval < 0.05) %>%
  mutate(shift = factor(shift, levels = c("up", "upup", "down", "downdown", "both")),
         Pathway = factor(Pathway, levels = PW.order)) %>%
  mutate(pval = pmax(pval, 1e-6)) %>%
  ggplot(aes(x = Pathway, y = -log10(pval), size = prop, colour = shift)) +
  geom_point() +
  scale_colour_manual(values = c(both="#993094", down="#298ABD", downdown="#1B4F8B",
                                 up="#F62D31", upup="#8B1B1A")) +
  scale_size_continuous(name = "proportion\nof genes\nin pathway", range = c(0.2, 6), breaks = c(0.1, 0.25, 0.5)) +
  scale_y_continuous(limits = -log10(c(0.05, 1e-6)), 
                     breaks = -log10(c(0.05, 0.01, 1e-3, 1e-4, 1e-5, 1e-6)), 
                     labels = c(0.05, 0.01, 1e-3, 1e-4, 1e-5, riboPval)) +
  guides(colour = "none") +
  labs(y = "KEGG enrichment p-value") +
  coord_flip() +
  theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"),
        axis.text = element_text(size = 8, colour = "black"),
        axis.ticks =  element_line(size = 0.5, colour = "black"),
        panel.grid.minor.x = element_blank(),
        axis.title.y = element_blank())

Duplicates KEGG enrichment plot (alternative version)

# check that the Ribosome has an extremely low P-value
stopifnot(filter(dup.data, pval<1e-7)$Pathway == "Ribosome")
riboPval <- signif(min(dup.data$pval),2)

keggPlotData <-
  keggResDuplicate %>%
  rename_at(vars(both:upup), ~ paste0("nShifted.",.)) %>% 
  pivot_longer(cols=c(-KEGG_ID,-Pathway,-N), names_to = c(".value", "shift"), names_sep = "\\.") %>% 
  mutate(prop = nShifted / N) %>% 
  mutate(propLabel = paste0(nShifted, "/", N)) %>% 
  filter( P<0.05 ) %>% 
  # "Metabolic pathways" shows up twice, so remove one of them:
  filter( !(shift=="both" & Pathway=="Metabolic pathways")) %>% 
  mutate( shift = factor(shift, levels=c("downdown", "down", "both", "upup", "up"))) %>% 
  arrange( desc(shift), desc(P)) %>% 
  mutate( Pathway = forcats::fct_inorder(Pathway)) 

keggPlotData %>% 
  mutate(P = ifelse(P < 1e-6, 1e-6, P)) %>%
  ggplot(aes(x = Pathway, y = -log10(P), size = prop, color = shift)) +
  geom_point() + 
  geom_text(aes(label=propLabel, y = -log10(P) + (prop/2 + 0.2)*ifelse(P<1e-5,-1,1),
                hjust=ifelse(P<1e-5,1,0)), color="black", size=3) + 
  scale_colour_manual(values = c(both="#993094", down="#298ABD", downdown="#1B4F8B",
                                 up="#F62D31", upup="#8B1B1A")) +
  scale_size_continuous(name = "proportion\nof genes\nin pathway", range = c(0.2, 6), breaks = c(0.1, 0.25, 0.5)) +
  scale_y_continuous(limits = -log10(c(0.05, 1e-6)), 
                     breaks = -log10(c(0.05, 0.01, 1e-3, 1e-4, 1e-5, 1e-6)), 
                     labels = c(0.05, 0.01, 1e-3, 1e-4, 1e-5, riboPval)) +
  guides(colour = "none") +
  labs(y = "KEGG enrichment p-value") +
  coord_flip() +
  theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"),
        axis.text = element_text(size = 8, colour = "black"),
        axis.ticks =  element_line(size = 0.5, colour = "black"),
        panel.grid.minor.x = element_blank(),
        axis.title.y = element_blank())

Duplicate expression asymmetry per KEGG pathway

Here the duplicate divergence asymmetry is a measure of how different the duplicates are expressed, e.g. low if both duplicates have the same expression level, and high if one duplicate has higher expression than the other.

salCols = !(colnames(combExprMat) %in% c("Drer","Olat","Eluc"))


EVEresTbl %>%
  filter(!N3partial,dup_type=="duplicate") %>% 
  group_by( Eluc, dupSigDir ) %>% 
  summarise( N9_1 = N9[1], N9_2=N9[2]) %>% 
  ungroup() %>% 
  mutate( divergenceAsymmentry = abs(rowMeans( combExprMat[N9_1,salCols]-
                                               combExprMat[N9_2,salCols]))) %>% 
  inner_join(gene.pathway,by = c("Eluc"="GeneID")) %>% 
  right_join( select(keggPlotData, PathwayID = KEGG_ID, Pathway, shift), by="PathwayID") %>% 
  
  # Filter only duplicates with shift in same direction KEGG enrichment was performed
  filter( dupSigDir == shift) %>%  
  
  ggplot( aes(x=Pathway,y=divergenceAsymmentry,fill=shift)) +
  geom_boxplot() + 
  scale_fill_manual(values = c(both="#993094", down="#298ABD", downdown="#1B4F8B",
                                 up="#F62D31", upup="#8B1B1A"), guide = F) +
  ylab("Expression asymmetry") +
  theme_classic() +
  coord_flip() +
  # theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"),
        axis.text = element_text(size = 8, colour = "black"),
        axis.ticks =  element_line(size = 0.5, colour = "black"),
        panel.grid.minor.x = element_blank(),
        axis.title.y = element_blank())

Expression asymmetry selected pathways

Note: unlike above, this plot includes all duplicates in the pathway, not only the ones that have given shift category.

keggPaths = c("path:els03010","path:els00190","path:els03008")


EVEresTbl %>%
  filter(!N3partial,dup_type=="duplicate") %>% 
  group_by( Eluc, dupSigDir ) %>% 
  summarise( N9_1 = N9[1], N9_2=N9[2]) %>% 
  ungroup() %>% 
  mutate( divergenceAsymmentry = abs(rowMeans( combExprMat[N9_1,salCols]-
                                               combExprMat[N9_2,salCols]))) %>% 
  inner_join(gene.pathway,by = c("Eluc"="GeneID")) %>% 
  right_join( select(keggPlotData, PathwayID = KEGG_ID, Pathway, shift), by="PathwayID") %>% 
  
  # Filter only duplicates with shift in same direction KEGG enrichment was performed
  # filter( dupSigDir == shift) %>%  
  
  filter(PathwayID %in% keggPaths) %>% 
  
  ggplot( aes(x=Pathway,y=divergenceAsymmentry,fill=shift)) +
  geom_boxplot() + 
  scale_fill_manual(values = c(both="#993094", down="#298ABD", downdown="#1B4F8B",
                                 up="#F62D31", upup="#8B1B1A"), guide = F) +
  ylab("Expression asymmetry") +
  theme_classic() +
  coord_flip() +
  # theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"),
        axis.text = element_text(size = 8, colour = "black"),
        axis.ticks =  element_line(size = 0.5, colour = "black"),
        panel.grid.minor.x = element_blank(),
        axis.title.y = element_blank())

Duplicate dN/dS asymmetry per kegg pathway

dndsTbl %>% 
  rename( dnds = baseline_omega ) %>% 
  group_by( N3 ) %>% 
  summarise( dndsAsymmetry = abs(dnds[1] - dnds[2])) %>% 
  inner_join(select(EVEresTbl,Eluc,N3,dupSigDir) %>% unique(), by="N3") %>% 
  inner_join(gene.pathway,by = c("Eluc"="GeneID")) %>% 
  right_join( select(keggPlotData, PathwayID = KEGG_ID, Pathway, shift), by="PathwayID") %>% 
  
  # Filter only duplicates with shift in same direction KEGG enrichment was performed
  filter( dupSigDir == shift) %>%  
  
  ggplot( aes(x=Pathway,y=dndsAsymmetry,fill=shift)) +
  geom_boxplot() + 
  scale_fill_manual(values = c(both="#993094", down="#298ABD", downdown="#1B4F8B",
                                 up="#F62D31", upup="#8B1B1A"), guide = F) +
  ylab("dN/dS asymmetry") +
  theme_classic() +
  coord_flip() +
  # theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"),
        axis.text = element_text(size = 8, colour = "black"),
        axis.ticks =  element_line(size = 0.5, colour = "black"),
        panel.grid.minor.x = element_blank(),
        axis.title.y = element_blank())
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

pathview plots

# Create shift size values
# Shift is the difference between the mean outgroup and mean salmonid expression for each gene
# Expression values are log2(TPM+0.01)
outgroup_cols <- colnames(combExprMat) %in% c("Drer","Olat","Eluc")
shiftSize <- tibble(
  N9 = rownames(combExprMat),
  outgroup = rowMeans(combExprMat[ , outgroup_cols],na.rm = T),
  salmonids = rowMeans(combExprMat[ , !outgroup_cols],na.rm = T)
) %>% 
  mutate( shift = salmonids - outgroup) %>% 
  filter( N9 %in% EVEresTbl$N9)

# Create table of values using Eluc as identifier
ElucShiftTable <- EVEresTbl %>%
    filter(dup_type == "duplicate") %>%
    left_join(shiftSize, by = "N9") %>%
    arrange(N3, desc(pval)) %>%
    select(Eluc, shift) %>% 
    group_by(Eluc) %>%
    mutate(dup = paste0("dup", 1:2)) %>%
    spread(dup, shift) %>%
    data.frame(row.names = 1)

# pathways
#selectedPathways <- c("00190","03008","04621","04146","04142","04130","00630","00350","03010")
selectedPathways <- c("00190","03008","03010")


# Plot Eluc pathways with expression shift values
for(pw in selectedPathways) {
  suppressMessages(
    pathview(kegg.dir = keggPathwayDir,
             gene.data = ElucShiftTable,
             gene.idtype = "entrez",
             pathway.id = pw, 
             map.symbol = F,
             species = "els",
             node.sum = "mean", 
             limit = list(gene = c(-4, 4), cpd = 1),
             low = "dodgerblue", high = "firebrick",
             out.suffix = "dupExprsShiftMeans"))
  filename = paste0("els",pw,".dupExprsShiftMeans.multi.png")
  file.rename(filename, file.path(pathviewOutPath,filename))
  cat(paste0("\n![",filename,"](",file.path(pathviewOutPath,filename),")\n\n"))
}
els00190.dupExprsShiftMeans.multi.png

els00190.dupExprsShiftMeans.multi.png

els03008.dupExprsShiftMeans.multi.png

els03008.dupExprsShiftMeans.multi.png

els03010.dupExprsShiftMeans.multi.png

els03010.dupExprsShiftMeans.multi.png