exprsMat <- readRDS("input-data/RNAseq/exprsMat.RDS")
N9geneIDtbl <- readRDS("data/orthoGroupFiltering/N9geneIDtbl.RDS")

species_names <- c("Drer"="Zebrafish", 
                   "Olat"="Medaka",
                   "Eluc"="Pike",
                   "Ssal"="Atlantic salmon",
                   "Salp"="Arctic char",
                   "Omyk"="Rainbow trout",
                   "Okis"="Coho salmon")

normalisation procedure

The TPM expression matrices were first normalised within each species using the TMM method in edgeR. Then, using the subset of genes with exactly one ortholog per species, the mean of the normalised expression was calculated per species, giving a mean-expression matrix with one column per species. The TMM normalisation method was then applied to this matrix to give the a normalization factor per species, which was then applied to the within-species normalised matrix.

#### within-species normalisation ####


# Apply normalization factors to expression matrix
applyNormFactorsMat <- function (m, normFactors) {
  for (i in 1:ncol(m)) {
    m[ ,i] <- m[ ,i] / normFactors[i]
  }
  return(m)
}

lapply( exprsMat, function(m){
  normFactors <- calcNormFactors.default(m,method="TMM", lib.size = rep(1e6,ncol(m)))
  applyNormFactorsMat(m,normFactors)
}) -> exprsMatWSN


#### between species normalisation ####


spcs <- names(exprsMatWSN)
spcs <- setNames(spcs,spcs)

# get gene ID table for 1:1 OGs
N9geneIDtbl11 <- filter(N9geneIDtbl, dup_type=="single") %>% na.omit()

# generate combined expression table of species means
sapply(spcs,function(spc){
  rowMeans(exprsMatWSN[[spc]][N9geneIDtbl11[[spc]],])
}) -> spcMean11
rownames(spcMean11) <- N9geneIDtbl11$N3

# calculate between species normalisation factors
BSN_factors <- 
  calcNormFactors.default(spcMean11,method="TMM", lib.size = rep(1e6,ncol(spcMean11))) %>% 
  setNames(colnames(spcMean11))

# apply to expression matrix and log transform
lapply(spcs, function(spc){
  exprsMatWSN[[spc]] / BSN_factors[spc]
}) -> exprsMatBSN

exprsLogMatBSN <- lapply(exprsMatBSN, function(m) log2(0.01 + m))


#### Generate combined expression matrix ####

lapply(spcs,function(spc){
  m <- matrix(NA_real_,nrow=nrow(N9geneIDtbl),ncol=ncol(exprsLogMatBSN[[spc]]))
  colnames(m) <- rep(spc,ncol(m))
  idx <- !is.na(N9geneIDtbl[[spc]])
  m[idx] <- exprsLogMatBSN[[spc]][N9geneIDtbl[[spc]][idx], ]
  m
}) %>% 
  Reduce(f=cbind) -> combExprMat

rownames(combExprMat) <- N9geneIDtbl$N9 # use N9 as rowname

saveRDS(combExprMat,file = "data/BSNormalize/combExprMat.RDS")

Combined expression normalisation plot

species.levels <- c("Drer", "Olat", "Eluc", "Ssal", "Salp", "Omyk", "Okis")

# Number of genes with TPM > cutoff
exprs.cutoffs <- c(0, 1, 10, 100)
plot.nGenes <- data.frame(
  species = factor(rep(names(exprsMat), length(exprs.cutoffs[])), levels = rev(species.levels)),
  TPM_cutoff = paste(">", factor(rep(exprs.cutoffs[], each = length(exprsMat)))),
  ngenes = lapply(exprs.cutoffs, function (i) {
             lapply(exprsMat, function (x) {
               sum(rowMeans(x[, -1]) > i)
             }) %>% unlist()
           }) %>% unlist()
  ) %>%
  ggplot(aes(x = species, y = (ngenes/ 1000), fill = TPM_cutoff)) +
  geom_bar(stat = "identity", position = "dodge", colour = "black", size = 0.5) + 
  scale_fill_manual(values = c("#abd9e9", "#ffffbf", "#fdae61", "#d7191c")) +
  coord_flip() +
  guides(fill = "none") +
  labs(y = "number of genes (x1000)", x = "Species") +
  scale_x_discrete( labels = species_names) +
  theme_classic() + 
  theme(text = element_text(size = 8), axis.text.y = element_text(angle = 90,hjust = 0.5))
WSN_factors_tbl <- 
  lapply(spcs, function(spc){
    tibble( NF = calcNormFactors.default(exprsMat[[spc]], method="TMM", lib.size = rep(1e6, ncol(exprsMat[[spc]])))) %>% 
      mutate( sampleID=colnames(exprsMat[[spc]]))
  }) %>% 
  bind_rows(.id = "spc") %>% 
  mutate(spc = factor(spc, levels = species.levels)) %>%
  mutate( sampleID = sub(".liver.TPM", "", sampleID)) %>% 
  mutate( NF = sprintf("%0.3f",NF) )
  

BSN_factors_tbl <- 
  tibble(NF=sprintf("NF = %0.3f",BSN_factors),spc=names(BSN_factors)) %>% 
  mutate(spc = factor(spc, levels = species.levels))
# Data for plotting
# Raw
data1 <- exprsMat %>% 
  purrr::map( ~ .x %>% tbl_df %>% gather( key="sampleID", value="exprs.") ) %>% 
  bind_rows(.id = "spc") %>% 
  mutate(spc = factor(spc, levels = species.levels)) %>%
  mutate( sampleID = sub(".liver.TPM", "", sampleID)) %>%
  mutate(sampleID = factor(sampleID, levels = unique(sampleID)))
# WSN
data2 <- exprsMatWSN %>% 
  purrr::map( ~ .x %>% tbl_df %>% gather( key="sampleID", value="exprs.") ) %>% 
  bind_rows(.id = "spc") %>% 
  mutate(spc = factor(spc, levels = species.levels)) %>%
  mutate( sampleID = sub(".liver.TPM", "", sampleID)) %>%
  mutate(sampleID = factor(sampleID, levels = unique(sampleID)))
# BSN
data3 <- exprsLogMatBSN %>% 
  purrr::map( ~ .x %>% tbl_df %>% gather( key="sampleID", value="exprs.") ) %>% 
  bind_rows(.id = "spc") %>% 
  mutate(spc = factor(spc, levels = species.levels)) %>%
  mutate( sampleID = sub(".liver.TPM", "", sampleID)) %>%
  mutate(sampleID = factor(sampleID, levels = unique(sampleID)))

# Plots

plot.BSN.1 <- ggplot(mapping = aes(y = sampleID)) +
  geom_density_ridges(data = data1, aes(x = log2(exprs. + 0.01)),
                      scale = 3, alpha = 0.3, size = 1,
                      colour = "#a6611a") +
  geom_density_ridges(data = data2, aes(x = log2(exprs. + 0.01)), 
                      scale = 3, alpha = 0.3, size = 0.5,
                      colour = "#018571") +
  geom_text(data=WSN_factors_tbl,mapping=aes(label=NF),x=10.5,hjust=1,vjust=-0.5,size=2) +
  labs(x = "Gene expression log2(TPM+0.01)", y = "Sample") +
  guides(fill = "none", colour = "none") +
  facet_grid(spc ~ ., scales = "free_y") +
  coord_cartesian(xlim = c(log2(0.01), 10)) +
  theme_bw() +
  theme(axis.title.y = element_blank(), 
        text = element_text(size = 8),
        panel.grid.major.y = element_blank(),
        strip.background = element_blank(),
        strip.text = element_blank())

plot.BSN.2 <- ggplot(mapping = aes(y = sampleID)) +
  geom_density_ridges(data = data2, aes(x = log2(exprs. + 0.01)), 
                      scale = 3, alpha = 0.3, size = 1,
                      colour = "#018571") +
  geom_density_ridges(data = data3, aes(x = exprs.), 
                      scale = 3, alpha = 0.3, size = 0.5,

                      colour = "#c51b8a") +
  geom_text(data=BSN_factors_tbl,mapping=aes(label=NF,y=5),x=10.5,hjust=1,vjust=-0.3,size=2) +
  labs(x = "Gene expression log2(TPM+0.01)", y = "Sample") +
  guides(fill = "none", colour = "none") +
  facet_grid(spc ~ ., scales = "free_y") +
  coord_cartesian(xlim = c(log2(0.01), 10)) +
  theme_bw() +
  theme(axis.line.y = element_blank(), 
        axis.text.y = element_blank(),
        axis.title.y = element_blank(), 
        panel.grid.major.y = element_blank(),
        text = element_text(size = 8), 
        strip.background = element_blank(), 
        strip.text = element_blank())

#pdf(file = "Figures/speciesExprsNorm.pdf", width = 8, height = 7)
grid.arrange(plot.nGenes, plot.BSN.1, plot.BSN.2, ncol = 3)

#dev.off()

Panel 1 - number of genes at TPM cutoffs per species: TPM cutoffs are blue: >0, yellow: >1, orange > 10, red > 100.
Panel 2 & 3 - gene expression level distributions per species before and after 2 normalisation steps: brown lines are unnormalised TPM values, green lines are normalised with within-species factors, and purple lines are normalised with between-species factors.

Sample clustering

colnames(combExprMat) <- species_names[colnames(combExprMat)]
d <- dist(t(combExprMat))
hc <- hclust(d,method = "ward.D")
plot(hc,xlab = "",sub = "",main="")