Fractionation bias analysis

# load synteny results
syn <- readRDS("input-data/from_ortho_pipeline/Ssal_synteny.RDS")

# load OGtbl
OGtbl <- read_tsv("input-data/from_ortho_pipeline/OGtbl.tsv",col_types = cols())

# load EVE results
EVEresTbl <- readRDS("data/runEVE/EVEresTbl.RDS")

Fractionation bias

Gene fractionation is the process were one of the duplicates of genes in duplicated chromosome regions are lost so that each duplicate region will end up with a different set of singleton genes. Biased fractionation is when one of the duplicated chromosome regions retains more singleton genes than the other.

Expression fractionation per synteny block

Expression fractionation is gene fractionation at the expression level, i.e. both copies are retained, but one is significantly down-regulated. A gene is defined as expression-fractioned if it is not down-regulated (conserved) and has a duplicate that is down-regulated (otherCopyIsDown=TRUE).

Test for each duplicate synteny block (i.e. duplicated chromosome regions) in Ssal whether the duplicates are more likely to be shifted in one of the chromosomes.

isDiff_per_multiplicon <- 
  syn$multiplicons %>% 
  filter(profile_length>100) %>% 
  left_join(syn$anchorpoints, by=c("id"="multiplicon")) %>% 
  select( multiplicon=id, gene_x, gene_y) %>% 
  gather( key="xy", value="Ssal", gene_x,gene_y) %>% 
  inner_join(EVEresTbl, by="Ssal") %>% 
  group_by(multiplicon) %>% 
  add_count(N3) %>% 
  ungroup() %>% 
  filter(n==2) %>% # only duplicates. Redundant since EVEres only has 1/2 and anchorpoints are 2+
  select(multiplicon,N3,N9,xy,sigDir,dupSigDir,Ssal) %>% 
  arrange(multiplicon,N3) %>% 
  # copy is down while the other copy is not down
  # mutate( isDiff = sigDir == "down" & dupSigDir != "downdown") %>% 
  mutate( isDiff = sigDir == "down" & dupSigDir == "down") %>% 
  group_by( N3 ) %>% 
  mutate( otherCopyIsDown = any(isDiff) & !isDiff) %>% 
  ungroup()

Expression vs gene fractionation per synteny block

Gene fractionation was calculated by counting the number of singletons in the synteny regions and take the log2 ratio of the counts.

nSsalPerN3 <- OGtbl %>% filter(spc=="Ssal") %>% count(N3)
singletonN3s <- filter(nSsalPerN3, n==1)$N3
dupN3s <- filter(nSsalPerN3, n==2)$N3

genes_per_multiplicon <-
  syn$multiplicons %>% 
  filter(genome_x=="Ssal",genome_y=="Ssal",grepl("^NC_027",list_x),grepl("^NC_027",list_y)) %>% 
  filter(profile_length>100) %>% 
  group_by(id) %>% # by multiplicon
  # get all genes within each ohnologous region
  do( genes_x = filter(select(syn$genes, list,coordinate, N3, geneID=id),
                       list==.$list_x, coordinate >= .$begin_x, coordinate <= .$end_x),
      genes_y = filter(select(syn$genes, list,coordinate, N3, geneID=id),
                       list==.$list_y, coordinate >= .$begin_y, coordinate <= .$end_y) )

# count number of singletons at each copy of the duplicated chromosome regions
retained_per_multiplicon <- tibble( 
  multiplicon = genes_per_multiplicon$id, 
  nSingle_x = sapply(genes_per_multiplicon$genes_x, function(X) {sum( X$N3 %in% singletonN3s)}),
  nSingle_y = sapply(genes_per_multiplicon$genes_y, function(Y) {sum( Y$N3 %in% singletonN3s)}),
  nDups = mapply(X=genes_per_multiplicon$genes_x, Y=genes_per_multiplicon$genes_y, FUN = function(X,Y) {
    sum((X$N3 %in% Y$N3) & (X$N3 %in% dupN3s)) }))

fractionbias_per_multiplicon <- 
  isDiff_per_multiplicon %>% 
  group_by(multiplicon) %>% 
  summarise( nShift_x = sum(xy=="gene_x" & otherCopyIsDown),
             nShift_y = sum(xy=="gene_y" & otherCopyIsDown),
             n=n()/2) %>%
  left_join(retained_per_multiplicon,by="multiplicon") %>% 
  mutate( pExprBias = map2_dbl(.x = nShift_x, .y = nShift_y, ~ binom.test( x = .x, n = .x + .y)$p.value)) %>% 
  mutate( pGeneBias = map2_dbl(.x = nSingle_x, .y = nSingle_y, ~ binom.test( x = .x, n = .x + .y)$p.value)) %>% 
  mutate( pAdjExprBias = p.adjust(pExprBias,method = "BH")) %>% 
  mutate( pAdjGeneBias = p.adjust(pGeneBias,method = "BH")) %>%
  mutate( shiftRatio=log2(nShift_x/nShift_y), geneRatio = log2(nSingle_x/nSingle_y)) %>%
  filter( is.finite(shiftRatio))

fractionbias_per_multiplicon_cortest <- 
  fractionbias_per_multiplicon %>% 
  with( cor.test(shiftRatio,geneRatio) )

pvalText <- sprintf("P = %0.3f",fractionbias_per_multiplicon_cortest$p.value)

# The P value changes depending on 

# replicate(n=100,{
#   fractionbias_per_multiplicon %>% 
#     mutate( swap = sample(c(-1,1),size = n(),replace = T)) %>% 
#     mutate( shiftRatio = shiftRatio*swap) %>% 
#     mutate( geneRatio = geneRatio*swap) %>% 
#     with( cor.test(shiftRatio,geneRatio)$p.value )
# }) %>%  hist()

fractionbias_per_multiplicon %>% 
  mutate( nShift_total = nShift_x+nShift_y) %>% 
  ggplot( aes(y=shiftRatio,x=geneRatio)) + 
  geom_smooth(method="lm", fill="blue", linetype=2, size=0.8, alpha=0.1, formula = y ~ x ) +
  geom_point(aes(color=ifelse(pAdjGeneBias<0.05,"p.adj < 0.05","n.s."))) +  
                 # size=nShift_total)) +
  scale_color_manual( values = c("p.adj < 0.05"="orange","n.s."="grey")) + labs(color = "Gene fractionation\nsignificance") + 
  annotate("text",  x=1, y = 0.65, label = pvalText, color="blue") +
  ylab("Expression fractionation bias") + xlab("Gene fractionation bias") +
  theme_bw() +
  coord_fixed()

The correlation seems significant (P = 0.027) but is driven by synteny blocks with low number of expression fractioned gene-pairs.

fractionation chunks along Ssal chromosomes

Genes along the chromosomes are divided into chunks of ~100 genes and for each chunk the proportion of singletons (gene fractionation, inner track) and the proportion of down-regulated duplicates (expression fractionation, outer track) is calculated. Note that expression fractionation is sometimes missing because there are no tested duplicates in the chunk. Synteny blocks (links in the center) with significant gene fractionation bias is shown in orange.

# gene fractionation along each salmonid chromosome

# divide the genes along the chromosome into equal sized chunks
chunkSize = 100

getBreaks <- function(n,w){round(seq(0,n,length.out = 1+floor(n/w)))}

getChunks <- function(chunkSize){
  syn$genes %>% filter(genome=="Ssal",grepl("^NC_027",list)) %>% 
  select(geneID=id, list,coordinate)  %>% 
  group_by(list) %>% 
  mutate( chunk = as.character(cut(coordinate,breaks = getBreaks(max(coordinate),chunkSize),
                                   labels = getBreaks(max(coordinate),chunkSize)[-1],include.lowest = T))) %>% 
  group_by(list,chunk) %>% 
  mutate( xstart=min(coordinate),xend=max(coordinate),xmid=(xend+xstart)/2) %>% 
  ungroup()
}

chunks = getChunks(chunkSize)

# geneFractionChunks <-
#   syn$genes %>% 
#   filter( grepl("^NC_027",list) ) %>% 
#   left_join(OGtbl %>% filter(spc=="Ssal") %>% add_count(N3) %>% select(geneID,nPerN3 = n), 
#             by=c("id"="geneID")) %>% 
#   select( id,list,nPerN3,coordinate) %>% 
#   left_join( select(chunks,-coordinate,-list),by=c("id"="geneID")) %>% 
#   group_by( list, chunk, xmid) %>%
#   filter(!is.na(nPerN3)) %>%
#   summarise( propSingletons=sum(nPerN3==1)/sum(nPerN3 %in% 1:2)) %>% 
#   arrange( list,xmid) %>% ungroup() %>% 
#   mutate( col = as.character(cut(rank(propSingletons),breaks=seq(0,max(rank(propSingletons)),
#                                                      length.out = 100+1),
#                                  labels = viridis::viridis(100))))


### Smoothed version
geneFractionChunks <-
  syn$genes %>% 
  filter( grepl("^NC_027",list) ) %>% 
  left_join(OGtbl %>% filter(spc=="Ssal") %>% add_count(N3) %>% select(geneID,nPerN3 = n), 
            by=c("id"="geneID")) %>% 
  select( id,list,nPerN3,coordinate) %>% 
  left_join( select(chunks,-coordinate,-list),by=c("id"="geneID")) %>% 
  filter(!is.na(nPerN3)) %>%
  filter(nPerN3 %in% 1:2) %>%
  mutate( isSingle = ifelse(nPerN3==1,1,0)) %>%
  add_count(list) %>% 
  group_by( list) %>%
  # Make a smooth plot out of the binary "isSingle" using loess.
  # The "span=6*chunkSize/.$n[1]" ensures that the smoothing span is the same regardless of how long the 
  # chromosomes are.
  do( propSingletons = predict(loess(y~x,data.frame(x=.$coordinate,y=.$isSingle),span=6*chunkSize/.$n[1]),
                                      newdata=data.frame(x=unique(.$xmid))),
      xmid=unique(.$xmid), chunk=unique(.$chunk)) %>% 
  unnest(cols=-list) %>% 
  arrange( list,xmid) %>% ungroup() %>% 
  mutate( col = as.character(cut(rank(propSingletons),breaks=seq(0,max(rank(propSingletons)),
                                                     length.out = 100+1),
                                 labels = viridis::viridis(100))))
  

  
    
    
  

# exprFractionChunks <- 
#   isDiff_per_multiplicon %>%
#   select(Ssal, otherCopyIsDown) %>% 
#   left_join( select(syn$genes, list, coordinate, id), by=c("Ssal"="id")) %>% 
#   left_join( select(chunks,-coordinate,-list),by=c("Ssal"="geneID")) %>% 
#   group_by( list, chunk,xmid) %>%
#   summarise( propDiff=mean(otherCopyIsDown), n=n()) %>% 
#   arrange( list,xmid) %>% ungroup() %>% 
#   mutate( col = as.character(cut(rank(propDiff),breaks=seq(0,max(rank(propDiff)), length.out = 100+1),
#                                  labels = viridis::magma(100), include.lowest = T)))

### Smoothed version
exprFractionChunks <- 
  isDiff_per_multiplicon %>%
  select(Ssal, otherCopyIsDown) %>% 
  left_join( select(syn$genes, list, coordinate, id), by=c("Ssal"="id")) %>% 
  left_join( select(chunks,-coordinate,-list),by=c("Ssal"="geneID")) %>%
  arrange( list,xmid) %>% ungroup() %>% 
  mutate( otherCopyIsDown = ifelse(otherCopyIsDown,1,0)) %>%
  add_count(list,name = "nPerChr") %>%
  group_by( list) %>%
  add_count(chunk,name = "nPerChunk") %>% 
  do( propDiff = predict(loess(y~x,data.frame(x=.$coordinate,y=.$otherCopyIsDown),span=6*mean(.$nPerChunk)/.$nPerChr[1]),
                                      newdata=data.frame(x=unique(.$xmid))),
      xmid=unique(.$xmid), chunk=unique(.$chunk)) %>% 
  unnest(cols=-list) %>% 
  mutate( col = as.character(cut(rank(propDiff),breaks=seq(0,max(rank(propDiff)), length.out = 100+1),
                                 labels = viridis::magma(100), include.lowest = T)))

Fractionation on Ssal circos plot

# initialize with chromosome lengths
# data is in bed format, i.e. 3 columns: name, start, end
chrData <- 
  # get Ssal chromosomes (-mitochondrial)
  syn$genes %>% filter(genome=="Ssal",grepl("^NC_027",list)) %>% 
  group_by(list) %>% 
  summarise( start=min(coordinate), end=max(coordinate)) %>% 
  arrange(list)

chrNames <- setNames(sprintf("%i",1:nrow(chrData)),chrData$list)

circos.par(start.degree=90)
circos.initialize(factors = chrNames[chrData$list], xlim = cbind(chrData$start, chrData$end))

# Add axis (chromosome numbers)
circos.track(ylim = c(0, 1), panel.fun = function(x, y) {
    # circos.segments(x0=CELL_META$xlim[1], x1=CELL_META$xlim[2],
    #             y0=CELL_META$ylim[1], y1=CELL_META$ylim[1], col="salmon" )
    circos.text(CELL_META$xcenter, CELL_META$ycenter, CELL_META$sector.index, 
        facing = "inside",cex=0.8)
    circos.axis(h = "bottom", major.at = CELL_META$xlim, major.tick = F,labels = "",
       labels.cex = 0.6)
}, bg.border = NA, track.height = 0.1)


#### Expression fractionation track

exprFractionChunksPerSector <-
  exprFractionChunks %>%
  mutate( fa = chrNames[list]) %>% 
  left_join( select(chunks,list,chunk,xstart,xend) %>% distinct(), by=c("list","chunk")) %>% 
  select(fa,xstart,xend,col,propDiff)
    


circos.track(track.height = 0.1, track.margin = c(0,0), ylim = c(0, 1), bg.border = NA, 
             panel.fun = function(x, y) {
    sector.index = CELL_META$sector.index
    
    x <- exprFractionChunksPerSector %>% filter(fa==sector.index)

    for(i in 1:nrow(x)) {
        circos.rect(xleft = x$xstart,ybottom = 0,xright = x$xend,ytop = 1,col=x$col,border = NA)
    }
})


#### Gene-fractionation track

geneFractionChunksPerSector <-
  geneFractionChunks %>%
  mutate( fa = chrNames[list]) %>% 
  left_join( select(chunks,list,chunk,xstart,xend) %>% distinct(), by=c("list","chunk")) %>% 
  mutate( col = as.character(cut(propSingletons,breaks=seq(0,max(geneFractionChunks$propSingletons),
                                                     length.out = 100+1),
                                 labels = viridis::viridis(100)))) %>%
  select(fa,xstart,xend,col,propSingletons)
    


circos.track(track.height = 0.1, track.margin = c(0,0), ylim = c(0, 1), bg.border = NA, 
             panel.fun = function(x, y) {
    sector.index = CELL_META$sector.index
    
    x <- geneFractionChunksPerSector %>% filter(fa==sector.index)

    for(i in 1:nrow(x)) {
        circos.rect(xleft = x$xstart,ybottom = 0,xright = x$xend,ytop = 1,col=x$col,border = NA)
    }
})


#### Synteny links

SsalSynteny <-
  syn$multiplicons %>% 
  # synteny between Ssal chromosomes
  filter(genome_x=="Ssal",genome_y=="Ssal",grepl("^NC_027",list_x),grepl("^NC_027",list_y)) %>% 
  filter(profile_length>100) %>% 
  arrange(list_x)

# color the synteny blocks based on p-value for gene fractionation bias
sigGeneFracBiasColor <- 
  SsalSynteny %>% 
  left_join(retained_per_multiplicon, by=c("id"="multiplicon")) %>% 
  mutate( pGeneBias = map2_dbl(.x = nSingle_x, .y = nSingle_y, ~ binom.test( x = .x, n = .x + .y)$p.value)) %>% 
  mutate( pAdjGeneBias = p.adjust(pGeneBias,method = "BH")) %>% 
  with( ifelse(pAdjGeneBias<0.05, "orange","grey"))

  
reg1 = SsalSynteny %>% transmute(names=chrNames[list_x], start=begin_x, end=end_x) 
reg2 = SsalSynteny %>% transmute(names=chrNames[list_y], start=begin_y, end=end_y)
# NOTE: region1/2 must be data.frame NOT tibble 
circos.genomicLink(as.data.frame(reg1), as.data.frame(reg2), 
                   # col = rainbow(29)[pmin(as.integer(reg1$names),as.integer(reg2$names))],
                   col = sigGeneFracBiasColor,
                   border = NA)

# clear
circos.clear()

Note that the coordinates in the circos plot refers to number of genes.

The “duplicated regions” are the iADHoRE synteny blocks with at least 100 profile length. These may be slightly overlapping .

Proportion of genome covered by the included synteny blocks:

mean( filter(syn$genes,genome=="Ssal", grepl("^NC_0273", list))$id %in%
        bind_rows(c(genes_per_multiplicon$genes_x,genes_per_multiplicon$genes_y))$geneID)

## [1] 0.8028156

Fractionation bias analysis

Lars Grønvold

02/06/2020