# Categories for tissue specific or unspecific shifts
nShiftedTblComplete <- readRDS("data/TissueAtlasAnalysis/nShiftedTblComplete.RDS")

# Expression shift categories
EVEresTbl <- read_rds("data/runEVE/EVEresTbl.RDS")

# Load table of TE overlaps (2200 bp promoter region)
TE_overlap <- read_tsv("input-data/TE/Ssal_overlapproportions.tsv",col_types = "ciiciiid")

# TE overlap by type/family
# files <- list.files("/mnt/SCRATCH/oymo/overlaps/4gareth", pattern = "prom_to_TEtype_*", full.names = TRUE)
# TE_family_overlap <- lapply(files, function(x) read_tsv(x, col_names = colnames(TE_overlap)))
# names(TE_family_overlap) <- sub(".*_", "", sub("\\.txt", "", basename(files)))
# TE_family_overlap_table <- tibble()
# for(family in names(TE_family_overlap)) {
#   TE_family_overlap_table <- TE_family_overlap_table %>%
#     bind_rows(
#       TE_family_overlap[[family]] %>%
#         transmute(gene_id = geneID,
#                   propOverlap,
#                   family = family)
#     )
# }
# saveRDS(TE_family_overlap_table, "input-data/TE/TE_family_overlap.RDS")

# Load family overlap data
TE_family_overlap <- read_rds("input-data/TE/TE_family_overlap.RDS")

TE content in promoters cumulative plot

EVEresTbl %>% 
  filter(!N3partial,dup_type=="duplicate", dupSigDir=="down") %>% 
  select(N3,Ssal,sigDir) %>% 
  mutate( sigDir = ifelse(sigDir=="","cons","down")) %>% 
  left_join( select(TE_overlap, Ssal=geneID, propOverlap), by="Ssal") %>% 
  left_join(select(nShiftedTblComplete,N3,nShifted),by="N3") %>% 
  bind_rows( any=., all=filter(., nShifted==15), .id="shifted") %>% 
  drop_na() %>% 
  group_by(sigDir,shifted) %>% 
  arrange(propOverlap) %>% 
  mutate( cumProp = seq(0,1,length.out = n())) %>% 
  ungroup() -> tmp

wilcox_TE_any <- 
  tmp %>% 
  filter( shifted=="any") %>% select(-Ssal, -cumProp) %>% 
  spread(key = sigDir,value = propOverlap) %>%
  with( wilcox.test(cons,down,paired=T, alternative="less"))

pval_text_any <-
  wilcox_TE_any %>%
  with( sprintf("p = %.03f",p.value) )

wilcox_TE_all <-
  tmp %>% 
  filter( shifted=="all") %>% select(-Ssal, -cumProp) %>% 
  spread(key = sigDir,value = propOverlap) %>% 
  with( wilcox.test(cons,down,paired=T, alternative="less"))

pval_text_all <-
  wilcox_TE_all %>%
  with( sprintf("p = %.1e",p.value) )

tmp %>% 
  filter( !(shifted=="all" & sigDir=="cons")) %>% 
  ggplot( ) + 
  geom_line(aes(x=propOverlap,y=cumProp,color=sigDir,linetype=shifted)) +
  scale_linetype_manual(values = c( any=1, all=2)) +
  scale_color_manual(values=c(down="#298ABD",cons="grey"), name="") +
  theme_bw() + xlab("TE content in promoter") + ylab("Cumulative proportion of duplicates") +
  annotate(geom = "text",x=0.35,y=0.85, hjust=0, color="#298ABD",
           label=paste0("down-shift, any tissues\n",pval_text_any)) +
  annotate(geom = "text", x=0.5,y=0.4, hjust=0, color="#298ABD",
           label=paste0("down-shift, all tissues\n", pval_text_all)) +
  scale_x_continuous(labels = scales::percent) +
  ggtitle("Promoter TE content in one-down duplicates")

Total TE content overlapping promoters

# Combine shift table with overlap proportions
shifts_TE_overlap <- tibble()

for(dir in c("up", "down")) {
  for(set in c("max5_tissues", "all_tissues", "any")) {
    for(shift in c("Ssal_shifted", "Ssal_cons")) {
      if(set == "any") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir) %>%
          dplyr::select(gene_id = eval(shift)) %>%
          mutate(dir = dir, set = set, shift = shift)
      } else if(set == "max5_tissues") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir) %>%
          filter(nShifted <= 5) %>%
          dplyr::select(gene_id = eval(shift)) %>%
          mutate(dir = dir, set = set, shift = shift)
      }
      else if(set == "all_tissues") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir) %>%
          filter(nShifted == 15) %>%
          dplyr::select(gene_id = eval(shift)) %>%
          mutate(dir = dir, set = set, shift = shift)
      }
      
      table <- genes %>%
        left_join(
          TE_overlap %>%
            transmute(gene_id = as.character(geneID), propOverlap),
          by = "gene_id"
        )
      
      shifts_TE_overlap <- shifts_TE_overlap %>%
        bind_rows(table)
    }
  }
}

unshifted_dups <- EVEresTbl %>% 
  filter(dup_type == "duplicate",
         isSig == FALSE) %>%
  dplyr::select(N3, Ssal) %>%
  group_by(N3) %>%
  mutate(name = paste0("Ssal", 1:length(N3))) %>%
  spread(name, Ssal) %>%
  ungroup() %>%
  filter(!is.na(Ssal1) & !is.na(Ssal2)) %>%
  dplyr::select(Ssal1, Ssal2)

for(shift in c("Ssal1", "Ssal2")) {
  genes <- unshifted_dups %>%
    dplyr::select(gene_id = eval(shift)) %>%
    mutate(dir = "unshifted", set = "unshifted", shift = shift)
  
  table <- genes %>%
        left_join(
          TE_overlap %>%
            transmute(gene_id = as.character(geneID), propOverlap),
          by = "gene_id"
        )
  shifts_TE_overlap <- shifts_TE_overlap %>%
        bind_rows(table)
}
shifts_TE_overlap %>%
  ggplot(aes(x = shift, y = propOverlap, colour = dir)) +
  geom_boxplot() +
  facet_wrap(.~set, scales = "free_x", nrow = 1) +
  scale_colour_manual(limits = c("up", "down", "unshifted"), values = c("firebrick3", "dodgerblue", "black")) +
  ggtitle(label = "Proportion of TE overlap with promoter")
## Warning: Removed 50 rows containing non-finite values (stat_boxplot).

overlap_dup_comparison <- tibble()
# Get gene set
for(dir in c("up", "down")) {
    for(set in c("max5_tissues", "all_tissues", "any")) {
      if(set == "any") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir) %>%
          dplyr::select(Ssal_shifted, Ssal_cons) %>%
          mutate(dir = dir, set = set)
      } else if(set == "max5_tissues") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir) %>%
          filter(nShifted <= 5) %>%
          dplyr::select(Ssal_shifted, Ssal_cons) %>%
          mutate(dir = dir, set = set)
      }
      else if(set == "all_tissues") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir) %>%
          filter(nShifted == 15) %>%
          dplyr::select(Ssal_shifted, Ssal_cons) %>%
          mutate(dir = dir, set = set)
      }
      genes <- genes %>%
        left_join(
          shifts_TE_overlap %>%
            mutate(Ssal_shifted = sub(".*:", "", gene_id)) %>%
            dplyr::select(Ssal_shifted, shift_propOverlap = propOverlap),
          by = "Ssal_shifted"
        ) %>%
        left_join(
          shifts_TE_overlap %>%
            mutate(Ssal_cons = sub(".*:", "", gene_id)) %>%
            dplyr::select(Ssal_cons, cons_propOverlap = propOverlap),
          by = "Ssal_cons"
        ) %>%
        mutate(diff_mean_proportion = shift_propOverlap - cons_propOverlap)
      overlap_dup_comparison <- overlap_dup_comparison %>%
        bind_rows(genes)
    }
}

# Unshifted
genes <- unshifted_dups %>%
        left_join(
          shifts_TE_overlap %>%
            mutate(Ssal1 = sub(".*:", "", gene_id), dir = "unshifted", set = "unshifted") %>%
            dplyr::select(Ssal1, dir, shift, set, shift_propOverlap = propOverlap),
          by = "Ssal1"
        ) %>%
        left_join(
          shifts_TE_overlap %>%
            mutate(Ssal2 = sub(".*:", "", gene_id), dir = "unshifted", set = "unshifted") %>%
            dplyr::select(Ssal2, cons_propOverlap = propOverlap),
          by = "Ssal2"
        ) %>%
        mutate(diff_mean_proportion = shift_propOverlap - cons_propOverlap) %>%
        rename(Ssal_shifted = Ssal1, Ssal_cons = Ssal2)
overlap_dup_comparison <- overlap_dup_comparison %>%
  bind_rows(genes) %>%
  distinct()
# Overlap comparison significance
overlap_dup_wilcox_test <- tibble()

# Test significant difference between correlations of shifting sets of genes versus unshifted
for(dir_ in c("down", "up")) {
  for(set_ in c("max5_tissues", "all_tissues", "any")) {
    
    data <- overlap_dup_comparison %>%
      filter(shift_propOverlap > 0 | cons_propOverlap > 0) %>%
      filter(set == set_, dir == dir_)
    
    pval.paired <- wilcox.test(data$shift_propOverlap, 
                               data$cons_propOverlap, 
                               paired = TRUE,
                               alternative = "greater")$p.value
    
    overlap_dup_wilcox_test <- overlap_dup_wilcox_test %>%
      bind_rows(
        tibble(dir = dir_,
               set = set_,
               #pval = pval,
               pval = pval.paired)
        
      )
  }
}
## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties
kable(overlap_dup_wilcox_test)
dir set pval
down max5_tissues 0.2403259
down all_tissues 0.0013557
down any 0.0825368
up max5_tissues 0.0455457
up all_tissues 0.8842473
up any 0.2793993
overlap_dup_comparison %>%
  mutate(set = factor(set, levels = c("unshifted", "any", "max5_tissues", "all_tissues"))) %>%
  ggplot(aes(x = paste(set,dir), y = diff_mean_proportion, fill = dir)) +
  geom_boxplot(outlier.colour = NA, varwidth = FALSE) +
  geom_text(data = overlap_dup_wilcox_test, 
           aes(x = paste(set,dir), y = 1, label = round(pval, 4))) +
  scale_fill_manual(name = "Expression shift", 
                      limits = c("up", "down", "unshifted"), 
                      values = c("firebrick3", "dodgerblue", "white")) +
  guides(fill = "none") +
  labs(x = "Expression shift category", 
       y = "Difference in TE content in promoter\n between duplicates") +
  theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"))
## Warning: Removed 44 rows containing non-finite values (stat_boxplot).

shifts_TE_overlap %>%
  filter(dir == "down") %>%
  mutate(set = factor(set, levels = c("any", "max5_tissues", "all_tissues"))) %>%
  ggplot() +
  geom_density(aes(linetype = set,  x = propOverlap, colour = shift), size = 0.75) +
  scale_colour_manual(name = "Expression shift", 
                      limits = c("Ssal_shifted", "Ssal_cons"), 
                      values = c("dodgerblue", "grey30")) +
  scale_linetype_manual(name = "Tissue specificity", 
                        limits = c("any", "max5_tissues", "all_tissues"), 
                        values = c("solid", "dotted", "dashed")) +
  guides(colour = "none", linetype = "none") +
  labs(y = "Density", 
       x = "TE content in promotor (proportion)") +
  theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"), 
        axis.text = element_text(size = 8, colour = "black"),
        line = element_line(colour = "black"))
## Warning: Removed 1 rows containing non-finite values (stat_density).

#ggsave(filename = "Figures/shiftDupTEoverlap.pdf", device = "pdf", width = 3, height = 3, units = "in")
overlap_dup_comparison %>%
  mutate(set = factor(set, levels = c("unshifted", "any", "max5_tissues", "all_tissues"))) %>%
  ggplot() +
  geom_density(aes(linetype = set,  x = diff_mean_proportion, colour = dir)) +
  # geom_label(data = comp_overlap_sig,
  #            aes(x = 1, y = seq(0.5, 1.5, length.out = nrow(comp_overlap_sig)),
  #                label = paste0(substr(dir,1,1), ".", sub("_.*","",set), " ", round(pval.paired, 4))),
  #                hjust = 1, size = 2) +
  scale_colour_manual(name = "Expression shift", 
                      limits = c("up", "down", "unshifted"), 
                      values = c("firebrick3", "dodgerblue", "black")) +
  scale_linetype_manual(name = "Tissue specificity", 
                        limits = c("unshifted", "any", "max5_tissues", "all_tissues"), 
                        values = c("solid", "solid", "dotted", "dashed")) +
  guides(colour = "none", linetype = "none") +
  labs(y = "Density", 
       x = "Difference in promoter TE content between duplicates") +
  theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"), 
        axis.text = element_text(size = 8, colour = "black"),
        line = element_line(colour = "black"))
## Warning: Removed 44 rows containing non-finite values (stat_density).

#ggsave(filename = "Figures/DuplicatePromoterTEContentDifference.pdf", 
#       device = "pdf", width = 5, height = 5, units = "in")

Superfamily level TE content overlapping promoters

# Combine shift table with overlap proportions
shifts_TE_overlap.families <- tibble()

for(dir in c("up", "down")) {
  for(set in c("max5_tissues", "all_tissues", "any")) {
    for(shift in c("Ssal_shifted", "Ssal_cons")) {
      if(set == "any") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir) %>%
          dplyr::select(gene_id = eval(shift)) %>%
          mutate(dir = dir, set = set, shift = shift)
      } else if(set == "max5_tissues") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir) %>%
          filter(nShifted <= 5) %>%
          dplyr::select(gene_id = eval(shift)) %>%
          mutate(dir = dir, set = set, shift = shift)
      }
      else if(set == "all_tissues") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir) %>%
          filter(nShifted == 15) %>%
          dplyr::select(gene_id = eval(shift)) %>%
          mutate(dir = dir, set = set, shift = shift)
      }
      
      table <- genes %>%
        left_join(
          TE_family_overlap %>%
            mutate(gene_id = as.character(gene_id)),
          by = "gene_id"
        )
      
      shifts_TE_overlap.families <- shifts_TE_overlap.families %>%
        bind_rows(table)
    }
  }
}

unshifted_dups <- EVEresTbl %>% 
  filter(dup_type == "duplicate",
         isSig == FALSE) %>%
  dplyr::select(N3, Ssal) %>%
  group_by(N3) %>%
  mutate(name = paste0("Ssal", 1:length(N3))) %>%
  spread(name, Ssal) %>%
  ungroup() %>%
  filter(!is.na(Ssal1) & !is.na(Ssal2)) %>%
  dplyr::select(Ssal1, Ssal2)

for(shift in c("Ssal1", "Ssal2")) {
  genes <- unshifted_dups %>%
    dplyr::select(gene_id = eval(shift)) %>%
    mutate(dir = "unshifted", set = "unshifted", shift = shift)
  
  table <- genes %>%
        left_join(
          TE_family_overlap %>%
            mutate(gene_id = as.character(gene_id)),
          by = "gene_id"
        )
  shifts_TE_overlap.families <- shifts_TE_overlap.families %>%
        bind_rows(table)
}

shifts_TE_overlap.families <- shifts_TE_overlap.families %>% 
  filter(!is.na(family))

# Order families by mean propOverlap
# family_levels <- shifts_TE_overlap.families %>%
#   dplyr::select(family, propOverlap) %>%
#   group_by(family) %>%
#   mutate(mean_propOverlap = mean(propOverlap)) %>%
#   ungroup() %>%
#   arrange(desc(mean_propOverlap)) %>%
#   .$family %>% unique()

# shifts_TE_overlap.families$family <- factor(shifts_TE_overlap.families$family, levels = family_levels)
shifts_TE_overlap.families$set <- factor(shifts_TE_overlap.families$set, levels = c("max5_tissues",  "all_tissues", "any", "unshifted"))
shifts_TE_overlap.families$shift <- factor(shifts_TE_overlap.families$shift, levels = c("Ssal_shifted", "Ssal_cons", "Ssal1", "Ssal2"))

# familyName = c("RLC"="LTR Copia", "RLG"="LTR Gypsy", "RLB"="LRT Bel-Pao", "RLR"="LRT Retrovirus", "RLE"="LTR ERV",
#                "RYD"="DIRS", "RYN"="DIR Ngaro", "RYV"="DIRS VIPER", "RPP"="PLE Penelope",
#                "RIR"="LINE", "RIT", "RIJ", "RIL", "RII", 
#                "RST", "RSL", "RSS")
# shifts_TE_overlap.families$familyName <- familyNames[shifts_TE_overlap.families$family]
overlap_dup_comparison.families <- tibble()

# Get gene set
for(dir in c("up", "down")) {
    for(set in c("max5_tissues", "all_tissues", "any")) {
      if(set == "max5_tissues") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir, nShifted <= 5) %>%
          dplyr::select(Ssal_shifted, Ssal_cons)
      }
      else if(set == "all_tissues") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir, nShifted == 15) %>%
          dplyr::select(Ssal_shifted, Ssal_cons)
      } else if(set == "any") {
        genes <- nShiftedTblComplete %>%
          filter(dupSigDir == dir) %>%
          dplyr::select(Ssal_shifted, Ssal_cons)
      }
      
      genes <- genes %>%
        mutate(dir = dir, set = set) %>%
         left_join(
          shifts_TE_overlap.families %>%
            dplyr::select(Ssal_shifted = gene_id, family, shift_propOverlap = propOverlap),
          by = "Ssal_shifted"
        ) %>%
        left_join(
          shifts_TE_overlap.families %>%
            dplyr::select(Ssal_cons = gene_id, family, cons_propOverlap = propOverlap),
          by = c("Ssal_cons", "family")
        ) %>%
        mutate(diff_proportion = shift_propOverlap - cons_propOverlap)
      
      overlap_dup_comparison.families <- overlap_dup_comparison.families %>%
        bind_rows(genes)
    }
}

# Unshifted
genes <- unshifted_dups %>%
  rename(Ssal_shifted = Ssal1, Ssal_cons = Ssal2) %>%
  mutate(dir = "unshifted", set = "unshifted") %>%
        left_join(
          shifts_TE_overlap.families %>%
            dplyr::select(Ssal_shifted = gene_id, family, shift_propOverlap = propOverlap),
          by = "Ssal_shifted"
        ) %>%
        left_join(
          shifts_TE_overlap.families %>%
            dplyr::select(Ssal_cons = gene_id, family, cons_propOverlap = propOverlap),
          by = c("Ssal_cons", "family")
        ) %>%
  mutate(diff_proportion = shift_propOverlap - cons_propOverlap)
  
overlap_dup_comparison.families <- overlap_dup_comparison.families %>%
  bind_rows(genes) %>%
  distinct()
# Family Overlap comparison significance
family_overlap_wilcox_test <- tibble()

# Test significant difference between correlations of shifting sets of genes versus unshifted
for(dir_ in c("down", "up")) {
  for(set_ in c("max5_tissues", "all_tissues", "any")) {
    for(family_ in unique(overlap_dup_comparison.families$family)) {
      
      data <- overlap_dup_comparison.families %>%
        filter(shift_propOverlap > 0 | cons_propOverlap > 0) %>%
        filter(set == set_, dir == dir_, family == family_)
      
      if(nrow(data) > 1) {
        pval.paired <- wilcox.test(data$shift_propOverlap, 
                                   data$cons_propOverlap, 
                                   paired = TRUE,
                                   alternative = "greater")$p.value
        
        family_overlap_wilcox_test <- family_overlap_wilcox_test %>%
          bind_rows(tibble(dir = dir_,
                           set = set_,
                           family = family_,
                           pval = pval.paired))
      }
    }
  }
}
## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(data$shift_propOverlap, data$cons_propOverlap, :
## cannot compute exact p-value with ties
#kable(family_overlap_wilcox_test)

family_overlap_wilcox_test %>%
  filter(pval < 0.05) %>%
  kable()
dir set family pval
down any RIL 0.0230680
up max5_tissues RII 0.0078125
up all_tissues RSX 0.0338135
up all_tissues XXX 0.0159039
up any RII 0.0124708
up any RYX 0.0312500
# family_overlap_wilcox_test %>%
#   filter(dir == "down") %>%
#   kable()
zero_dups <- overlap_dup_comparison.families %>%
  filter(shift_propOverlap == 0 & cons_propOverlap == 0) %>%
  #select(Ssal_shifted, Ssal_cons, family) %>%
  gather(shift, gene_id, c("Ssal_shifted", "Ssal_cons")) %>%
  transmute(id = paste(gene_id, family)) %>%
  .$id

family_count.down <- shifts_TE_overlap.families %>%
  filter(!(paste(gene_id, family) %in% zero_dups)) %>%
  filter(dir == "down", shift == "Ssal_shifted", set == "any") %>%
  count(family, sort = TRUE)

family_count.up <- shifts_TE_overlap.families %>%
  filter(!(paste(gene_id, family) %in% zero_dups)) %>%
  filter(dir == "up", shift == "Ssal_shifted", set == "any") %>%
  count(family, sort = TRUE)

# Min X number of down shifted dups (any tissue number), for each family
minN <- 10
minFamilies.down <- family_count.down %>%
  filter(n >= minN) %>%
  .$family
length(minFamilies.down) # 16
## [1] 16
# Min X up
minN <- 10
minFamilies.up <- family_count.up %>%
  filter(n >= minN) %>%
  .$family
length(minFamilies.up) # 16
## [1] 11
shifts_TE_overlap.families %>%
  filter(!(paste(gene_id, family) %in% zero_dups)) %>%
  filter(dir == "down") %>%
  filter(family %in% minFamilies.down) %>%
  ggplot() +
  geom_density(aes(linetype = set,  x = propOverlap, colour = shift), size = 0.511) +
  scale_colour_manual(name = "Expression shift", 
                      limits = c("Ssal_shifted", "Ssal_cons"), 
                      values = c("dodgerblue", "grey30")) +
  scale_linetype_manual(name = "Tissue specificity", 
                        limits = c("unshifted", "any", "max5_tissues", "all_tissues"), 
                        values = c("solid", "solid", "dotted", "dashed")) +
  guides(colour = "none", linetype = "none") +
  labs(y = "Density", 
       x = "TE content in promoter (proportion)") +
  facet_wrap(~family, ncol = 4, scales = "free_y") +
  theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"), 
        axis.text = element_text(size = 8, colour = "black"),
        line = element_line(colour = "black"))
## Warning: Groups with fewer than two data points have been dropped.

## Warning: Groups with fewer than two data points have been dropped.

overlap_dup_comparison.families %>%
  filter(dir %in% c("down", "unshifted")) %>%
  filter(family %in% minFamilies.down) %>%
  ggplot() +
  geom_density(aes(linetype = set,  x = diff_proportion, colour = dir), size = 0.75) +
  scale_colour_manual(name = "Expression shift", 
                      limits = c("down", "unshifted"), 
                      values = c("dodgerblue", "grey30")) +
  scale_linetype_manual(name = "Tissue specificity", 
                        limits = c("unshifted", "any", "max5_tissues", "all_tissues"), 
                        values = c("solid", "solid", "dotted", "dashed")) +
  guides(colour = "none", linetype = "none") +
  labs(y = "Density", 
       x = "Difference in TE content between duplicates") +
  facet_wrap(~family, ncol = 4, scales = "free") +
  theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"), 
        axis.text = element_text(size = 8, colour = "black"),
        line = element_line(colour = "black"))
## Warning: Removed 384 rows containing non-finite values (stat_density).

shifts_TE_overlap.families %>%
  filter(!(paste(gene_id, family) %in% zero_dups)) %>%
  filter(dir == "up") %>%
  filter(family %in% (family_overlap_wilcox_test %>%
                        filter(pval < 0.05, dir == "up") %>%
                        .$family)) %>%
  ggplot() +
  geom_density(aes(linetype = set,  x = propOverlap, colour = shift), size = 0.5) +
  scale_colour_manual(name = "Expression shift", 
                      limits = c("Ssal_shifted", "Ssal_cons"), 
                      values = c("firebrick", "grey30")) +
  scale_linetype_manual(name = "Tissue specificity", 
                        limits = c("unshifted", "any", "max5_tissues", "all_tissues"), 
                        values = c("solid", "solid", "dotted", "dashed")) +
  guides(colour = "none", linetype = "none") +
  labs(y = "Density", 
       x = "TE content in promoter (proportion)") +
  facet_wrap(~family, ncol = 2, scales = "free_y") +
  theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"), 
        axis.text = element_text(size = 8, colour = "black"),
        line = element_line(colour = "black"))

Down+cons

shifts_TE_overlap %>% 
  # calculate cumulative proportion of each category
  group_by(dir, set, shift) %>% 
  arrange(propOverlap) %>%
  mutate(cumProp = seq(0, 1, length.out = n())) %>% 
  ungroup() %>% 
  
  filter(dir == "down",
         set %in% c("any", "all_tissues")) %>%
  ggplot() +
  geom_line(aes(x=propOverlap, y=cumProp, color=shift, linetype=set), size = 1) +
  scale_color_manual(values=c(Ssal_shifted="#298ABD",Ssal_cons="grey"), name="Shift") +
  scale_linetype_manual(name = "Tissue specificity", 
                        limits = c("unshifted", "any", "max5_tissues", "all_tissues"), 
                        values = c("solid", "solid", "dotted", "dashed")) +
  xlab("TE overlap (proportion)") + ylab("Cumulative proportion") +
  coord_cartesian(xlim=c(0,1)) +
  theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"),
        strip.background = element_blank())
## Warning: Removed 1 row(s) containing missing values (geom_path).

#ggsave("Figures/TEOverlapCumPropDown.pdf", width = 5, height = 3, units = "in")
shifts_TE_overlap.families %>%
  filter(!(paste(gene_id, family) %in% zero_dups)) %>%
  
  group_by(dir, set, shift, family) %>% 
  arrange(propOverlap) %>%
  mutate(cumProp = seq(0, 1, length.out = n())) %>% 
  ungroup() %>% 
  filter(dir == "down", set %in% c("any", "all_tissues")) %>%
  filter(family %in% minFamilies.down) %>%
  mutate(family = factor(family, levels = minFamilies.down)) %>%
  ggplot() +
  geom_line(aes(x=propOverlap, y=cumProp, color=shift, linetype=set), size = 0.75) +
  scale_color_manual(values=c(Ssal_shifted="#298ABD",Ssal_cons="grey"), name="") +
  scale_linetype_manual(name = "Tissue specificity", 
                        limits = c("unshifted", "any", "max5_tissues", "all_tissues"), 
                        values = c("solid", "solid", "dotted", "dashed")) +
  xlab("TE overlap (proportion)") + ylab("Cumulative proportion") +
  coord_cartesian(xlim=c(0,1)) +
  guides(colour = "none", linetype = "none") +
  facet_wrap(~family, ncol = 4) +
  theme_bw() + 
  theme(text = element_text(size = 8, colour = "black"),
        strip.background = element_blank())

#ggsave("Figures/TEfamilyOverlapCumPropDown.pdf", width = 7.5, height = 8, units = "in")

Up+cons

shifts_TE_overlap.families %>%
  filter(!(paste(gene_id, family) %in% zero_dups)) %>%
  
  group_by(dir, set, shift, family) %>% 
  arrange(propOverlap) %>%
  mutate(cumProp = seq(0, 1, length.out = n())) %>% 
  ungroup() %>% 
  
  filter(dir == "up", set %in% c("any", "max5_tissues", "all_tissues")) %>%
  filter(family %in% minFamilies.up) %>%
  mutate(family = factor(family, levels = minFamilies.up)) %>%
  ggplot() +
  geom_line(aes(x=propOverlap, y=cumProp, color=shift, linetype=set), size = 0.75) +
  scale_colour_manual(name = "Expression shift", 
                      limits = c("Ssal_shifted", "Ssal_cons"), 
                      values = c("firebrick", "grey")) +
  scale_linetype_manual(name = "Tissue specificity", 
                        limits = c("unshifted", "any", "max5_tissues", "all_tissues"), 
                        values = c("solid", "solid", "dotted", "dashed")) +
  xlab("TE overlap (proportion)") + ylab("Cumulative proportion") +
  coord_cartesian(xlim=c(0,1)) +
  guides(colour = "none", linetype = "none") +
  facet_wrap(~family, ncol = 4) +
  theme_bw() +
  theme(text = element_text(size = 8, colour = "black"),
        strip.background = element_blank())

#ggsave("Figures/TEfamilyOverlapCumPropUp.pdf", width = 7.5, height = 6, units = "in")

plot stat annotation

# N dup orthogroups in family with shift up or down, any tissues
# Down
kable(filter(family_count.down, n >= 10))
family n
DTT 640
RIJ 600
simple 577
DTX 447
XXX 427
RSX 241
RII 70
DTA 59
DTB 58
RST 55
RLX 33
RLG 29
RIL 27
RIT 27
RSS 16
RYX 13
# Up
kable(filter(family_count.up, n >= 10))
family n
DTT 226
RIJ 225
simple 191
DTX 165
XXX 158
RSX 99
RII 36
DTA 25
RST 24
DTB 17
RIL 15
# P-values for wilcox tests between cons and shift dup TE overlap proportions
# Down
# family_overlap_wilcox_test %>%
#   filter(dir == "down",
#          set %in% c("any", "all_tissues"),
#          family %in% minFamilies.down) %>%
#   mutate(pval = round(pval, 4)) %>%
#   kable()

family_overlap_wilcox_test %>%
  filter(dir == "down",
         set %in% c("any", "all_tissues"),
         family %in% minFamilies.down, 
         pval < 0.05) %>%
  mutate(pval = round(pval, 4)) %>%
  kable()
dir set family pval
down any RIL 0.0231
# Up
family_overlap_wilcox_test %>%
  filter(dir == "up",
         set %in% c("any", "all_tissues", "max5_tissues"),
         family %in% minFamilies.up,
         pval < 0.05) %>%
  mutate(pval = round(pval, 4)) %>%
  kable()
dir set family pval
up max5_tissues RII 0.0078
up all_tissues RSX 0.0338
up all_tissues XXX 0.0159
up any RII 0.0125