Analysis of Phylum and Family abundances on developmental serie 16S rRNA GENE data

# Loading packages
library(phyloseq)
library(ggplot2)
library(phytools)

## Loading required package: ape

## Loading required package: maps

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Folders, Themes, colors
source("prelude.R")

# Setting up directories
data_dir <-paste0(data_dir_path,"xpdev")
output_dir <- paste0(output_dir_path,"Figure2_dev_abundance")

####################################################
#1 Figure barplot proportion of most abundant phyla #
####################################################
# Importing data
# Defining the directory containing the data to import
setwd(data_dir)

# Working on abundance data from 100 rarefactions
comm<-read.table("xpdev_fichier_allcom.txt", header=TRUE, row.names=1)
# Getting rid of OTUs with zero abundance
comm<-comm %>% select_if((function(col) is.numeric(col) && sum(col) > 0))
comm<-t(comm)
#Importing metadata
metadata<-read.csv("nmt3_2_metadata_xp_dev.csv",sep=";",header=TRUE,row.names=1)
# Sorting by developmental stage
metadata$LifeStage_sorted<-factor(metadata$LifeStage,levels=c("Premetamorph","Prometamorph","Metamorph","Froglet","Adult"))

taxmat<-read.csv("nmt3_Gal18_abondance_tax.csv",sep=";",header=TRUE,row.names=1)
taxmat<-as.matrix(taxmat)
OTU <- otu_table(comm, taxa_are_rows = TRUE)
TAX <- tax_table(taxmat)
sample_data <- sample_data(metadata)
#Importing phylogeny
phy<-read.tree("nmt3_sequences_mpr.tree")
#Rooting the tree using midpoint rooting
phy<-midpoint.root(phy)
physeq.ini <- merge_phyloseq(OTU,sample_data,TAX,phy)

#2 - We fuse taxa at the phylum level
physeq.ini.Rg<-tax_glom(physeq.ini, taxrank="Phylum")

#3 - Transforming abundance counts into proportions
physeq.ini.Rgm <- transform_sample_counts(physeq.ini.Rg, function(x) 100 * x/sum(x))

#4 - Filtering OTUs
#We keep OTUs with at least 0.5% relative abundance in a given sample
physeq.ini.Rfgm <- filter_taxa(physeq.ini.Rgm,function(x) mean(x)>0.5,TRUE)

#5 - We merge samples corresponding to the same Developmental Stage category
# Transform all variables to factors for merging
df <- as.data.frame(lapply(sample_data(physeq.ini.Rfgm),function (y) if(class(y)!="factor" ) as.factor(y) else y),stringsAsFactors=T)
row.names(df) <- sample_names(physeq.ini.Rfgm)
sample_data(physeq.ini.Rfgm) <- sample_data(df)
#Merging
physeq.ini.Rfgm2 <- merge_samples(physeq.ini.Rfgm, "LifeStage")

#6 - We recompute the relative abundance because the sample merge sums up the relative abundance of each merged sample
physeq.ini.Rf2gm2 <- transform_sample_counts(physeq.ini.Rfgm2, function(x) 100 * x/sum(x))

# We need to recover the good legends corresponding to the category used for merging samples
sample_data(physeq.ini.Rf2gm2)$LifeStage <- levels(as.factor(sample_data(physeq.ini.Rg)$LifeStage))

#7 - We can go on with the abundance plot itself
# Defining the output dir
setwd(output_dir)

plot_stade_compo<-plot_bar(physeq.ini.Rf2gm2, "LifeStage", fill = "Phylum")+ ylab("Percentage of Sequences")+ scale_fill_manual(values=jcolors)+theme_nb()+theme(axis.text.x = element_text(angle = 0,hjust=0.5))+scale_x_discrete(limits=c("Premetamorph","Prometamorph","Metamorph","Froglet","Adult"), labels=c("Pre","Pro","Met","Fro","Adu"))+xlab("Developmental stage")

## Warning in psmelt(physeq): The sample variables: 
## Sample
##  have been renamed to: 
## sample_Sample
## to avoid conflicts with special phyloseq plot attribute names.

## Warning in psmelt(physeq): The sample variables: 
## Species
##  have been renamed to: 
## sample_Species
## to avoid conflicts with taxonomic rank names.

plot_stade_compo

ggsave("plot_stade_compo.pdf",width=6,height=5)

#8 Plot by phylum
p<-plot_bar(physeq.ini.Rf2gm2,x="LifeStage",y="Abundance",facet_grid=~Phylum,fill="Phylum")+scale_x_discrete(limits=c("Adult","Froglet","Metamorph","Prometamorph","Premetamorph"))+coord_flip()+scale_fill_manual(values=jcolors)+ylab("Abundance in %")+xlab("Developmental stage")+scale_fill_manual(values=jcolors)+ylab("Abundance in %")+xlab("Developmental stage")+theme_nb()+theme(axis.text.x = element_text(angle = 0,hjust=0.5))

## Warning in psmelt(physeq): The sample variables: 
## Sample
##  have been renamed to: 
## sample_Sample
## to avoid conflicts with special phyloseq plot attribute names.

## Warning in psmelt(physeq): The sample variables: 
## Species
##  have been renamed to: 
## sample_Species
## to avoid conflicts with taxonomic rank names.

## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

p<-p+geom_text(aes(x=Sample, y=80,label=(scales::label_number(accuracy=0.01)(Abundance))),fontface = "bold",colour="black")+theme(axis.text.x=element_text(size=12),strip.text=element_text(face="bold",size=12),strip.background=element_rect(fill="white",color="black"),legend.position="top")
p

ggsave("plot_stade_compophylum.pdf",width=16,height=4)


###############################################################################
#2 Figure barplot proportions of most abundant phyla according to Gram factor #
###############################################################################
# create dataframe from phyloseq object
df_physeq_ini_Rgm<- psmelt(physeq.ini.Rf2gm2)

## Warning in psmelt(physeq.ini.Rf2gm2): The sample variables: 
## Sample
##  have been renamed to: 
## sample_Sample
## to avoid conflicts with special phyloseq plot attribute names.

## Warning in psmelt(physeq.ini.Rf2gm2): The sample variables: 
## Species
##  have been renamed to: 
## sample_Species
## to avoid conflicts with taxonomic rank names.

# convert to tibble format
df_physeq_ini_Rgm =as_tibble(df_physeq_ini_Rgm)
# Simplify our data frame
df_physeq_ini_Rgm = df_physeq_ini_Rgm %>% select(OTU,Sample,LifeStage,DevelopmentalStage,Abundance,Phylum)
#convert to character
df_physeq_ini_Rgm$Phylum <- as.character(df_physeq_ini_Rgm$Phylum)

#simple way to rename phyla with < 1% abundance
df_physeq_ini_Rgm$Phylum[df_physeq_ini_Rgm$Abundance < 1] <- "< 1% abund."

# Add gram positive and gram negative information 

df_physeq_ini_Rgm <- df_physeq_ini_Rgm %>% mutate(Gram=case_when(
    Phylum=="Actinobacteria" ~ "Gram positive",
    Phylum=="Proteobacteria" ~ "Gram negative",
    Phylum=="Bacteroidetes" ~ "Gram negative",
    Phylum=="Firmicutes" ~ "Gram positive",
    Phylum=="Verrucomicrobia" ~ "Gram negative",
    Phylum=="Synergistetes" ~ "Gram negative",
    Phylum=="Tenericutes" ~ "Gram negative",
    Phylum=="Lentisphaera" ~ "Gram negative",
    Phylum=="Fusobacteria" ~ "Gram negative",
    Phylum=="< 1% abund." ~ "Gram positive"))

df_physeq_ini_Rgm <- df_physeq_ini_Rgm %>% mutate(Gram=as.factor(Gram))

ggplot(data=df_physeq_ini_Rgm, aes(x=Sample, y=Abundance, fill=Gram))+ geom_bar(aes(), stat="identity", position="stack")+geom_text(aes(x=Sample, y=Abundance,label=Phylum,size=Abundance),position=position_stack(vjust = .5),check_overlap = TRUE,fontface = "bold",colour="white")+scale_x_discrete(limits=c("Premetamorph","Prometamorph","Metamorph","Froglet","Adult"))+ylab("Abundance in %")+xlab("Developmental stage")+theme_nb()+theme(legend.position="bottom",strip.text=element_text(face="bold"),strip.background=element_rect(fill="white",colour="black",size=1))+ scale_size(range=c(1,5))+scale_fill_manual(values=c("black","darkgrey"))+guides(size=FALSE)

ggsave("plot_stade_compogram.pdf",width=12,height=12)


########################################################
#3 Figure barplot proportion of most abundant families #
########################################################
#2 - We fuse taxa at the family level
physeq.ini.Rg<-tax_glom(physeq.ini, taxrank="Family")

#3 - Transforming abundance counts into proportions
physeq.ini.Rgm <- transform_sample_counts(physeq.ini.Rg, function(x) 100 * x/sum(x))

#4 - Filtering OTUs
#We keep OTUs with at least 1% relative abundance in a given sample
physeq.ini.Rfgm <- filter_taxa(physeq.ini.Rgm,function(x) mean(x)>0.5,TRUE)

#5 - We merge samples corresponding to the same Developmental Stage category
# Transform all variables to factors for merging (trick required for R4.0)
df <- as.data.frame(lapply(sample_data(physeq.ini.Rfgm),function (y) if(class(y)!="factor" ) as.factor(y) else y),stringsAsFactors=T)
row.names(df) <- sample_names(physeq.ini.Rfgm)
sample_data(physeq.ini.Rfgm) <- sample_data(df)
#Merging
physeq.ini.Rfgm2 <- merge_samples(physeq.ini.Rfgm, "LifeStage")

#6 - We recompute the relative abundance because the sample merge sums up the relative abundance of each merged sample
physeq.ini.Rf2gm2 <- transform_sample_counts(physeq.ini.Rfgm2, function(x) 100 * x/sum(x))

#7 - Plotting by Life Stage

# create dataframe from phyloseq object to order Families like we want instead of
# the default alphabetical order
data_glom2<- psmelt(physeq.ini.Rf2gm2)

## Warning in psmelt(physeq.ini.Rf2gm2): The sample variables: 
## Sample
##  have been renamed to: 
## sample_Sample
## to avoid conflicts with special phyloseq plot attribute names.

## Warning in psmelt(physeq.ini.Rf2gm2): The sample variables: 
## Species
##  have been renamed to: 
## sample_Species
## to avoid conflicts with taxonomic rank names.

#convert to character
data_glom2$Family <- as.character(data_glom2$Family)

data_glom2$Family <- factor(data_glom2$Family, levels = c("Clostridiaceae 1", "Ruminococcaceae","Lachnospiraceae","Erysipelotrichaceae", "Family XIII","Peptostreptococcaceae", "Planococcaceae", "Rhodospirillaceae", "Desulfovibrionaceae","Multi-affiliation", "Rhodocyclaceae", "Pseudomonadaceae","Bradyrhizobiaceae","Aeromonadaceae","Neisseriaceae",  "Rikenellaceae","Bacteroidaceae", "Porphyromonadaceae","Synergistaceae","Verrucomicrobiaceae","Microbacteriaceae"))

p2<-ggplot(data=data_glom2, aes(x=Sample, y=Abundance, fill=Phylum,label=Family))+ geom_bar(aes(), stat="identity", position="stack",color="white")+scale_x_discrete(limits=c("Adult","Froglet","Metamorph","Prometamorph","Premetamorph"))+ylab("Abundance in %")+xlab("Developmental stage")+theme_nb()+scale_fill_manual(values=jcolors)+facet_wrap(~Family,nrow=3)+coord_flip()+theme(legend.position="top",strip.text=element_text(face="bold"),strip.background=element_rect(fill="white",colour="black",size=1))
p2

ggsave("plot_stade_compofamily.pdf",width=16,height=8)


##########################################################################
#4 Figure barplot proportion of most abundant phyla at the sample level ##
##########################################################################

physeq.ini.Rg<-tax_glom(physeq.ini, taxrank="Phylum")
physeq.ini.Rgm <- transform_sample_counts(physeq.ini.Rg, function(x) 100 * x/sum(x))
physeq.ini.Rfgm <- filter_taxa(physeq.ini.Rgm,function(x) mean(x)>0.5,TRUE)
physeq.ini.Rfgm2 <- transform_sample_counts(physeq.ini.Rfgm, function(x) 100 * x/sum(x))

plot_bar(physeq.ini.Rfgm2, x="Individu",fill = "Phylum")+ ylab("Percentage of Sequences")+ scale_fill_manual(values=jcolors)+theme_nb()+facet_grid(~LifeStage_sorted,scale="free_x")+theme(axis.text.x=element_blank(),strip.background=element_rect(fill="white"),strip.text=element_text(face="bold",size=14))+xlab("Samples")

## Warning in psmelt(physeq): The sample variables: 
## Sample
##  have been renamed to: 
## sample_Sample
## to avoid conflicts with special phyloseq plot attribute names.

## Warning in psmelt(physeq): The sample variables: 
## Species
##  have been renamed to: 
## sample_Species
## to avoid conflicts with taxonomic rank names.

ggsave("plot_stade_compophylum_by_sample.pdf",width=16,height=8)

xpdev_dev_abundance_phyloseq

N. Pollet

24/09/2020

METATETARD

Analysis of Phylum and Family abundances on developmental serie 16S rRNA GENE data