experiment 16S rRNA GENE data

# Message and warning turned off for rendering
# Analyse par phyloseq
library(phyloseq)
library(ggplot2)
library(phytools)
library(dplyr)
library(gridExtra)
library(usedist)
library(PMCMRplus)
library(multcompView)
library(picante)

# Warning turned off for rendering
## Folders, Themes, colors
source("prelude.R")

# Setting up directories
data_dir <-paste0(data_dir_path,"xptransmission")
output_dir <- paste0(output_dir_path,"Figure6_transmission")

# Importing data
# Defining the directory containing the data to import
setwd(data_dir)

#######################
#   Ordination plots  #
#######################
# Working on abundance data from 100 rarefactions
comm<-read.table("xptrans_fichier_allcom.txt",header=TRUE,row.names=1)
# Getting rid of OTUs with zero abundance
comm<-comm %>% select_if((function(col) is.numeric(col) && sum(col) > 0))
comm<-t(comm)
#Importing metadata
metadata<-read.csv("nmt3_metadata_XP_transa.csv",sep=";",header=TRUE,row.names=1)
taxmat<-read.csv("nmt3_Gal18_abondance_tax.csv",sep=";",header=TRUE,row.names=1)
taxmat<-as.matrix(taxmat)
OTU<-otu_table(comm, taxa_are_rows = TRUE)
TAX<-tax_table(taxmat)
sample_data<-sample_data(metadata)
#Importing phylogeny
phy<-read.tree("nmt3_sequences_mpr.tree")
#Rooting the tree using midpoint rooting
phy<-midpoint.root(phy)
physeq.ini = merge_phyloseq(OTU,sample_data,TAX,phy)

##Setting the working directory for output files
setwd(output_dir)


# We keep OTUs seen more than 5 times in 30% of samples
MTT <- filter_taxa(physeq.ini, function(x) sum(x > 5) > (0.3*length(x)), TRUE)
# How many OTUs in our starting data ?
length(row.names(otu_table(physeq.ini)))

## [1] 648

# How many OTUs have been filtered ?
length(row.names(otu_table(MTT)))

## [1] 382

# Il y a 382 OTUs sur 648 qui passent le filtre d'abondance et de prévalence.

# We convert into relative abundance and filter OTUs with abundance of more than 0.001%
# This filtering step should not further reduce the number of OTUs
count2proportion <- function(x){
    x[(x / sum(x)) < (1e-5)] <- 0
    return(x)
}
MTT<-transform_sample_counts(MTT,fun=count2proportion)

ordu<-ordinate(MTT, "PCoA", "unifrac", weighted=TRUE)
pw<-plot_ordination(MTT, ordu, shape="XP_Tissue")
plotw<-pw+geom_point(size=3,stroke=2)+scale_shape_manual(values=c(21,22,25),guide_legend(title=""))
plotw<-plotw+ggtitle("Community structure")+theme_npgray()+theme(legend.position="top")+scale_color_grey(guide_legend(title=""))+ stat_ellipse(type="norm",linetype=2)
plotw

ggsave("MDSPCOA_weighted_UNIFRAC.pdf",width=8,height=6)
#Removing the legend for the next plot with grid arrange
plotwf<-plotw+guides(shape=FALSE)

ordub<-ordinate(MTT, "PCoA", "unifrac", weighted=FALSE)
puw<-plot_ordination(MTT, ordub, shape="XP_Tissue")
plotuw<-puw+geom_point(size=3,stroke=2)+scale_shape_manual(values=c(21,22,25),guide_legend(title=""))

plotuw<-plotuw+ggtitle("Community membership")+scale_color_grey(guide_legend(title=""))+theme_npgray()+theme(legend.position="top")+ stat_ellipse(type="norm",linetype=2)
plotuw

ggsave("MDSPCOA_unweighted_top_UNIFRAC.pdf",width=9,height=7)

#Removing the legend for the next plot with grid arrange
plotuwf<-plotuw+guides(shape=FALSE)

grid.arrange(plotuwf, plotwf, ncol=2)

# Saving plot as a pdf file
#pdf("plotMDSPCOA_wuw_grey.pdf",width=12,height=6)
#print(grid.arrange(plotuwf, plotwf, ncol=2))
#dev.off()


###########################################
# Comparing distances in ordination plots #
###########################################
#Loading the required function from phyloseq_extended

source("https://raw.githubusercontent.com/mahendra-mariadassou/phyloseq-extended/master/R/graphical_methods.R")

## Loading required package: scales

## Loading required package: reshape2

p1 <- plot_samples(MTT, ordinate(MTT, "MDS", "cc"), color = "XP_Tissue") + ggtitle("MDS + Jaccard") +guides(color=FALSE)
p2 <- plot_samples(MTT, ordinate(MTT, "MDS", "unifrac"), color = "XP_Tissue") + ggtitle("MDS + UniFrac") +guides(color=FALSE)
p3 <- plot_samples(MTT, ordinate(MTT, "MDS", "bray"), color = "XP_Tissue") + ggtitle("MDS + Bray-Curtis") +guides(color=FALSE)
p4 <- plot_samples(MTT, ordinate(MTT, "MDS", "wunifrac"), color = "XP_Tissue") + ggtitle("MDS + weighted UniFrac") +guides(color=FALSE)
grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)

#### Boxplot of Unifrac and Weighted Unifrac distances
#Unifrac distances grouped by tissues
unifrac<-distance(physeq.ini,method="unifrac")
df_unifrac_XP_tissue<-dist_groups(unifrac,metadata$XP_Tissue)

# Significance of unweighted unifrac distances grouped by tissues
kw_unifrac_XP_tissue<-kruskal.test(df_unifrac_XP_tissue$Distance, df_unifrac_XP_tissue$Label,data=df_unifrac_XP_tissue)
pp_uni<-kwAllPairsDunnTest(x=df_unifrac_XP_tissue$Distance, df_unifrac_XP_tissue$Label,p.adjust.method="bonferroni")

# Making letters matching the significance level
mymat_uni<-tri.to.squ(pp_uni$p.value)
myletters_uni<- multcompLetters(mymat_uni,compare="<=",threshold=0.05,Letters=letters)

# Store the letters in a dataframe
myletters_uni_df <- data.frame(Sample=names(myletters_uni$Letters),letter = myletters_uni$Letters )

#Boxplot with points
ggplot(df_unifrac_XP_tissue,aes(x=Label,y=Distance))+geom_jitter(width=0.2,height=0.01,size=3,colour="darkgrey",alpha=0.9)+geom_boxplot(outlier.colour=NA,alpha=0.1)+geom_text(data = myletters_uni_df, aes(label = letter, x=Sample,y = 0.81 ),size=6,fontface="bold")+scale_x_discrete(limits=c("Between Eggs and Feces","Between Eggs and Skin","Between Feces and Skin","Within Eggs","Within Feces","Within Skin"),labels=c("Eggs-Feces","Eggs-Skin","Feces-Skin","Eggs","Skin","Feces"))+ylab("Unifrac")+xlab("Samples")+theme_npgray()

ggsave("plot_unifrac_distances_samples.pdf",width=12,height=8)

#Boxplot only
ggplot(df_unifrac_XP_tissue,aes(x=Label,y=Distance))+geom_boxplot(outlier.colour=NA,alpha=0.1)+geom_text(data = myletters_uni_df, aes(label = letter, x=Sample,y = 0.81 ),size=6,fontface="bold")+scale_x_discrete(limits=c("Between Eggs and Feces","Between Eggs and Skin","Between Feces and Skin","Within Eggs","Within Feces","Within Skin"),labels=c("Eggs-Feces","Eggs-Skin","Feces-Skin","Eggs","Skin","Feces"))+ylab("Unifrac")+xlab("Samples")+theme_npgray()+guides(fill=FALSE)

#Weighted Unifrac distances grouped by tissues
wunifrac<-distance(physeq.ini,method="wunifrac")
df_wunifrac_XP_tissue<-dist_groups(wunifrac,metadata$XP_Tissue)

# Significance of weighted unifrac distances grouped by tissues
kw_wunifrac_XP_tissue<-kruskal.test(df_wunifrac_XP_tissue$Distance, df_unifrac_XP_tissue$Label,data=df_wunifrac_XP_tissue)
pp_wuni<-kwAllPairsDunnTest(x=df_wunifrac_XP_tissue$Distance, df_wunifrac_XP_tissue$Label,p.adjust.method="bonferroni")


#With points
ggplot(df_wunifrac_XP_tissue,aes(x=Label,y=Distance))+geom_jitter(width=0.2,height=0.01,size=3,colour="darkgrey",alpha=0.9)+geom_boxplot(outlier.colour=NA,alpha=0.1)+scale_x_discrete(limits=c("Between Eggs and Feces","Between Eggs and Skin","Between Feces and Skin","Within Eggs","Within Feces","Within Skin"),labels=c("Eggs-Feces","Eggs-Skin","Feces-Skin","Eggs","Skin","Feces"))+ylab("Weighted Unifrac")+xlab("Samples")+theme_npgray()

ggsave("plot_wunifrac_distances_samples.pdf",width=12,height=8)

#Boxplot only
ggplot(df_wunifrac_XP_tissue,aes(x=Label,y=Distance,fill=Label))+geom_boxplot(outlier.colour=NA,alpha=0.1)+scale_x_discrete(limits=c("Between Eggs and Feces","Between Eggs and Skin","Between Feces and Skin","Within Eggs","Within Feces","Within Skin"),labels=c("Eggs-Feces","Eggs-Skin","Feces-Skin","Eggs","Skin","Feces"))+ylab("Weighted Unifrac")+xlab("Samples")+theme_npgray()+guides(fill=FALSE)

#Centroid distances relative to tissues
df_centroids_unifrac_XP_tissue<-dist_to_centroids(unifrac, metadata$XP_Tissue)






####################################################
# Figure barplot proportion of most abundant phyla #
####################################################

#2 - We fuse taxa at the phylum level
physeq.ini.Rg<-tax_glom(physeq.ini, taxrank="Phylum")


#3 - Transforming abundance counts into proportions
physeq.ini.Rgm = transform_sample_counts(physeq.ini.Rg, function(x) 100 * x/sum(x))

#4 - Filtering OTUs
#We keep OTUs with at least 1% relative abundance in a given sample
physeq.ini.Rfgm = filter_taxa(physeq.ini.Rgm,function(x) mean(x)>1,TRUE)

#5 - We merge samples corresponding to the same Developmental Stage category
# Transform all variables to factors for merging (trick required for R4.0)
df <- as.data.frame(lapply(sample_data(physeq.ini.Rfgm),function (y) if(class(y)!="factor" ) as.factor(y) else y),stringsAsFactors=T)
row.names(df) <- sample_names(physeq.ini.Rfgm)
sample_data(physeq.ini.Rfgm) <- sample_data(df)
#Merging
physeq.ini.Rfgm2 = merge_samples(physeq.ini.Rfgm, "XP_Tissue")

#6 - We recompute the relative abundance because the sample merge sums up the relative abundance of each merged sample
physeq.ini.Rf2gm2 = transform_sample_counts(physeq.ini.Rfgm2, function(x) 100 * x/sum(x))


# We need to recover the good legends corresponding to the category used for merging samples
sample_data(physeq.ini.Rf2gm2)$XP_Tissue <- levels(as.factor(sample_data(physeq.ini.Rg)$XP_Tissue))

#7 - We can go on with the abundance plot itself
# Defining the output dir
setwd(output_dir)
# Plotting
plot_tissue_compo<-plot_bar(physeq.ini.Rf2gm2, "XP_Tissue", fill = "Phylum")+ ylab("Percentage of Sequences")+ scale_fill_manual(values=jcolors)+theme_npgray()+theme(axis.text.x = element_text(angle = 0,hjust=0.5))+scale_x_discrete(limits=c("Eggs","Skin","Feces"), labels=c("Eggs","Skin","Feces"))+xlab("Tissues")
plot_tissue_compo

ggsave("plot_tissue_compo.pdf",width=8,height=6)

#8 Plotting by phylum
p<-plot_bar(physeq.ini.Rf2gm2,facet_grid=~Phylum,fill="Phylum")
p2<-plot_bar(physeq.ini.Rf2gm2, "XP_Tissue", fill = "Phylum")+geom_label(aes(label= scales::number(Abundance, accuracy = 0.1),fill = factor(Phylum)), colour = "white", fontface = "bold")+ ylab("Percentage of Sequences")+ scale_fill_manual(values=jcolors)+theme_npgray()+theme(axis.text.x = element_text(angle = 0,hjust=0.5))+scale_x_discrete(limits=c("Eggs","Skin","Feces"), labels=c("Eggs","Skin","Feces"))+xlab("Tissues")+facet_wrap(~Phylum)
p2

ggsave("plot_XP_Tissue_compophylum.pdf",width=9,height=5)

##################
## Family level ##
##################

#2 - We fuse taxa at the family level
physeq.ini.Rg<-tax_glom(physeq.ini, taxrank="Family")
#3 - Transforming abundance counts into proportions
physeq.ini.Rgm = transform_sample_counts(physeq.ini.Rg, function(x) 100 * x/sum(x))

#4 - Filtering OTUs
#We keep OTUs with at least 1% relative abundance in a given sample
physeq.ini.Rfgm = filter_taxa(physeq.ini.Rgm,function(x) mean(x)>0.5,TRUE)

#5 - We merge samples corresponding to the same Developmental Stage category
# Transform all variables to factors for merging (trick required for R4.0)
df <- as.data.frame(lapply(sample_data(physeq.ini.Rfgm),function (y) if(class(y)!="factor" ) as.factor(y) else y),stringsAsFactors=T)
row.names(df) <- sample_names(physeq.ini.Rfgm)
sample_data(physeq.ini.Rfgm) <- sample_data(df)
#Merging
physeq.ini.Rfgm2 = merge_samples(physeq.ini.Rfgm, "XP_Tissue")

#6 - We recompute the relative abundance because the sample merge sums up the relative abundance of each merged sample
physeq.ini.Rf2gm2 = transform_sample_counts(physeq.ini.Rfgm2, function(x) 100 * x/sum(x))

# We need to recover the good legends corresponding to the category used for merging samples
sample_data(physeq.ini.Rf2gm2)$XP_Tissue <- levels(as.factor(sample_data(physeq.ini.Rg)$XP_Tissue))

#7 - We can go on with the abundance plot itself
plot_bar(physeq.ini.Rf2gm2, "XP_Tissue", fill = "Phylum")+geom_text(aes(label= scales::number(Abundance, accuracy = 0.1)),hjust = -0.2, fontface = "bold")+ylab("Percentage of Sequences")+theme_npgray()+theme(axis.text.x = element_text(angle = 0,hjust=0.5),legend.position="top",strip.text=element_text(face="bold",size=rel(0.7)),strip.background=element_rect(fill="white"))+scale_x_discrete(limits=c("Eggs","Skin","Feces"), labels=c("Eggs","Skin","Feces"))+facet_wrap(~Family,ncol=6)+coord_flip()+xlab("Tissues")

############################
# Plot on selected phyla   #
############################
physeq.ini.Rgm = transform_sample_counts(physeq.ini.Rg, function(x) 100 * x/sum(x))

#Bacteroidetes
bacteroidetes<- subset_taxa(physeq.ini.Rgm, Phylum=="Bacteroidetes")
pbacteroidetes<-plot_tree(bacteroidetes, color="XP_Tissue", label.tips="Family", size="abundance", plot.margin=0.3,text.size=4)
pbacteroidetes

#Firmicutes
firmicutes<- subset_taxa(physeq.ini.Rg, Phylum=="Firmicutes")
pfirmicutes<-plot_tree(firmicutes, color="XP_Tissue", label.tips="Family", size="abundance", plot.margin=0.3,text.size=4)
pfirmicutes

#Alphaproteobacteria
Aproteobacteria<- subset_taxa(physeq.ini.Rg, Class=="Alphaproteobacteria")
pAproteobacteria<-plot_tree(Aproteobacteria, color="XP_Tissue", label.tips="Family", size="abundance", plot.margin=0.3,text.size=4)
pAproteobacteria

#Betaproteobacteria
Bproteobacteria<- subset_taxa(physeq.ini.Rg, Class=="Betaproteobacteria")
pBproteobacteria<-plot_tree(Bproteobacteria, color="XP_Tissue", label.tips="Family", size="abundance", plot.margin=0.3,text.size=4)
pBproteobacteria

#Deltaproteobacteria
Dproteobacteria<- subset_taxa(physeq.ini.Rg, Class=="Deltaproteobacteria")
pDproteobacteria<-plot_tree(Dproteobacteria, color="XP_Tissue", label.tips="Family", size="abundance", plot.margin=0.3,text.size=4)
pDproteobacteria

#Gammaproteobacteria
Gproteobacteria<- subset_taxa(physeq.ini.Rg, Class=="Gammaproteobacteria")
pGproteobacteria<-plot_tree(Gproteobacteria, color="XP_Tissue", label.tips="Family", size="abundance", plot.margin=0.3,text.size=4)
pGproteobacteria

##############################################
# Test the significance of unifrac distances #
##############################################
# beta diversity: compute unweighted unifrac distance
uudist<-distance(physeq.ini,method="uunifrac")
adonis(uudist~XP_Tissue,data=metadata)

## 
## Call:
## adonis(formula = uudist ~ XP_Tissue, data = metadata) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## XP_Tissue  2    1.9594 0.97969  14.891 0.46693  0.001 ***
## Residuals 34    2.2369 0.06579         0.53307           
## Total     36    4.1963                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# beta diversity: compute weighted unifrac distance
uwdist<-distance(physeq.ini,method="wunifrac")
adonis(uwdist~XP_Tissue,data=metadata)

## 
## Call:
## adonis(formula = uwdist ~ XP_Tissue, data = metadata) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## XP_Tissue  2    3.5834 1.79169  41.849 0.71112  0.001 ***
## Residuals 34    1.4557 0.04281         0.28888           
## Total     36    5.0390                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# unweighted unifrac distances grouped by tissues
kw_unifrac_XP_tissue<-kruskal.test(df_unifrac_XP_tissue$Distance, df_unifrac_XP_tissue$Label,data=df_unifrac_XP_tissue)
kw_unifrac_XP_tissue

## 
##  Kruskal-Wallis rank sum test
## 
## data:  df_unifrac_XP_tissue$Distance and df_unifrac_XP_tissue$Label
## Kruskal-Wallis chi-squared = 430.63, df = 5, p-value < 2.2e-16

pp_uni<-kwAllPairsDunnTest(x=df_unifrac_XP_tissue$Distance, df_unifrac_XP_tissue$Label,p.adjust.method="bonferroni")
pp_uni

## 
##  Pairwise comparisons using Dunn's all-pairs test

## data: df_unifrac_XP_tissue$Distance and df_unifrac_XP_tissue$Label

##                        Between Eggs and Feces Between Eggs and Skin
## Between Eggs and Skin  < 2e-16                -                    
## Between Feces and Skin 1.000                  < 2e-16              
## Within Eggs            7.2e-14                1.000                
## Within Feces           < 2e-16                0.019                
## Within Skin            < 2e-16                0.474                
##                        Between Feces and Skin Within Eggs Within Feces
## Between Eggs and Skin  -                      -           -           
## Between Feces and Skin -                      -           -           
## Within Eggs            2.3e-15                -           -           
## Within Feces           < 2e-16                1.000       -           
## Within Skin            < 2e-16                1.000       1.000

## 
## P value adjustment method: bonferroni

## alternative hypothesis: two.sided

# weighted unifrac distances grouped by tissues
kw_wunifrac_XP_tissue<-kruskal.test(df_wunifrac_XP_tissue$Distance, df_unifrac_XP_tissue$Label,data=df_wunifrac_XP_tissue)
kw_wunifrac_XP_tissue

## 
##  Kruskal-Wallis rank sum test
## 
## data:  df_wunifrac_XP_tissue$Distance and df_unifrac_XP_tissue$Label
## Kruskal-Wallis chi-squared = 447.26, df = 5, p-value < 2.2e-16

pp_wuni<-kwAllPairsDunnTest(x=df_wunifrac_XP_tissue$Distance, df_wunifrac_XP_tissue$Label,p.adjust.method="bonferroni")
pp_wuni

## 
##  Pairwise comparisons using Dunn's all-pairs test

## data: df_wunifrac_XP_tissue$Distance and df_wunifrac_XP_tissue$Label

##                        Between Eggs and Feces Between Eggs and Skin
## Between Eggs and Skin  <2e-16                 -                    
## Between Feces and Skin 1.000                  <2e-16               
## Within Eggs            <2e-16                 1.000                
## Within Feces           <2e-16                 0.297                
## Within Skin            <2e-16                 1.000                
##                        Between Feces and Skin Within Eggs Within Feces
## Between Eggs and Skin  -                      -           -           
## Between Feces and Skin -                      -           -           
## Within Eggs            <2e-16                 -           -           
## Within Feces           <2e-16                 0.068       -           
## Within Skin            <2e-16                 1.000       0.358

## 
## P value adjustment method: bonferroni

## alternative hypothesis: two.sided

xptransmission_analyse1_phyloseq

N. Pollet

07/05/2020

METATETARD

Diversity analysis on

experiment 16S rRNA GENE data