# Messages and warnings turned off for rendering
#Analysis using Deseq2 and phyloseq
library(phyloseq)
library(DESeq2)
library(ggplot2)
library(phytools)
library(dplyr)
# Warnings turned off for rendering
## Folders, Themes, colors
source("prelude.R")
# Setting up directories
data_dir<-paste0(data_dir_path,"xpdiet")
output_dir<-paste0(output_dir_path,"Figure5_diet")
# Importing data
# Defining the directory containing the data to import
setwd(data_dir)
# Importing abundance data
comm<-read.table("nmt3_abondance_xpfood_tpn_notax.csv",header=TRUE,row.names=1,sep=";")
# Getting rid of OTUs with zero abundance
comm<-comm %>% select_if((function(col) is.numeric(col) && sum(col) > 0))
comm<-t(comm)
#Importing metadata
metadata<-read.csv("nmt3_2_metadata_food.csv",sep=";",header=TRUE,row.names=1)
#Importing taxonomy
taxmat<-read.csv("nmt3_Gal18_abondance_tax.csv",sep=";",header=TRUE,row.names=1)
taxmat<-as.matrix(taxmat)
#To build the phyloseq data set
OTU<-otu_table(comm, taxa_are_rows = TRUE)
TAX<-tax_table(taxmat)
sample_data<-sample_data(metadata)
#Importing phylogeny
phy<-read.tree("nmt3_sequences_mpr.tree")
#Rooting the tree using midpoint rooting
phy<-midpoint.root(phy)
physeq.ini = merge_phyloseq(OTU,sample_data,TAX,phy)
# We keep OTUs seen more than 100 times in 20% of samples
physeq.ini.f<-filter_taxa(physeq.ini, function(x) sum(x > 10) >= (0.2*length(x)), TRUE)
# How many OTUs were found ?
length(row.names(otu_table(physeq.ini)))
## [1] 627
# How many OTUs have been filtered ?
length(row.names(otu_table(physeq.ini.f)))
## [1] 243
# Il y a 243 OTUs sur 627 qui passent le filtre d'abondance et de prévalence.
# Performing the differential analysis using DESeq2
diagdds <- phyloseq_to_deseq2(physeq.ini.f, ~ Class)
## converting counts to integer mode
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
diagdds <- DESeq(diagdds, test="Wald", fitType="local")
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## -- replacing outliers and refitting for 10 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
res <- results(diagdds, cooksCutoff = FALSE)
alpha <- 0.01
sigtab <- res[which(res$padj < alpha), ]
sigtab <- cbind(as(sigtab, "data.frame"), as(tax_table(physeq.ini)[rownames(sigtab), ], "matrix"))
sigtab <- sigtab %>% tibble::rownames_to_column(var="OTU_ID")
head(sigtab)
## OTU_ID baseMean log2FoldChange lfcSE stat pvalue
## 1 Cluster_686 11.68072 5.211772 1.0020760 5.200975 1.982453e-07
## 2 Cluster_148 55.66419 -9.104689 2.1472060 -4.240249 2.232716e-05
## 3 Cluster_175 33.90601 7.557872 0.9826646 7.691202 1.457590e-14
## 4 Cluster_184 44.74446 -4.348354 1.0100254 -4.305192 1.668407e-05
## 5 Cluster_364 33.64367 8.815808 1.0698899 8.239921 1.723250e-16
## 6 Cluster_522 18.00732 6.908281 1.0783529 6.406327 1.490674e-10
## padj Domain Phylum Class Order
## 1 1.120317e-06 Bacteria Bacteroidetes Cytophagia Cytophagales
## 2 9.195763e-05 Bacteria Bacteroidetes Sphingobacteriia Sphingobacteriales
## 3 4.427430e-13 Bacteria Bacteroidetes Sphingobacteriia Sphingobacteriales
## 4 6.990048e-05 Bacteria Bacteroidetes Sphingobacteriia Sphingobacteriales
## 5 1.046874e-14 Bacteria Bacteroidetes Sphingobacteriia Sphingobacteriales
## 6 1.665436e-09 Bacteria Bacteroidetes Sphingobacteriia Sphingobacteriales
## Family Genus Species
## 1 Cytophagaceae Sporocytophaga unknownspecies
## 2 Chitinophagaceae Sediminibacterium Multi-affiliation
## 3 Saprospiraceae unknowngenus unknownspecies
## 4 Sphingobacteriaceae Pedobacter unknownspecies
## 5 NS11-12marinegroup unknowngenus unknownspecies
## 6 NS11-12marinegroup unknowngenus unknownspecies
# Defining the output directory
setwd(output_dir)
# Phylum order
x <- tapply(sigtab$log2FoldChange, sigtab$Phylum, function(x) max(x))
x <- sort(x, TRUE)
sigtab$Phylum <- factor(as.character(sigtab$Phylum), levels=names(x))
ggplot(sigtab, aes(y=Phylum, x=log2FoldChange, color=Phylum)) + geom_point(size=4,colour="black")+geom_point(size=3,alpha=0.7)+theme(axis.text.y = element_text(size=rel(0.8)))+scale_colour_manual(values=jcolors)+theme_npgray()
# Family order
ggplot(sigtab, aes(y=Family, x=log2FoldChange, color=Phylum)) + geom_point(size=4,colour="black")+geom_point(size=3,alpha=0.7)+theme(axis.text.y = element_text(size=rel(0.8)))+scale_colour_manual(values=jcolors)+theme_npgray()+facet_grid(~Phylum)+guides(colour=FALSE)
ggsave("plot_deseq_diet_by_family.pdf",width=12,height=9)
# Genus order
x <- tapply(sigtab$log2FoldChange, sigtab$Genus, function(x) max(x))
x <- sort(x, TRUE)
sigtab$Genus <- factor(as.character(sigtab$Genus), levels=names(x))
ggplot(sigtab, aes(y=reorder(Genus,log2FoldChange), x=log2FoldChange, color=Phylum)) + geom_point(size=4,colour="black") + geom_point(size=3,alpha=0.7)+theme(axis.text.y = element_text(size=rel(0.8)))+scale_colour_manual(values=jcolors)+theme_npgray()+guides(colour=FALSE)
ggsave("plot_deseq_diet.pdf",width=12,height=14)
# Presentation using a combinaison of taxonomy and OTU ID
sigtab<-mutate(sigtab,otu2genus=paste(Family,Genus,OTU_ID,sep="_"))
ggplot(sigtab, aes(y=reorder(sigtab$otu2genus,log2FoldChange), x=log2FoldChange, color=Phylum)) + geom_point(size=4,colour="black") + geom_point(size=3,alpha=0.7)+theme(axis.text.y = element_text(size=rel(0.8)))+scale_colour_manual(values=jcolors)+theme_npgray()+guides(colour=FALSE)+ylab("OTUs")
ggsave("plot_deseq_diet_OTU2tax.pdf",width=14,height=14)
# Presentation using only OTU ID
ggplot(sigtab, aes(y=reorder(sigtab$OTU_ID,log2FoldChange), x=log2FoldChange, color=Phylum)) + geom_point(size=4,colour="black") + geom_point(size=3,alpha=0.7)+theme(axis.text.y = element_text(size=rel(0.8)))+scale_colour_manual(values=jcolors)+theme_npgray()+facet_grid(~Phylum)+guides(colour=FALSE)+ylab("OTUs")
ggsave("plot_deseq_diet_OTU.pdf",width=12,height=14)
# Selection of the differential OTUs
my_de_subset <- subset(otu_table(physeq.ini.f),rownames(otu_table(physeq.ini.f)) %in% sigtab$OTU_ID)
physeq.de.ini <- merge_phyloseq(my_de_subset, tax_table(physeq.ini.f), sample_data(physeq.ini.f),phy_tree(physeq.ini.f))
# Plotting the Heatmap using the differential OTUs
# Heatmap without clustering.
plot_heatmap(physeq.de.ini,sample.label="Class",low="#66CCFF", high="#000033", na.value="white")
ggsave("plot_heatmap_de_noclustering.pdf",height=12,width=12)
# Heatmap using unifrac PCoA for community composition
plot_heatmap(physeq.de.ini,method="PCoA",distance="unifrac",sample.label="Class",low="#66CCFF", high="#000033", na.value="white") + ylab("OTUs")+ theme_npgray()+ theme(axis.text.x = element_text(angle = 45, hjust = 1))+facet_grid(~Feed,scale="free_x", space="free_x")
ggsave("plot_heatmap_de_clustering.pdf",height=12,width=12)