Contents

The main Figure from Nature Paper

The main Figure from Nature Paper

Arumugam M, Raes J, Pelletier E, Le Paslier D, Yamada T, Mende DR, Fernandes GR, Tap J, Bruls T, Batto JM, Bertalan M. … Enterotypes of the human gut microbiome. Nature. 2011;473(7346):174-80.

library("phyloseq")
library("cluster")
data("enterotype")
ps_data = subset_samples(enterotype, SeqTech == "Sanger")
ps_data
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 553 taxa and 41 samples ]
## sample_data() Sample Data:       [ 41 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 553 taxa by 1 taxonomic ranks ]
otu_table(ps_data)[1:5,1:9]
## OTU Table:          [5 taxa and 9 samples]
##                      taxa are rows
##                    AM.AD.1    AM.AD.2  AM.F10.T1 AM.F10.T2  DA.AD.1
## -1               0.4265035 0.33226194 0.48195467 0.3885831 0.533604
## Bacteria         0.0000000 0.00000000 0.00000000 0.0000000 0.000000
## Prosthecochloris 0.0000000 0.00000000 0.00000000 0.0000000 0.000000
## Chloroflexus     0.0000000 0.00000000 0.00000000 0.0000000 0.000000
## Dehalococcoides  0.0000000 0.00026835 0.00000959 0.0000000 0.000000
##                    DA.AD.1T    DA.AD.2   DA.AD.3  DA.AD.3T
## -1               0.62499134 0.63470800 0.7798098 0.8052677
## Bacteria         0.00000000 0.00001328 0.0000000 0.0000000
## Prosthecochloris 0.00000000 0.00000000 0.0000000 0.0000000
## Chloroflexus     0.00000081 0.00000000 0.0000000 0.0000000
## Dehalococcoides  0.00000000 0.00002030 0.0000000 0.0000000
apply(otu_table(ps_data),2,sum)
##   AM.AD.1   AM.AD.2 AM.F10.T1 AM.F10.T2   DA.AD.1  DA.AD.1T   DA.AD.2 
## 0.9999999 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 
##   DA.AD.3  DA.AD.3T   DA.AD.4   ES.AD.1   ES.AD.2   ES.AD.3   ES.AD.4 
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 
##   FR.AD.1   FR.AD.2   FR.AD.3   FR.AD.4   FR.AD.5   FR.AD.6   FR.AD.7 
## 1.0000000 1.0000000 0.9999999 1.0000000 1.0000000 1.0000000 1.0000000 
##   FR.AD.8   IT.AD.1   IT.AD.2   IT.AD.3   IT.AD.4   IT.AD.5   IT.AD.6 
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000001 0.9999999 
##   JP.AD.1   JP.AD.2   JP.AD.3   JP.AD.4   JP.AD.5   JP.AD.6   JP.AD.7 
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 
##   JP.AD.8   JP.AD.9   JP.IN.1   JP.IN.2   JP.IN.3   JP.IN.4 
## 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
newsampledata=sample_data(ps_data)
newsampledata$Enterotype=addNA(newsampledata$Enterotype)
sample_data(ps_data)=newsampledata
table(sample_data(ps_data)$Enterotype)
## 
##    1    2    3 <NA> 
##    8    6   18    9
levels(sample_data(ps_data)$Enterotype) =c("1","2","3","0")
table(sample_data(ps_data)$Enterotype)
## 
##  1  2  3  0 
##  8  6 18  9

This has 41 samples, 9 of which were mysteriously dropped from the enterotype study, we are going to re-include them with the Enterotype classifier “0”.

data = as(otu_table(ps_data), "matrix")
data = data[-1, ]

Choose a distance (Jensen-Shannon symmetrized KL divergence)

data.dist = dist.JSD(data)

Ordination

PCOA(principal coordinates analysis or multidimen. scaling)

require(ggplot2)
## Loading required package: ggplot2
#Standard Multidimensional Scaling
ent.jsd=ordinate(ps_data,method="MDS",distance=data.dist)

p <- NULL
# Create plot, store as temp variable, p
p <- plot_ordination(ps_data, ent.jsd, color = "Enterotype", shape = "Enterotype")
    # Add title to each plot
    p <- p + ggtitle("MDS using JSD distance  ")
p

Notice the difference with

p + coord_fixed()

NonMetric Multidimensional Scaling

#NonMetric Multidimensional Scaling
ent.jsd.nm=ordinate(ps_data,method="NMDS",distance=data.dist)
## Run 0 stress 0.1495351 
## Run 1 stress 0.1463775 
## ... New best solution
## ... Procrustes: rmse 0.09367362  max resid 0.4433102 
## Run 2 stress 0.1529201 
## Run 3 stress 0.1471192 
## Run 4 stress 0.1470457 
## Run 5 stress 0.1490211 
## Run 6 stress 0.1453083 
## ... New best solution
## ... Procrustes: rmse 0.1145664  max resid 0.3276996 
## Run 7 stress 0.1495344 
## Run 8 stress 0.1453091 
## ... Procrustes: rmse 0.0003193961  max resid 0.001058172 
## ... Similar to previous best
## Run 9 stress 0.1463772 
## Run 10 stress 0.1453093 
## ... Procrustes: rmse 0.0008201763  max resid 0.004492096 
## ... Similar to previous best
## Run 11 stress 0.1612241 
## Run 12 stress 0.1453606 
## ... Procrustes: rmse 0.005661344  max resid 0.03027852 
## Run 13 stress 0.1707988 
## Run 14 stress 0.1486988 
## Run 15 stress 0.1802395 
## Run 16 stress 0.3943069 
## Run 17 stress 0.1471013 
## Run 18 stress 0.1490217 
## Run 19 stress 0.1802605 
## Run 20 stress 0.1486989 
## *** Solution reached
p <- NULL
# Create plot, store as temp variable, p
p <- plot_ordination(ps_data, ent.jsd.nm, color = "Enterotype", 
                     shape = "Enterotype")
# Add title to each plot
p <- p + ggtitle("Non Metric MDS using JSD distance  ")
p

Correspondence Analysis

#Correspondence Analysis
ent.ca = ordinate(ps_data, method = "CCA", distance = NULL)

p <- NULL
# Create plot, store as temp variable, p
p <- plot_ordination(ps_data, ent.ca, color = "Enterotype", shape = "Enterotype")
    # Add title to each plot
    p <- p + ggtitle("Correspondence Analysis (chisquare distance)")
p 

Bray Curtis

#Bray-Curtis
ent.bray = ordinate(ps_data, method = "MDS", distance = "bray")
# Create plot, store as temp variable, p
p <- plot_ordination(ps_data, ent.bray, color = "Enterotype", shape = "Enterotype")
    # Add title to each plot
    p <- p + ggtitle("Bray Curtis Distance, MDS")
p +coord_fixed()

Nonmetric MDS

#Bray-Curtis
ent.bray.nm = ordinate(ps_data, method = "NMDS", distance = "bray")
## Run 0 stress 0.1196547 
## Run 1 stress 0.1110464 
## ... New best solution
## ... Procrustes: rmse 0.03762043  max resid 0.2262629 
## Run 2 stress 0.1453483 
## Run 3 stress 0.1289909 
## Run 4 stress 0.1287229 
## Run 5 stress 0.1312438 
## Run 6 stress 0.1110458 
## ... New best solution
## ... Procrustes: rmse 0.001435505  max resid 0.005885982 
## ... Similar to previous best
## Run 7 stress 0.1110439 
## ... New best solution
## ... Procrustes: rmse 0.001037231  max resid 0.00439878 
## ... Similar to previous best
## Run 8 stress 0.1273938 
## Run 9 stress 0.1198296 
## Run 10 stress 0.1273922 
## Run 11 stress 0.1301727 
## Run 12 stress 0.1318338 
## Run 13 stress 0.1110445 
## ... Procrustes: rmse 0.0008377273  max resid 0.003172803 
## ... Similar to previous best
## Run 14 stress 0.1287216 
## Run 15 stress 0.1110465 
## ... Procrustes: rmse 0.0004036858  max resid 0.001450265 
## ... Similar to previous best
## Run 16 stress 0.1196475 
## Run 17 stress 0.1110771 
## ... Procrustes: rmse 0.001388068  max resid 0.00679678 
## ... Similar to previous best
## Run 18 stress 0.1289932 
## Run 19 stress 0.1318357 
## Run 20 stress 0.124217 
## *** Solution reached
# Create plot, store as temp variable, p
p <- plot_ordination(ps_data, ent.bray.nm, color = "Enterotype", shape = "Enterotype")
    # Add title to each plot
    p <- p + ggtitle("Bray Curtis Distance, MDS")
p

See more explorations of the data at the phyloseq vignette

Remove the outliers and redo the analyses

Take out the samples that were ignored in the paper:

###beware new name for different data set
ps_nata = subset_samples(ps_data, Enterotype !="0")
nata = as(otu_table(ps_nata), "matrix")
nata = nata[-1, ]
nata.dist = dist.JSD(nata)

A complete loop through all distances

dist_methods <- unlist(distanceMethodList)
###Remove distances requiring a tree
dist_methods <- dist_methods[-c(1:3,16)]
# Remove the user-defined distance
dist_methods = dist_methods[-which(dist_methods == "ANY")]
plist <- vector("list", length(dist_methods))
names(plist) = dist_methods
for (i in dist_methods) {
    # Calculate distance matrix
    iDist <- distance(ps_nata, method = i)
    # Calculate ordination
    iMDS <- ordinate(ps_nata, "MDS", distance = iDist)
    ## Make plot Don't carry over previous plot (if error, p will be blank)
    p <- NULL
    # Create plot, store as temp variable, p
    p <- plot_ordination(ps_nata, iMDS, color = "Enterotype", shape = "Enterotype")
    # Add title to each plot
    p <- p + ggtitle(paste("MDS using distance method ", i, sep = ""))
    # Save the graphic to file.
    plist[[i]] = p
}
require(plyr)
df = ldply(plist, function(x) x$data)
names(df)[1] <- "distance"
p = ggplot(df, aes(Axis.1, Axis.2, color = Enterotype, shape = Enterotype))
p = p + geom_point(size = 3, alpha = 0.5)
p = p + facet_wrap(~distance, scales = "free")
p = p + ggtitle("MDS on various distance metrics for Enterotype dataset")
p

With a different clustering variable

(It may be more interesting)

theme_set(theme_bw())
data(enterotype)
enterotype <- subset_taxa(enterotype, Genus != "-1")
df = ldply(plist, function(x) x$data)
names(df)[1] <- "distance"
p = ggplot(df, aes(Axis.1, Axis.2, color = SeqTech, shape = Enterotype))
p = p + geom_point(size = 3, alpha = 0.5)
p = p + facet_wrap(~distance, scales = "free")
p = p + ggtitle("MDS on various distance metrics for Enterotype dataset")
p
## Warning: Removed 378 rows containing missing values (geom_point).

How many clusters?

pam.clustering = function(x, k) {
    # x is a distance matrix and k the number of clusters
    require(cluster)
    cluster = as.vector(pam(as.dist(x), k, diss = TRUE)$clustering)
    return(cluster)
}

The authors fixed the number of clusters to 3, would we?

### The following line is already asking for 3 clusters...
data.cluster = pam.clustering(nata.dist, k = 3)
require(clusterSim)
## Loading required package: clusterSim
## Loading required package: MASS
nclusters = index.G1(t(nata), data.cluster, d = nata.dist, centrotypes = "medoids")
nclusters = NULL
for (k in 1:20) {
    if (k == 1) {
        nclusters[k] = NA
    } else {
        data.cluster_temp = pam.clustering(nata.dist, k)
        nclusters[k] = index.G1(t(nata), data.cluster_temp, d = nata.dist, centrotypes = "medoids")
    }
}
plot(nclusters, type = "h", xlab = "k clusters", ylab = "CH index", 
    main = "Optimal number of clusters (Calinsky and Harabsz 1974)")

or we could use the Gap Statistic:

exord = ordinate(ps_nata, method = "MDS", distance = "jsd")
print(gs, method = "Tibs2001SEmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = x[, axes], FUNcluster = FUNcluster, K.max = K.max,     B = B, verbose = verbose)
## B=50 simulated reference sets, k = 1..6; spaceH0="scaledPCA"
##  --> Number of clusters (method 'Tibs2001SEmax', SE.factor=1): 1
##            logW     E.logW       gap     SE.sim
## [1,] -0.1063849  0.2036021 0.3099870 0.06165868
## [2,] -0.4072377 -0.1468869 0.2603508 0.06204199
## [3,] -0.7708693 -0.4319570 0.3389123 0.06675009
## [4,] -0.9462297 -0.6479749 0.2982548 0.07386420
## [5,] -1.1250085 -0.8304651 0.2945434 0.07717195
## [6,] -1.3168929 -0.9845476 0.3323453 0.07616284
plot_clusgap(gs)

Ordination as done in Nature Paper: drop 9 outliers

require(ade4)
## Loading required package: ade4
data = as(otu_table(ps_data), "matrix")
data = data[-1, ]
clus=sample_data(ps_data)$Enterotype
data123=data[,-which(clus==0)]
data.dist123=dist.JSD(data123)
obs.pcoa=dudi.pco(data.dist123,scannf=F,nf=3)
s.class(obs.pcoa$li, grid=F,fac=clus[-which(clus==0)])

Compare to figure in the paper

EnterotypeFigure

EnterotypeFigure

This was actually made by creating new axes by using a supervised method (between group analyses) to polarize the clustering effect.

Abritrary Choice of Underlying structure?

Is a Gradient better?

Could have tested the ordering gradient and found it significant, why use an underlying categorical variable?

Is the underlying variable a categorical one?

dist123.mst=mstree(data.dist123,1)
s.label(obs.pcoa$li, clab = 0, cpoi = 2, neig = dist123.mst, cnei = 1)

We could clean up the graphical representation:

obs.pcoa2=dudi.pco(data.dist123,scannf=F,nf=2)
pcoa.mst=mstree(dist(obs.pcoa2$li),1)
s.label(obs.pcoa2$li, cpoi = 2, neig = pcoa.mst, cnei = 1,label=paste(newvar),boxes=F)

You could even test the gradient

require(vegan)
## Loading required package: vegan
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-6
newvarm=as.matrix(newvar)
res=adonis(data.dist123~newvarm,perm=99999)
res
## 
## Call:
## adonis(formula = data.dist123 ~ newvarm, permutations = 99999) 
## 
## Permutation: free
## Number of permutations: 99999
## 
## Terms added sequentially (first to last)
## 
##           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## newvarm    1   0.49049 0.49049  10.688 0.26268  1e-05 ***
## Residuals 30   1.37676 0.04589         0.73732           
## Total     31   1.86725                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Beware This is not a valid way of doing analyses, ie look at the data, then test on the same data.

How many ways ??

How many different choices did we make along the way, let’s count:
- Choose the data transformation (here proportions replaced the original counts). … log, rlog, subsample, prop, orig. - Take a subset of the data, some samples declared as outliers.
… leave out 0, 1, 2 ,..,9, + criteria (10)……
- Filter out certain taxa (unknown labels, rare, etc…)
… remove rare taxa (threshold at 0.01%, 1%, 2%,…)
- Choose a distance.
… 40 choices in vegan/phyloseq.
- Choose an ordination method and number of coordinates.
… MDS, NMDS, k=2,3,4,5..
- Choose a clustering method, choose a number of clusters.
… PAM, KNN, density based, hclust …
- Choose an underlying continuous variable (gradient or group of variables: manifold).
- Choose a graphical representation.

One answer:

\[5\times 100 \times 10 \times 40 \times 8 \times 16 \times 2 \times 4 = 204,800,000\]