Essential Biodiversity Variables of genetic composition
Workflow to calculate Essential Biodiversity Variables (EBV) of genetic composition.
Starting point
Required input data:
ASVs table
ASV taxonomy table
ASVs fasta file
OTU taxonomy table
Sample metadata
example of an ASV table
sample1 |
sample2 |
sample3 |
sample4 |
… |
|
ASV_01 |
579 |
0 |
0 |
0 |
… |
ASV_02 |
405 |
345 |
449 |
430 |
… |
ASV_03 |
0 |
0 |
231 |
69 |
… |
ASV_04 |
0 |
62 |
345 |
0 |
… |
example of an ASV taxonomy table
… |
Class |
Order |
Family |
Genus |
|
ASV_01 |
… |
Collembola |
Entomobryomorpha |
Entomobryidae |
Entomobrya |
ASV_02 |
… |
Collembola |
Entomobryomorpha |
Family_0032 |
Genus_001 |
ASV_03 |
… |
Collembola |
Entomobryomorpha |
Isotomidae |
Parisotoma |
ASV_04 |
… |
Collembola |
Entomobryomorpha |
Family_0032 |
Genus_022 |
example of a fasta file
>ASV_01
ACTTTATATTTTATTTTTGGAATTTGAGCAGGAATAGTAGGAACTTCTCTTAGTTTATTAATTCG...
>ASV_02
ATAGTAGGAACATCTCTTAGTTTATTAATTCGAACTGAACTAGGAAATCCAGGTTCACTTATTGG...
>ASV_03
TCTTTACCTTTTATTCGGTGCCTGAGCTGGCATGGTGGGGACTGCTCTTAGTCTTCTAATCCGGG...
>ASV_04
TACTTTGTATTTTGTTTTTGGGGTGTGATCTGGTATGTTGGGGACTAGGTTCAGAAGACTAATTC...
...
example of an OTU taxonomy table
… |
Class |
Order |
Family |
Genus |
|
OTU_01 |
… |
Collembola |
Entomobryomorpha |
Entomobryidae |
Entomobrya |
OTU_02 |
… |
Collembola |
Entomobryomorpha |
Family_0032 |
Genus_001 |
OTU_03 |
… |
Collembola |
Entomobryomorpha |
Isotomidae |
Parisotoma |
OTU_04 |
… |
Collembola |
Entomobryomorpha |
Family_0032 |
Genus_022 |
Calculate EBVs of genetic composition: genetic richness, nucleotide diversity and genetic differentiation
1#!/usr/bin/Rscript
2
3## Script to calculate Essential Biodiversity Variables (EBV) of genetic composition
4## - haplotype richness
5## - nucleotide diversity
6## - genetic differentiation between populations
7
8##-----------------------Settings-----------------------##
9## Working directory and results directory paths
10wd = "E:/BGE/EBV_genetic composition"
11results = file.path(wd, "results")
12
13## Input ASV table
14ASV.table = file.path(wd, "data/ASV_table_Arthropoda.csv")
15
16## ASV taxonomy table. Information of ASV clustering into OTUs (by unique taxonomy, output of OptimOTU)
17ASV.tax = file.path(wd, "data/asv_taxonomy.tsv")
18
19## OTU taxonomy table (all OTUs, the filtered and discarded sets, output of OptimOTU)
20all.OTU = file.path(wd, "data/otu_taxonomy.tsv")
21
22## Sample metadata
23sampleMetadata = file.path(wd, "data/Sample_metadata.csv") #latest shared sampling metadata with "site_name", where we can see "A" or "1" for the main trap and "B" or "2" for the back up trap.
24
25## ASV fasta file (ASV sequences)
26fasta_asv = file.path(wd, "data/ASVs_Arthropoda.fasta")
27
28## Taxonomic filtering: specify the taxonomic level and target group
29## Taxonomic level options: "phylum", "class", "order", "family", "genus", "species"
30target_taxonomic_level = "phylum"
31## Target value for the specified taxonomic level
32target_taxonomic_group = "Arthropoda"
33##------------------------------------------------------##
34
35
36
37library(data.table)
38library(stringr)
39library(iNEXT)
40library(ggplot2)
41library(pegas)
42library(ape)
43library(adegenet)
44library(mmod) #Jost's D & Gst
45
46
47
48##------------------------##
49## 1. Preparing the input ##
50##------------------------##
51
52## Load ASV and OTU data
53##----------------------
54
55## Load input ASV table:
56## This table was obtained after a strict filtering with metaMATE
57# (to obtain reliable haplotype information)
58ASV_table = fread(file = ASV.table, header = TRUE, sep = ";")
59
60## Load ASV taxonomy table (ASVs with taxonomic identification)
61ASVtax = fread(file = ASV.tax, header = TRUE)
62
63## Load the OTU taxonomy table
64## This list includes all OTUs detected,
65# including those discarded by the strict metaMATE filtering
66OTU_all = fread(file = all.OTU, header = TRUE, sep = "\t")
67
68## ASVs presence checks between ASVtax and ASV_table
69##--------------------------------------------------
70## Check if all ASVs from ASV_table are present in ASVtax
71missing_in_ASVtax <- setdiff(ASV_table$ASV, ASVtax$seq_id)
72if(length(missing_in_ASVtax) > 0){
73 warning(paste("The following", length(missing_in_ASVtax),
74 "ASVs from ASV_table are not present in ASVtax:",
75 paste(head(missing_in_ASVtax, 10), collapse = ", "),
76 ifelse(length(missing_in_ASVtax) > 10, "...", "")))
77} else {
78 message("All ASVs from ASV_table are present in ASVtax")
79}
80
81## Check if all ASVs from ASVtax are present in ASV_table
82missing_in_ASV_table <- setdiff(ASVtax$seq_id, ASV_table$ASV)
83if(length(missing_in_ASV_table) > 0){
84 warning(paste("The following", length(missing_in_ASV_table),
85 "ASVs from ASVtax are not present in ASV_table:",
86 paste(head(missing_in_ASV_table, 10), collapse = ", "),
87 ifelse(length(missing_in_ASV_table) > 10, "...", "")))
88} else {
89 message("All ASVs from ASVtax are present in ASV_table")
90}
91
92## Find the intersection of ASVs (ASVs present in both matrices)
93common_ASVs <- intersect(ASV_table$ASV, ASVtax$seq_id)
94message(paste("Number of common ASVs in both matrices:",
95 length(common_ASVs)))
96
97## Filter both matrices to contain only matching ASVs
98ASVtax <- ASVtax[seq_id %in% common_ASVs]
99ASV_table <- ASV_table[ASV %in% common_ASVs]
100
101## Add OTU id to ASVtax
102##---------------------
103ASV.OTUtax <- merge(x=ASVtax, y=OTU_all[,.(OTU, species)], all.x = TRUE)
104
105## Add OTU id to the ASV_table
106names(ASV.OTUtax)[which(names(ASV.OTUtax)=="seq_id")] <- "ASV"
107ASV_table = merge(x = ASV_table, y = ASV.OTUtax[, .(ASV, OTU)],
108 by = "ASV", all.x = TRUE)
109
110## Keeping only the specified taxonomic group (OTU and ASV)
111##---------------------------------------------------------
112# Check if the specified taxonomic level exists in the data
113if(!target_taxonomic_level %in% names(ASV.OTUtax)){
114 stop(paste("Error: Taxonomic level '", target_taxonomic_level,
115 "' not found in ASV.OTUtax. Available levels:",
116 paste(names(ASV.OTUtax), collapse = ", ")))
117}
118
119### Filter by the specified taxonomic level and group
120targetTax <- ASV.OTUtax[get(target_taxonomic_level) ==
121 target_taxonomic_group]
122if(nrow(targetTax) == 0){
123 warning(paste("No ASVs found with", target_taxonomic_level, "=",
124 target_taxonomic_group))
125} else {
126 message(paste("Filtered to", nrow(targetTax), "ASVs with",
127 target_taxonomic_level, "=", target_taxonomic_group))
128}
129
130ASV_table_targetTax <- ASV_table[ASV %in% targetTax$ASV]
131
132## Convert ASV table to long format
133ASV = melt(data = ASV_table_targetTax,
134 id.vars = colnames(ASV_table_targetTax)
135 [c(1, ncol(ASV_table_targetTax))],
136 variable.name = "sample", value.name = "Abundance")
137
138# Remove ASV-sample combinations with zero abundance
139ASV = ASV[Abundance > 0]
140gc()
141
142
143## Sample metadata - loading file
144##-------------------------------
145sample_metadata = fread(file = sampleMetadata, header = TRUE, sep = ",")
146
147## Adding "site & country" to ASV dataset
148## Here, countries are used as 'populations' to compute Jost's D and Gst
149# (for pairwise genetic distance comparisons between countries)
150# (Keeping the site column is optional)
151ASV_site <- merge(x=ASV, y=sample_metadata[, .(site, country, sample)],
152 by="sample", all.x = TRUE)
OTU selection
We use iNEXT R package to assess sample completeness (coverage) and expected diversity per OTU. All OTUs with a coverage ≥ 0.95 and an expected/observed diversity ratio ≥ 0.8 were kept for calculating EBV of genetic composition.
1#!/usr/bin/Rscript
2
3## Selecting OTUs (species):
4##--------------------------
5## Here, we work only with OTU that were well sampled.
6# To identify them, we use iNEXT
7
8## 1) Creating the input for iNEXT, it can be "incidence_raw",
9## which is a presence-absence matrix,
10## or it can be "incidence_freq" which is a vector of frequencies,
11## where the first position should be the number of sites
12## where the OTU was found.
13
14## "incidence_raw"
15## Here we also calculate the nucleotide diversity, using pegas::nuc.div()
16## it calculates the average proportion of nucleotide differences
17## across all pairwise sequence comparisons.
18## (sum(proportion of sites that differ between two sequences =
19## no.differences/total length))/no.comparisons(=no.seqs*(no.seqs-1)/2)
20asv.seqs.all <- ape::read.dna(fasta_asv, format = "fasta")
21
22OTU_hap <- list()
23nucleotide.div <- list()
24
25## OTUs in our sample:
26OTU2keep <- ASV_table_targetTax[, unique(OTU)]
27
28# Build a presence/absence matrix for each OTU and compute nucleotide diversity
29# (this may take a minute or two)
30for(i in OTU2keep){
31 asv_otu <- ASV_site[OTU==i, .(sample, ASV)]
32 asv_otu[, presence := 1] # new variable fill of 1 that indicates presence
33 ## To wide format: rows = ASV, column = samples, 0=absence, 1=presence
34 asv_table <- dcast(asv_otu, ASV~sample, value.var = "presence")
35 asv_table[is.na(asv_table)] <- 0
36 namesASV <- asv_table[, ASV]
37 asv_table[, ASV := NULL]
38 asv_matrix <- as.matrix(asv_table)
39 rownames(asv_matrix) <- namesASV
40 OTU_hap[[i]] <- asv_matrix
41
42 ## Nucleotide richness
43 asv.seqs.subset <- asv.seqs.all[rownames(asv.seqs.all) %in% namesASV,]
44 nucleotide.div[[i]] <- pegas::nuc.div(asv.seqs.subset)
45}
46
47## "incidence_freq"
48hap_freq <- list()
49
50# Build a incidence frequency format for iNEXT rarefaction analysis
51# (this may take a minute or two)
52for(i in OTU2keep){
53 asv_otu <- ASV_site[OTU==i, .(sample, ASV)]
54 asv_otu[, presence := 1] # new variable fill of 1 that indicates presence
55 ## To wide format: rows = ASV, column = samples, 0=absence, 1=presence
56 asv_table <- dcast(asv_otu, ASV~sample, value.var = "presence")
57 asv_table[is.na(asv_table)] <- 0
58 asv_table[, ASV := NULL]
59 hap_freq[[i]] <- c(ncol(asv_table), apply(asv_table, 1, sum))
60}
61
62## 2) Rarefaction using iNEXT
63## --------------------------
64#out.raw <- iNEXT(hap_freq, q=0, datatype = "incidence_freq") #q=0 -> haplotype richness
65## We get warnings and errors when trying to run all species at the same time.
66## Instead, we do it one by one in a loop:
67
68results_list2 <- list()
69## Set a progress bar to check the progress of the most demanding part of the loop
70pb <- txtProgressBar(min = 1, max = length(OTU2keep), style = 3)
71count=0
72
73for(sp in OTU2keep){
74 count=count+1
75 tryCatch({ #we added this because for some OTU is not possible to run iNEXT() and this way the loop does not stop executing
76 results_list2[[sp]] <- iNEXT(hap_freq[sp], q=0, datatype = "incidence_freq")
77 }, error = function(e){
78 warning(paste("Problem with", sp))
79 })
80
81
82 setTxtProgressBar(pb, count) #progress bar
83}
84close(pb)
85
86results_list <- results_list2
87
88totry <- setdiff(OTU2keep, names(results_list)) #5770 OTUs did not run with iNEXT()
89
90## However, some OTU that did not run with iNEXT() do run in a second trial (I don't know why)
91##https://github.com/JohnsonHsieh/iNEXT/issues/81 (not solved)
92##https://github.com/JohnsonHsieh/iNEXT/issues/75
93for(sp in totry){
94 tryCatch({ #I added this because for some OTU is not possible to run iNEXT() and this way the loop does not stop executing
95 results_list2[[sp]] <- iNEXT(hap_freq[sp], q=0, datatype = "incidence_freq")
96 }, error = function(e){
97 warning(paste("Problem with", sp))
98 })
99}
100while(length(results_list2) > length(results_list)){
101 print(paste("totry length", length(totry)))
102 results_list <- results_list2
103 for(sp in totry){
104 tryCatch({ #I added this because for some OTU is not possible to run iNEXT() and this way the loop does not stop executing
105 results_list2[[sp]] <- iNEXT(hap_freq[sp], q=0, datatype = "incidence_freq")
106 }, error = function(e){
107 warning(paste("Problem with", sp))
108 })
109 }
110 totry <- setdiff(OTU2keep, names(results_list2))
111}
112
113## Checking coverage (=completedness) and comparing observed and estimated number of
114##haplotypes per OTU
115results.data <- as.data.frame(matrix(NA, nrow=length(results_list), ncol = 2))
116names(results.data) <- c("coverage", "obs_est") #storing SC (sample coverage) and observed/estimated data
117rownames(results.data) <- names(results_list)
118
119for(i in 1:nrow(results.data)){
120 otu = rownames(results.data)[i]
121 results.data$coverage[i] <- results_list[[otu]]$DataInfo$SC
122 results.data$obs_est[i] <- results_list[[otu]]$AsyEst$Observed[1]/results_list[[otu]]$AsyEst$Estimator[1] #Observed and estimated value of Species Richness
123}
124length(which(results.data$coverage == 1))/nrow(results.data)
125length(which(results.data$coverage > 0.97))/nrow(results.data)
126length(which(results.data$coverage > 0.95))/nrow(results.data)
127length(which(results.data$coverage > 0.90))/nrow(results.data)
128coverage_plot <- ggplot(results.data, aes(x=coverage)) +
129 geom_histogram(binwidth = 0.01) +
130 theme_minimal() +
131 labs(title = "OTU frequency based on haplotype sampling completedness (coverage)", x = "coverage", y = "no. OTUs")
132coverage_plot
133
134length(which(results.data$obs_est == 1))/nrow(results.data)
135length(which(results.data$obs_est > 0.95))/nrow(results.data)
136length(which(results.data$obs_est > 0.90))/nrow(results.data)
137length(which(results.data$obs_est > 0.85))/nrow(results.data)
138length(which(results.data$obs_est > 0.80))/nrow(results.data)
139obsest_plot <- ggplot(results.data, aes(x=obs_est)) +
140 geom_histogram(binwidth = 0.02) +
141 theme_minimal() +
142 labs(title = "OTU frequency based on no.observed haplotypes / estimated no.haplotypes", x = "obs/est", y = "no. OTUs")
143obsest_plot
144
145## Selecting a threshold
146## Here we select all OTUs with obs/est>=0.8 & coverage>=0.95
147OTUselected1 <- rownames(results.data)[intersect(which(results.data$coverage>=0.95), which(results.data$obs_est>=0.8))]
Haplotype richness and nucleotide diversity
1#!/usr/bin/Rscript
2
3##-----------------------------------------------------------------##
4## 2. Calculating EBV: haplotype richness and nucleotide diversity ##
5##-----------------------------------------------------------------##
6
7## Adding to the OTU taxonomy table haplotype richness, nucleotide diversity per OTU,
8##number of samples where the OTU was found and number of countries where each OTU was found:
9ebv_taxonomy <- OTU_all[OTU %in% OTUselected1][, `:=`(n.samples.final = 0, n.countries.final = 0, nucleotide.diversity = 0, haplotype.richness = 0)]
10
11for(otu in OTUselected1){
12 samples.found.otu <- ASV[OTU==otu, sample]
13 no.countries.found.otu <- ASV_site[sample %in% samples.found.otu, uniqueN(country)]
14 ebv_taxonomy[OTU==otu, `:=`(haplotype.richness = length(hap_freq[[otu]])-1,
15 nucleotide.diversity = nucleotide.div[[otu]],
16 n.samples.final = length(unique(samples.found.otu)),
17 n.countries.final = no.countries.found.otu)]
18}
Genetic differentiation
Using the BGE case study of high mountain systems as an example, we calculate the genetic differentiation between countries.
In each country, we sampled five elevation points over a 20-week period, resulting in approximately 100 samples per country. For calculating Jost’s D and Gst indices, each detection of a haplotype in a sample is treated as a single individual. For instance, if haplotype A is detected in samples 1 and 5 from Germany and in sample 29 from Spain, this corresponds to two individuals of haplotype A in Germany and one in Spain.
1#!/usr/bin/Rscript
2
3##---------------------------------------------##
4## 3. Calculating EBV: genetic differentiation ##
5##---------------------------------------------##
6
7## Here we calculate the genetic differentiation between countries
8## 7 countries, one mountain gradient sampled per country
9## The main factor grouping samples is "country".
10## We calculate the overall genetic differentiation between countries per OTU
11## and an average pairwise dissimilarity matrix across all OTUs using Jost's D.
12
13## We work with all well sampled OTUs, removing those not shared between countries
14OTU2work <- ebv_taxonomy[n.countries.final>1, OTU]
15
16## Countries are considered as populations
17pops <- ASV_site[OTU %in% OTU2work, unique(country)]
18
19## Loop per OTU to calculate overall and pairwise genetic distance:
20D.matricesALL_list <- list()
21Gst.matricesALL_list <- list()
22## New column in ebv_taxonomy to include the overall genetic differentiation
23ebv_taxonomy[, `:=`(JostD = 0, Gst =0)]
24## Set a progress bar to check the progress of the most demanding part of the loop
25pb <- txtProgressBar(min = 1, max = length(OTU2work), style = 3)
26count=0
27for(otu in OTU2work){
28 count=count+1
29 ## (1) getting the ASVs of the OTU & adding individual ID
30 asv_otu <- ASV_site[OTU==otu]
31 asv_otu <- asv_otu[, indivID := paste0("indiv_", seq(1:nrow(asv_otu)))]
32 ## (2) filtering the fasta file
33 asv.seqs.subset <- asv.seqs.all[rownames(asv.seqs.all) %in% asv_otu$ASV,]
34 otu.seqs.list <- as.list(asv.seqs.subset)
35 ## (3) creating a sequence file following asv_otu (each indiv - one sequence --> duplicated asvs)
36 indiv.seqs.list <- lapply(asv_otu$ASV, function(n) otu.seqs.list[[n]])
37 names(indiv.seqs.list) <- paste0(asv_otu$indivID, "_", asv_otu$ASV)
38 indiv.seqs <- do.call(rbind, indiv.seqs.list)
39 class(indiv.seqs) <- "DNAbin"
40 ## (3) alignment
41 indiv.alignment <- muscle(indiv.seqs) #if not found: exec = "/usr/bin" (although this should work as it is in $PATH)
42 ## (3) convert the alignment (DNAbin) in genind object (adegenet package)
43 indiv.alignment.gd <- DNAbin2genind(indiv.alignment)
44 ## (4) adding population & country information to the genind object
45 indiv.alignment.gd@pop <- as.factor(asv_otu$country)
46 ##checking that individual order and population/country order is the same as in asv_otu
47 for(check in 1:nrow(asv_otu)){
48 stopifnot(str_detect(rownames(indiv.alignment.gd@tab)[check], asv_otu$indivID[check]))
49 stopifnot(indiv.alignment.gd@pop[check] == asv_otu$country[check])
50 }
51 ## (5) Genetic differentiation
52 ## (5.1) Overall Jost's D & Gst
53 overall.D <- D_Jost(indiv.alignment.gd)
54 overall.Gst <- Gst_Hedrick(indiv.alignment.gd)
55 ebv_taxonomy[OTU==otu, `:=`(JostD = overall.D$global.het, Gst = overall.Gst$global)]
56 ## (5.2) Pairwise Jost's D & Gst
57 if(length(unique(asv_otu$country))==length(pops)){ #total number of populations/sites/elevation sampling points
58 D.dist <- pairwise_D(indiv.alignment.gd)
59 D.matrix <- as.matrix(D.dist)
60 D.matricesALL_list[[otu]] <- D.matrix[pops, pops] #all matrices must have the same column and row order
61 Gst.dist <- pairwise_Gst_Hedrick(indiv.alignment.gd)
62 Gst.matrix <- as.matrix(Gst.dist)
63 Gst.matricesALL_list[[otu]] <- Gst.matrix[pops, pops]
64 }else{ #we need to build the whole popxpop matrix to later average all of them
65 dist.obj.D <- pairwise_D(indiv.alignment.gd)
66 dist.m.D <- as.matrix(dist.obj.D)
67 dist.obj.Gst <- pairwise_Gst_Hedrick(indiv.alignment.gd)
68 dist.m.Gst <- as.matrix(dist.obj.Gst)
69 pop.matrix.D <- matrix(NA, nrow=length(pops), ncol=length(pops))
70 colnames(pop.matrix.D) = rownames(pop.matrix.D) = pops
71 pop.matrix.Gst <- matrix(NA, nrow=length(pops), ncol=length(pops))
72 colnames(pop.matrix.Gst) = rownames(pop.matrix.Gst) = pops
73 for(r in rownames(dist.m.D)){
74 for(c in colnames(dist.m.D)){
75 pop.matrix.D[which(rownames(pop.matrix.D)==r), which(colnames(pop.matrix.D)==c)] = dist.m.D[r,c]
76 pop.matrix.Gst[which(rownames(pop.matrix.Gst)==r), which(colnames(pop.matrix.Gst)==c)] = dist.m.Gst[r,c]
77 }
78 }
79 D.matricesALL_list[[otu]] <- pop.matrix.D
80 Gst.matricesALL_list[[otu]] <- pop.matrix.Gst
81 }
82 setTxtProgressBar(pb, count) #progress bar
83}
84close(pb)
85
86## Just in case, apply the same order in all matrices on the list (this would not be necessary as it is done inside the loop)
87D.matricesALL_list <- lapply(D.matricesALL_list, function(x) x[pops, pops])
88Gst.matricesALL_list <- lapply(Gst.matricesALL_list, function(x) x[pops, pops])
89
90## Saving the two lists of pairwise distance matrices
91saveRDS(D.matricesALL_list, file.path(results, "OTUgeneticDisMatrices_JostD_ALLcountries.rds"))
92saveRDS(Gst.matricesALL_list, file.path(results, "OTUgeneticDisMatrices_Gst_ALLcountries.rds"))
93## Calculate the mean across all matrices ignoring NAs
94D.mean_distanceALL <- Reduce("+", lapply(D.matricesALL_list, function(x){x[is.na(x)]<-0; x})) / Reduce("+", lapply(D.matricesALL_list, function(x) !is.na(x)))
95Gst.mean_distanceALL <- Reduce("+", lapply(Gst.matricesALL_list, function(x){x[is.na(x)]<-0; x})) / Reduce("+", lapply(Gst.matricesALL_list, function(x) !is.na(x)))
96## Saving the two mean pairwise distance matrices between countries
97write.table(D.mean_distanceALL, file.path(results,"MeanPairwiseD_allCountries.txt"))
98write.table(Gst.mean_distanceALL, file.path(results,"MeanPairwiseGst_allCountries.txt"))
99
100
101## Saving the final OTU table with the EBV of genetic composition:
102fwrite(ebv_taxonomy, file.path(results, "FinalOTUtable_geneticEBV.csv"))



