Other taxonomy assignment tools
BOLDdigger3
BOLDigger3 automates the taxonomic identification of DNA sequences against the Barcode of Life Data System (BOLD v5) databases, using BOLD’s identification service.
BOLDdigger3
1 #!/bin/bash
2
3 # specify the query fasta file
4 fasta=$"ASVs.fasta"
5
6 # run BOLDdigger3
7 boldigger3 identify $fasta --db 2 --mode 3 --thresholds 97 95 90 85
8
9 ## Or run BOLDdigger3 by specifying the python version if you have multiple ones installed
10 # py -3.13 -m boldigger3 identify cox1renamedkoopia.fasta --db 2 --mode 3 --thresholds 97 95
--db 2 specifies the BOLD v5 database of “ANIMAL SPECIES-LEVEL LIBRARY (PUBLIC + PRIVATE)”,--mode 3 specifies the mode of identification if “Exhaustive Search”.--thresholds 97 95 90 85 specify the thresholds for Species-, Genus-, Family-, and Order-level identifications.Check the BOLDdigger3 documentation for more details about the options.
BLAST
Reference database is a fasta file; no special formatting is required.
BLAST
1 #!/bin/bash
2
3 # specify the query fasta file
4 fasta=$"ASVs.fasta"
5
6 # specify reference database for BLAST
7 reference_database="../CO1Classifier_v5.1.0/CO1Classifier_v5.1.0.fasta"
8 reference_database=$(realpath $reference_database) # get full directory path
9
10 ## if the database is just in fasta format, then convert it to BLAST format
11
12 ### Check and assign BLAST database
13 d1=$(echo $reference_database | awk 'BEGIN{FS=OFS="."}{print $NF}')
14 #make blast database if db is not formatted for BLAST
15 db_dir=$(dirname $reference_database)
16 check_db_presence=$(ls -1 $db_dir/*.nhr 2>/dev/null | wc -l)
17 if (( $check_db_presence != 0 )); then
18 if [[ $d1 == "fasta" ]] || [[ $d1 == "fa" ]] || \
19 [[ $d1 == "fas" ]] || [[ $d1 == "fna" ]] || \
20 [[ $d1 == "ffn" ]]; then
21 database=$"-db $reference_database"
22 elif [[ $d1 == "ndb" ]] || [[ $d1 == "nhr" ]] || \
23 [[ $d1 == "nin" ]] || [[ $d1 == "not" ]] || \
24 [[ $d1 == "nsq" ]] || [[ $d1 == "ntf" ]] || \
25 [[ $d1 == "nto" ]]; then
26 reference_database=$(echo $reference_database | awk 'BEGIN{FS=OFS="."}NF{NF-=1};1')
27 database=$"-db $reference_database"
28 fi
29 elif [[ $d1 == "fasta" ]] || [[ $d1 == "fa" ]] || \
30 [[ $d1 == "fas" ]] || [[ $d1 == "fna" ]] || \
31 [[ $d1 == "ffn" ]]; then
32 printf '%s\n' "Note: converting fasta formatted database for BLAST"
33 makeblastdb -in $reference_database -input_type fasta -dbtype nucl
34 database=$"-db $reference_database"
35 fi
36
37 #BLAST
38 printf '%s\n' "# Running BLAST for $(grep -c "^>" $fasta) sequences"
39 blastn -strand plus \
40 -num_threads 20 \
41 -query $fasta \
42 $database \
43 -out 10BestHits.txt -task blastn \
44 -max_target_seqs 10 -evalue=0.001 \
45 -word_size=7 -reward=1 \
46 -penalty=-1 -gapopen=1 -gapextend=2 \
47 -outfmt "6 qseqid stitle qlen slen qstart qend sstart send evalue length nident mismatch gapopen gaps sstrand qcovs pident"
48
49 #qseqid = Query Seq-id
50 #qlen = Query sequence length
51 #sacc = Subject accession
52 #slen = Subject sequence length
53 #qstart = Start of alignment in query
54 #qend = End of alignment in query
55 #sstart = Start of alignment in subject
56 #send = End of alignment in subject
57 #evalue = Expect value
58 #length = Alignment length
59 #pident = Percentage of identical matches
60 #nident = Number of identical matches
61 #mismatch = Number of mismatches
62 #gapopen = Number of gap openings
63 #gaps = Total number of gaps
64 #1st_hit = BLAST 1st hit
65 #sstrand = Subject Strand
66 #qcovs = Query Coverage Per Subject
67
68 ### parse BLAST 1st hit
69 awk 'BEGIN{FS="\t"}''!seen[$1]++' 10BestHits.txt > BLAST_1st_hit.txt
70 #check which seqs got a hit
71 gawk 'BEGIN{FS="\t"}{print $1}' < BLAST_1st_hit.txt | \
72 uniq > gothits.names
73 #add no_hits flag
74 seqkit seq -n $fasta > $fasta.names
75 grep -v -w -F -f gothits.names $fasta.names | \
76 sed -e 's/$/\tNo_significant_similarity_found/' >> BLAST_1st_hit.txt
77 #add header
78 sed -i '1 i\
79 qseqid\t1st_hit\tqlen\tslen\tqstart\tqend\tsstart\tsend\tevalue\tlength\tnident\tmismatch\tgapopen\tgaps\tsstrand\tqcovs\tpident' \
80 BLAST_1st_hit.txt
81
82 #remove unnecessary files
83 rm *.names
RDP-classifier
Taxonomy assignment with the RDP-classifier against CO1Classifier v5.1.0 database.
assign taxonomy with RDP-classifier
1 #!/bin/bash
2
3 # download the CO1Classifier reference databse
4 wget \
5 "https://github.com/terrimporter/CO1Classifier/releases/download/RDP-COI-v5.1.0/RDP_COIv5.1.0.zip"
6 # unzip the database and edit name
7 unzip RDP_COIv5.1.0.zip && mv mydata CO1Classifier_v5.1.0_RDP
8
9 # specify reference database for RDP
10 reference_database="CO1Classifier_v5.1.0_RDP/rRNAClassifier.properties"
11 reference_database=$(realpath $reference_database) # get database names with full path
12
13 # specify input fasta file
14 cd ASV_table
15 ASV_fasta="ASVs_TagJumpFiltered.fasta"
16 ASV_fasta_tmp="ASVs_TagJumpFiltered_minmax.fasta"
17
18 # select by size to only retain ASVs that are within the range of
19 # expected variation. In this case we set it to 400 up to 430 bps
20 vsearch --fastx_filter $ASV_fasta \
21 --fastq_minlen 400 \
22 --fastq_maxlen 430 \
23 --fastaout $ASV_fasta_tmp
24
25 mv $ASV_fasta_tmp $ASV_fasta
26
27 # Run RDP-classifier
28 time rdp_classifier \
29 -Xmx12g \
30 classify \
31 -t $reference_database \
32 -f allrank \
33 -o RDP.taxonomy.txt \
34 -q $ASV_fasta
Get target taxa from RDP-classifier results
This part filters the ASV dataset to include only target taxonomic group for the following analyses.
For example, if you are interested in Metazoa, then discard all ASVs that do not match to the target taxon based on the user defined threshold (default = 0.8).
get only target taxon annotations
1 #!/usr/bin/env Rscript
2 ### Filter dataset based on RDP classifier results to include target taxa
3
4 # specify taxon and threshold
5 taxon="Metazoa" # target taxonomic group(s);
6 # when specifying multiple groups, they should be from the same taxonomic
7 # level. Separator is "," (e.g., "Hymenoptera, Lepidoptera")
8 tax_level="kingdom" # allowed levels: kingdom | phylum | class | order | family | genus
9 threshold="0.8" # threshold for considering an ASV as a target taxon;
10 # default is 0.8
11
12 # species_bootstrap = 0.9 # threshold for species-level identification;
13 # default is 0.9
14
15 # specify the ASV table and ASVs.fasta file that would be filtered to include only target taxa
16 ASV_fasta = "ASVs_TagJumpFiltered.fasta"
17 ASV_table = "ASV_table_TagJumpFiltered.txt"
18
19 # specify the RDP-classifier output file (taxonomy file)
20 taxtab="RDP.taxonomy.txt"
21
22 #--------------------------------------#
23 library(stringr)
24 library(dplyr)
25
26 # read ASV table
27 table = read.table(ASV_table, sep = "\t", check.names = F, header = T, row.names = 1)
28
29 # read taxonomy table
30 tax = read.table(taxtab, sep = "\t", check.names = F, row.names = 1)
31 cat("\n Input =", nrow(tax), "features.\n")
32 # remove not needed columns from tax dataframe
33 tax = tax[, -c(1, 2, 3, 4, 6, 9, 12, 15, 18, 21, 24, 27)]
34 # assign colnames for tax
35 colnames(tax) = c("superkingdom", "superkingdom_BootS",
36 "kingdom", "kingdom_BootS",
37 "phylum","phylum_BootS",
38 "class", "class_BootS",
39 "order", "order_BootS",
40 "family", "family_BootS",
41 "genus", "genus_BootS",
42 "species", "species_BootS")
43
44 # taxon list
45 taxon_list = strsplit(taxon, ", ")[[1]]
46
47 ### extract only target-taxon ASVs from the 'raw' RDP results
48 tax_filtered = tax %>%
49 filter(.data[[tax_level]] %in% taxon_list)
50
51 ### change all tax ranks to "unclassified_*" when
52 # the bootstrap values is less than the specified threshold
53 #kingdom
54 tax_filtered = tax_filtered %>% mutate(kingdom = ifelse(kingdom_BootS <
55 threshold, paste0("unclassified_", superkingdom), as.character(kingdom)))
56 #phylum
57 tax_filtered = tax_filtered %>% mutate(phylum = ifelse(phylum_BootS <
58 threshold, paste0("unclassified_", kingdom), as.character(phylum)))
59 #replace potential "unclassified_unclassified_" with "unclassified_"
60 tax_filtered$class = stringr::str_replace(tax_filtered$class, "unclassified_unclassified_",
61 "unclassified_")
62 #class
63 tax_filtered = tax_filtered %>% mutate(class = ifelse(class_BootS <
64 threshold, paste0("unclassified_", phylum), as.character(class)))
65 #replace potential "unclassified_unclassified_" with "unclassified_"
66 tax_filtered$class = stringr::str_replace(tax_filtered$class, "unclassified_unclassified_",
67 "unclassified_")
68 #order
69 tax_filtered = tax_filtered %>% mutate(order = ifelse(order_BootS <
70 threshold, paste0("unclassified_", class), as.character(order)))
71 #replace potential "unclassified_unclassified_" with "unclassified_"
72 tax_filtered$order = stringr::str_replace(tax_filtered$order, "unclassified_unclassified_",
73 "unclassified_")
74 #family
75 tax_filtered = tax_filtered %>% mutate(family = ifelse(family_BootS <
76 threshold, paste0("unclassified_", order), as.character(family)))
77 #replace potential "unclassified_unclassified_" with "unclassified_"
78 tax_filtered$family = stringr::str_replace(tax_filtered$family, "unclassified_unclassified_",
79 "unclassified_")
80 #genus
81 tax_filtered = tax_filtered %>% mutate(genus = ifelse(genus_BootS <
82 threshold, paste0("unclassified_", family), as.character(genus)))
83 #replace potential "unclassified_unclassified_" with "unclassified_"
84 tax_filtered$genus = stringr::str_replace(tax_filtered$genus, "unclassified_unclassified_",
85 "unclassified_")
86
87 # species to genus_sp when the bootstrap values is < "species_bootstrap" threshold
88 tax_filtered = tax_filtered %>% mutate(species = ifelse(species_BootS < species_bootstrap,
89 paste0(genus, "_sp"), species))
90
91 ### count occurrences of each taxon in df (RDP results)
92 count_taxa = function(df, taxa) {
93 sapply(taxa, function(taxon) sum(apply(df, 1, function(row) any(row == taxon))))
94 }
95 taxon_counts = count_taxa(tax_filtered, taxon_list)
96
97 # Check the counts
98 if (all(taxon_counts == 0)) {
99 print("ERROR: None of the specified taxa are present in the RDP results.")
100 } else {
101 if (any(taxon_counts == 0)) {
102 warning("One or more of the specified taxa are not present in the RDP results.")
103 }
104 print(taxon_counts)
105 }
106
107 ### extract only target-taxon ASVs from the 'threshold filtered' RDP results
108 tax_filtered_thresh = tax_filtered %>%
109 filter(.data[[tax_level]] %in% taxon_list)
110 # write filtered RDP taxonomy table
111 tax_filtered_thresh = cbind(ASV = rownames(tax_filtered_thresh), tax_filtered_thresh)
112 write.table(tax_filtered_thresh,
113 file = "RDP.taxonomy.filt.txt",
114 quote = F,
115 row.names = F,
116 sep = "\t")
117
118 ### filter the ASV table to match ASVs that were kept in the tax_filtered table
119 table_filt = table[rownames(table) %in% rownames(tax_filtered_thresh), ]
120
121 ### check ASV table; if 1st col is sequence, then remove it for metaMATE
122 if (colnames(table_filt)[1] == "Sequence") {
123 cat(";; 2nd column was 'Sequence', removing this ... \n")
124 table_filt = table_filt[, -1]
125 }
126
127 # write filtered table
128 table_filt = cbind(ASV = rownames(table_filt), table_filt)
129 write.table(table_filt,
130 file = paste0(sub("\\.[^.]*$", "_tax_filt.txt", ASV_table)),
131 quote = F,
132 row.names = F,
133 sep = "\t")
134
135 # filter ASV_fasta
136 library(Biostrings)
137 fasta = readDNAStringSet(ASV_fasta)
138 fasta.tax_filt = fasta[names(fasta) %in% rownames(table_filt)]
139 # write filtered ASV_fasta
140 writeXStringSet(fasta.tax_filt,
141 paste0(sub("\\.[^.]*$", "_tax_filt.fasta", ASV_fasta)),
142 width = max(width(fasta.tax_filt)))



