library(PhyInsight)
The `rmBadStrings` functions (`rmBadStrings_1()`, `rmBadStrings_2()`, and
`rmBadStrings_3()`) can be used to automatically clean a DNA string set that
would otherwise be unsuitable for analysis. These functions use slightly
different methods to automatically remove strings that are mismatched with
others and those whose distances return `NaN` values or are considered
outliers.
# Build the manipulated DNA string set that the examples below start from.

# query the data using the taxon name
specdata <- querySpecData("Nepenthes")

# subset results that only have nucleotides from the matK region
specdata <- subset(specdata, markercode == "matK")

# get one observation per species
specdata <- getSpeciesRepr(specdata)

# generate a DNA bin
DNABin <- genDNABin(specdata)

# use the DNA bin to create a DNA string set
DNAStringset <- genDNAStringSet(DNABin)

# automatically manipulate the DNA strings
DNAStringSet_manip <- ManipStringSet(DNAStringset)
At this point, attempting to create a phylo tree will result in an error.
Using `DECIPHER::BrowseSeqs(DNAStringSet_manip)` to view the string set will
reveal a series of mismatched and fragmented strings. Using
`rmBadStrings_1()` will automatically remove all mismatched strings, after
which a tree can be created and plotted.
# use the function to remove unsuitable strings and store into an object
BadStringsRemoved <- rmBadStrings_1(
  DNAStringSet = DNAStringSet_manip,
  specimen_dataframe = specdata
)

# the result is indexed positionally: element 1 is used as the cleaned
# string set and element 2 as the specimen data that goes with it
DNAStringSet_new <- BadStringsRemoved[[1]]
specdata_new <- BadStringsRemoved[[2]]

# automatically generate a phylo tree
PhyloTree <- genPhytree(DNAStringSet_new)

# change the label names
PhyloTree$tip.label <- specdata_new$species_name

# plot the phylo tree
plot(
  PhyloTree,
  label.offset = 0.0001,
  cex = 1
)
The `rmBadStrings` functions also have optional arguments to remove strings
whose DNA distances are considered outliers. `rmOutliers` is a logical
argument that, when set to `TRUE`, will automatically remove outliers in
addition to performing its regular processes. `max_Z_score` is a numerical
value that allows the user to change the maximum Z score for each string’s
DNA distance. The default value for this argument is 3, as a score higher
than this is generally considered an outlier.
# use the function to remove unsuitable strings and remove outliers
BadStringsRemoved <- rmBadStrings_1(
  DNAStringSet = DNAStringSet_manip,
  specimen_dataframe = specdata,
  rmOutliers = TRUE
)
#> [1] "Outlier strings detected and removed: 58"

# the result is indexed positionally: element 1 is used as the cleaned
# string set and element 2 as the specimen data that goes with it
DNAStringSet_new <- BadStringsRemoved[[1]]
specdata_new <- BadStringsRemoved[[2]]

# automatically generate a phylo tree
PhyloTree <- genPhytree(DNAStringSet_new)

# change the label names
PhyloTree$tip.label <- specdata_new$species_name

# plot the phylo tree
plot(
  PhyloTree,
  label.offset = 0.0001,
  cex = 1
)
# use the function to remove unsuitable strings and remove outliers,
# this time lowering the maximum allowed Z score to 2
BadStringsRemoved <- rmBadStrings_1(
  DNAStringSet = DNAStringSet_manip,
  specimen_dataframe = specdata,
  rmOutliers = TRUE,
  max_Z_score = 2
)
# NOTE(review): the captured output below is reproduced verbatim; its
# original line layout may have been altered -- confirm by re-running
#> [1] "Outlier strings detected and removed: 58"
#> [1] "Outlier strings detected and removed: 8"
#> [2] "Outlier strings detected and removed: 17"
#> [3] "Outlier strings detected and removed: 20"
#> [4] "Outlier strings detected and removed: 27"
#> [5] "Outlier strings detected and removed: 28"
#> [6] "Outlier strings detected and removed: 48"
#> [7] "Outlier strings detected and removed: 49"
#> [8] "Outlier strings detected and removed: 54"
#> [9] "Outlier strings detected and removed: 70"
#> [1] "Outlier strings detected and removed: 2"
#> [2] "Outlier strings detected and removed: 20"
#> [3] "Outlier strings detected and removed: 22"
#> [4] "Outlier strings detected and removed: 30"
#> [5] "Outlier strings detected and removed: 32"
#> [6] "Outlier strings detected and removed: 35"
#> [7] "Outlier strings detected and removed: 37"
#> [8] "Outlier strings detected and removed: 42"
#> [9] "Outlier strings detected and removed: 49"
#> [10] "Outlier strings detected and removed: 64"
#> [11] "Outlier strings detected and removed: 66"
#> [12] "Outlier strings detected and removed: 68"
#> [1] "Outlier strings detected and removed: 3"
#> [2] "Outlier strings detected and removed: 7"
#> [3] "Outlier strings detected and removed: 11"
#> [4] "Outlier strings detected and removed: 13"
#> [5] "Outlier strings detected and removed: 27"
#> [6] "Outlier strings detected and removed: 33"
#> [7] "Outlier strings detected and removed: 34"
#> [8] "Outlier strings detected and removed: 36"
#> [9] "Outlier strings detected and removed: 37"
#> [10] "Outlier strings detected and removed: 57"
#> [1] "Outlier strings detected and removed: 8"
#> [2] "Outlier strings detected and removed: 12"
#> [3] "Outlier strings detected and removed: 19"
#> [4] "Outlier strings detected and removed: 42"

# the result is indexed positionally: element 1 is used as the cleaned
# string set and element 2 as the specimen data that goes with it
DNAStringSet_new <- BadStringsRemoved[[1]]
specdata_new <- BadStringsRemoved[[2]]

# automatically generate a phylo tree
PhyloTree <- genPhytree(DNAStringSet_new)

# change the label names
PhyloTree$tip.label <- specdata_new$species_name

# plot the phylo tree
plot(
  PhyloTree,
  label.offset = 0.0001,
  cex = 1
)