# The JVM heap size must be requested before rJava (loaded by SELEX) starts the
# JVM; java.parameters set after library(SELEX) is not picked up.
options(java.parameters = "-Xmx4000M")
library(SELEX)
## Loading required package: rJava
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:rJava':
##
## anyDuplicated, duplicated, sort, unique
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, append,
## as.data.frame, basename, cbind, colMeans, colSums, colnames,
## dirname, do.call, duplicated, eval, evalq, get, grep, grepl,
## intersect, is.unsorted, lapply, lengths, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, rank, rbind,
## rowMeans, rowSums, rownames, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 3.5.1
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
library(stringi)
library(Biostrings)
library(SelexGLM)
## Loading required package: RColorBrewer
library(devtools)
library(reshape2)
library(ggplot2)
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following object is masked from 'package:XVector':
##
## compact
## The following object is masked from 'package:IRanges':
##
## desc
## The following object is masked from 'package:S4Vectors':
##
## rename
We start with some initialization related to the SELEX
package:
# Session setup for the SELEX package: request a 4000 MB Java heap, then point
# SELEX at a temporary working directory and allow it up to four threads.
# NOTE(review): java.parameters is normally read when the JVM starts -- confirm
# that setting it here, after library(SELEX) has been attached, has any effect.
options(java.parameters = "-Xmx4000M")
workDir <- tempdir()
selex.config(workingDir = workDir, maxThreadNumber = 4)
Next, we will define the SELEX samples that we want to analyze. We will use the example data from the SELEX
package:
# Register the example experiment annotation shipped with the SELEX package and
# list the samples it defines. mustWork = TRUE makes a missing config.xml fail
# right here with a clear error instead of handing "" to selex.loadAnnotation().
selex.loadAnnotation(
  system.file("extdata", "config.xml", package = "SELEX", mustWork = TRUE)
)
selex.sampleSummary()
## seqName sampleName rounds leftBarcode rightBarcode
## 2 R0.libraries R0.barcodeCG 0 TGG CCACGTC
## 3 R0.libraries R0.barcodeGC 0 TGG CCAGCTG
## 1 R2.libraries ExdHox.R2 2 TGG CCAGCTG
## leftFlank rightFlank
## 2 GTTCAGAGTTCTACAGTCCGACGATCTGG CCACGTCTCGTATGCCGTCTTCTGCTTG
## 3 GTTCAGAGTTCTACAGTCCGACGATCTGG CCAGCTGTCGTATGCCGTCTTCTGCTTG
## 1 GTTCAGAGTTCTACAGTCCGACGATCTGG CCAGCTGTCGTATGCCGTCTTCTGCTTG
## seqFile
## 2 /Library/Frameworks/R.framework/Versions/3.5/Resources/library/SELEX/extdata/R0.fastq.gz
## 3 /Library/Frameworks/R.framework/Versions/3.5/Resources/library/SELEX/extdata/R0.fastq.gz
## 1 /Library/Frameworks/R.framework/Versions/3.5/Resources/library/SELEX/extdata/R2.fastq.gz
# Sample handles: two round-0 libraries (one for training the Markov model, one
# held out for cross-validation) and the round-2 library that will be modelled.
r0.train <- selex.sample(
  seqName = "R0.libraries",
  sampleName = "R0.barcodeGC",
  round = 0
)
r0.test <- selex.sample(
  seqName = "R0.libraries",
  sampleName = "R0.barcodeCG",
  round = 0
)
dataSample <- selex.sample(
  seqName = "R2.libraries",
  sampleName = "ExdHox.R2",
  round = 2
)
A Markov model is built, information gain is used to identify the k-mer length of the binding site, k-mer tables are built, and probes are counted in a way that corrects for the zero-deflated nature of the data.
# Markov model, step 1: determine kmax from the held-out round-0 sample; the
# value is handed to selex.mm() as Kmax below.
kmax <- selex.kmax(sample = r0.test)
## Counting [R0.libraries.R0.barcodeCG.0][ K = 1 ]
## Counting [R0.libraries.R0.barcodeCG.0][ K = 2 ]
## Counting [R0.libraries.R0.barcodeCG.0][ K = 3 ]
## Counting [R0.libraries.R0.barcodeCG.0][ K = 4 ]
## Counting [R0.libraries.R0.barcodeCG.0][ K = 5 ]
## Counting [R0.libraries.R0.barcodeCG.0][ K = 6 ]
## [ sample id : R0.libraries.R0.barcodeCG.0, filter: variableRegionIncludeRegex:null,variableRegionExcludeRegex:null,variableRegionGroupRegex:null ]
## [ R0.libraries.R0.barcodeCG.0.kmax = 5 ]
# Markov model, step 2: train on the round-0 training library (Hm 16bp) and
# cross-validate against the held-out round-0 library. order is passed as NA;
# the order with the best score is read back from the model summary afterwards.
mm <- selex.mm(
  sample = r0.train,
  order = NA,
  crossValidationSample = r0.test,
  Kmax = kmax,
  mmMethod = "TRANSITION"
)
## Overwriting Kmax = 5
## Counting [R0.libraries.R0.barcodeGC.0][ K = 1 ]
## Counting [R0.libraries.R0.barcodeGC.0][ K = 2 ]
## Counting [R0.libraries.R0.barcodeGC.0][ K = 3 ]
## Counting [R0.libraries.R0.barcodeGC.0][ K = 4 ]
## Counting [R0.libraries.R0.barcodeGC.0][ K = 5 ]
## [ markovLength = 3 ]
## [ maxR = 0.989094 ]
## [ Model = MarkovModelInfo [markovLength=3, markovLengthTotalCount=483784, markovR2=0.9890939798281818, markovCountsPath=/var/folders/6r/52dcl0sj1yg0z69w89t6fjyr0000gp/T/Rtmp7nUaGI//R0.libraries.R0.barcodeGC.0.3.dat_A7FE7F4E2E78A43F892C7F3227FFA520, markovObjPath=/var/folders/6r/52dcl0sj1yg0z69w89t6fjyr0000gp/T/Rtmp7nUaGI//R0.libraries.R0.barcodeGC.0.3.dat_A7FE7F4E2E78A43F892C7F3227FFA520.prob.obj, sample=config.ExperimentReference@1188e820, markovModelMethod=TRANSITION, crossValidationSample=config.ExperimentReference@7a46a697, filter=variableRegionIncludeRegex:null,variableRegionExcludeRegex:null,variableRegionGroupRegex:null,kmerIncludeRegex:null,kmerExcludeRegex:null,kmerIncludeOnly:null] ]
# Pick the Markov model order with the highest R score.
# which.max() always yields a single index (the first maximum, NAs skipped), so
# mm.order stays a scalar; which(x == max(x)) could return several indices on a
# tie, or none at all when R contains an NA, breaking (mm.order + 1):libLen.
mmscores <- selex.mmSummary(sample = r0.train)
ido <- which.max(mmscores$R)
mm.order <- mmscores$Order[ido]
Next, information gain is used to choose the k-mer length (kLen) that will seed the model:
# Information gain is used to choose kLen: it is evaluated for every k from one
# above the selected Markov order up to the full variable-region length.
libLen <- as.numeric(
  as.character(selex.getAttributes(dataSample)$VariableRegionLength)
)
selex.infogain(
  sample = dataSample,
  k = (mm.order + 1):libLen,
  markovModel = mm
)
## Counting [InfoGain][ K = 3 ]
## Counting [InfoGain][ K = 4 ]
## Counting [InfoGain][ K = 5 ]
## Counting [InfoGain][ K = 6 ]
## Counting [InfoGain][ K = 7 ]
## Counting [InfoGain][ K = 8 ]
## Counting [InfoGain][ K = 9 ]
## Counting [InfoGain][ K = 10 ]
## Counting [InfoGain][ K = 11 ]
## Counting [InfoGain][ K = 12 ]
## Counting [InfoGain][ K = 13 ]
## Counting [InfoGain][ K = 14 ]
## Counting [InfoGain][ K = 15 ]
## Counting [InfoGain][ K = 16 ]
## [1] 2.420417
infoscores <- selex.infogainSummary(sample = dataSample)

# Information gain barplot, with the best-scoring k highlighted in red.
# which.max() guarantees exactly one highlighted bar and a scalar kLen even if
# two lengths tie; kLen is later used as a single k / seedLen value.
idx <- which.max(infoscores$InformationGain)
colstring <- rep('BLUE', nrow(infoscores))
colstring[idx] <- 'RED'
barplot(
  height = infoscores$InformationGain,
  names.arg = infoscores$K,
  col = colstring,
  xlab = "Oligonucleotide Length (bp)",
  ylab = "Information Gain (bits)"
)
kLen <- infoscores$K[idx]
# Build the k-mer affinity table at kLen, the length selected by the
# information-gain analysis.
# NOTE(review): this comment previously said a fixed kLen.f = 12 would be used
# instead of kLen (then reported as 13). No kLen.f is defined in the visible
# code and the call below passes kLen, so that remark appears stale -- confirm
# the intended k-mer length.
data.kmerTable = selex.affinities(sample=dataSample, k=kLen, markovModel=mm)
## Counting [R2.libraries.ExdHox.R2.2][ K = 9 ]
## [ Lowest Count = 1 ]
# Put the highest-affinity k-mers first, then drop the row names so rows are
# renumbered 1..n in the new order.
data.kmerTable <- data.kmerTable[order(-data.kmerTable$Affinity), ]
rownames(data.kmerTable) <- NULL
# Per-probe count table for the round-2 sample, computed with the Markov model.
data.probeCounts <- getProbeCounts(dataSample, markovModel = mm)
## Counting [R2.libraries.split.1.ExdHox.R2.split.1.2][ K = 16 ]
## [ Lowest Count = 1 ]
## Counting [R2.libraries.split.2.ExdHox.R2.split.2.2][ K = 16 ]
## [ Lowest Count = 1 ]
# Column-wise overview of the probe table (Probe, ObservedCount, Probability,
# Round -- see the output below).
summary(data.probeCounts)
## Probe ObservedCount Probability Round
## Length:24493 Min. :0.00000 Min. :2.813e-11 Min. :2
## Class :character 1st Qu.:0.00000 1st Qu.:2.525e-10 1st Qu.:2
## Mode :character Median :0.00000 Median :3.863e-10 Median :2
## Mean :0.03768 Mean :4.486e-10 Mean :2
## 3rd Qu.:0.00000 3rd Qu.:5.723e-10 3rd Qu.:2
## Max. :4.00000 Max. :3.667e-09 Max. :2
# Show the first ten probes. head() stops at the last row, whereas
# data.probeCounts[1:10, ] would pad with NA rows if fewer than ten exist.
print(head(data.probeCounts, 10))
## Probe ObservedCount Probability Round
## 1 GAGAATGATTGATTAC 4 3.691869e-10 2
## 2 GTTGATTGATGGGTTT 3 1.198516e-09 2
## 3 GTGATTGATTGTTTTC 3 1.078425e-09 2
## 4 GTAATCAATCACTTTA 3 3.952812e-10 2
## 5 GATGATTGATCGATGT 3 5.086886e-10 2
## 6 GAATGATTGATTACAT 3 5.432474e-10 2
## 7 ATGATTGATTATGTTT 3 1.434117e-09 2
## 8 ATGATTGATTAGTTTT 3 1.355661e-09 2
## 9 AATGATTGATTATTGT 3 1.051582e-09 2
## 10 TTTGATTGATTGGTTA 2 1.397763e-09 2
# Construct the SelexGLM model object. The library description (variable-region
# length, fixed flanks, rounds) is specific to this data set; the flanking
# sequences are the ones listed for ExdHox.R2 in the sample summary above.
model <- new(
  "model",
  varRegLen = libLen,
  leftFixedSeq = "GTTCAGAGTTCTACAGTCCGACGATCTGG",
  rightFixedSeq = "CCAGCTGTCGTATGCCGTCTTCTGCTTG",
  seedLen = kLen,
  leftFixedSeqOverlap = 4,
  initialAffinityCutoff = 0.00,
  missingValueSuppression = 1,
  minSeedValue = 0.001,
  upFootprintExtend = 2,
  includeWindowFactor = FALSE,
  confidenceLevel = 0.95,
  verbose = FALSE,
  useFixedValuesOffset.N = FALSE,
  rounds = list(c(2)),
  rcSymmetric = FALSE,
  minAffinity = 0.01
)
Inspect current state of model object:
# Nucleotide (N) feature slots of the freshly constructed model; the printed
# N.values are all zero at this point (see the output below).
model@features@N
## An object of class 'N'
##
## Slot "seedLen": 9
##
## Slot "N.upFootprintExtend": 2
##
## Slot "N.downFootprintExtend": 2
##
## Slot "fS.upFootprintExtend": 2
##
## Slot "fS.downFootprintExtend": 2
##
## Slot "fpLen": 13
##
## Slot "N.set": 1 2 3 4 5 6 7 8 9 10 11 12 13
##
## Slot "N.equivMat":
## 13 x 13 null equivalence matrix
##
## Slot "N.values":
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.C 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.G 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.T 0 0 0 0 0 0 0 0 0 0 0 0 0
##
##
## Slot "N.errors":
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.C 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.G 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.T 0 0 0 0 0 0 0 0 0 0 0 0 0
##
##
## Slot "N.z":
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.C 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.G 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.T 0 0 0 0 0 0 0 0 0 0 0 0 0
##
##
## Slot "N.sig":
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.C 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.G 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.T 0 0 0 0 0 0 0 0 0 0 0 0 0
##
##
## Slot "N.oldValues":
## <4 x 13 x 0 array of double>
##
## Slot "N.oldErrors":
## <4 x 13 x 0 array of double>
##
## Slot "N.oldZ":
## <4 x 13 x 0 array of double>
##
## Slot "N.oldSig":
## <4 x 13 x 0 array of double>
# Seed the model: derive a PSAM from the k-mer affinity table and store it in
# the model (the all-zero betas printed above are the state before this step).
addSeedPsam(model) <- seedTable2psam(model, data.kmerTable)
# Nucleotide betas once the seed PSAM is in place
model@features@N
## An object of class 'N'
##
## Slot "seedLen": 9
##
## Slot "N.upFootprintExtend": 2
##
## Slot "N.downFootprintExtend": 2
##
## Slot "fS.upFootprintExtend": 2
##
## Slot "fS.downFootprintExtend": 2
##
## Slot "fpLen": 13
##
## Slot "N.set": 1 2 3 4 5 6 7 8 9 10 11 12 13
##
## Slot "N.equivMat":
## 13 x 13 null equivalence matrix
##
## Slot "N.values":
## 1 2 3 4 5 6 7 8
## N.A 0 0 0.0000000 -1.100719 -1.748610 0.000000 -3.042044 -1.808151
## N.C 0 0 -1.5065161 -3.042044 -3.042044 -3.042044 -1.841388 -1.016157
## N.G 0 0 -0.5525097 -3.042044 0.000000 -3.042044 -3.042044 -1.012792
## N.T 0 0 -0.4682265 0.000000 -3.042044 -3.042044 0.000000 0.000000
## 9 10 11 12 13
## N.A -1.293313 0.000000 -3.042044 0 0
## N.C -3.042044 -3.042044 -3.042044 0 0
## N.G 0.000000 -3.042044 -3.042044 0 0
## N.T -2.042044 -3.042044 0.000000 0 0
##
##
## Slot "N.errors":
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.C 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.G 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.T 0 0 0 0 0 0 0 0 0 0 0 0 0
##
##
## Slot "N.z":
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.C 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.G 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.T 0 0 0 0 0 0 0 0 0 0 0 0 0
##
##
## Slot "N.sig":
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## N.A 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.C 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.G 0 0 0 0 0 0 0 0 0 0 0 0 0
## N.T 0 0 0 0 0 0 0 0 0 0 0 0 0
##
##
## Slot "N.oldValues":
## <4 x 13 x 0 array of double>
##
## Slot "N.oldErrors":
## <4 x 13 x 0 array of double>
##
## Slot "N.oldZ":
## <4 x 13 x 0 array of double>
##
## Slot "N.oldSig":
## <4 x 13 x 0 array of double>
# First fitting round, run on the complete probe table.
data <- data.probeCounts
data <- topModelMatch(data, model)
# Build the design matrix from the aligned probes
data <- addDesignMatrix(data, model)
# Regression expression over the independent features in the design matrix
regressionFormula <- updatedRegressionFormula(data, model)
fit <- glm(
  regressionFormula,
  data = data,
  family = poisson(link = "log")
)
# Store the fitted coefficients back into the model
model <- addNewBetas(model, data, fit)
# Nucleotide features after the first round of fitting.
# TODO(GABRIELLA): this plotting command is not working -- needs to be fixed.
plot(model, Nplot.ddG = TRUE, verticalPlots = TRUE)
# Further fitting rounds (2 and 3), each restarting from the model produced by
# the previous round. The loop ends early once the number of rows left after
# matching and design-matrix construction is the same as in the previous pass.
data <- data.probeCounts
data.nrow <- nrow(data)
for (i in 2:3) {
  data <- topModelMatch(data, model)
  data <- addDesignMatrix(data, model)

  # Same row count as last time: nothing changed, so stop refitting.
  if (data.nrow == nrow(data)) {
    print("Stability Reached")
    break
  }
  data.nrow <- nrow(data)

  regressionFormula <- updatedRegressionFormula(data, model)
  fit <- glm(
    regressionFormula,
    data = data,
    family = poisson(link = "log")
  )
  # Nucleotide features after the i'th round of fitting
  model <- addNewBetas(model, data, fit)
}
# Nucleotide beta values held by the model after the refitting loop.
model@features@N@N.values
## 1 2 3 4 5 6
## N.A -0.01734237 0.15359548 0.0000000 -1.206585 -1.245653 0.000000
## N.C -0.27795622 -0.12276138 -0.8281241 -6.837651 -7.256490 -6.869103
## N.G 0.00000000 0.00000000 -0.5393352 -6.726998 0.000000 -7.274068
## N.T -0.24185909 -0.01475184 -0.2903688 0.000000 -7.140814 -1.000000
## 7 8 9 10 11 12
## N.A -7.2152042 -1.4381072 -0.7816134 0.00000 -1.0000000 -0.59259342
## N.C -0.7656346 -0.8076662 -7.2286227 -1.00000 -6.7070581 -0.24269649
## N.G -6.8276901 -0.6853362 0.0000000 -6.98462 -0.9237684 -0.04816454
## N.T 0.0000000 0.0000000 -1.4976298 -1.00000 0.0000000 0.00000000
## 13
## N.A 0.0000000
## N.C -0.9659581
## N.G -0.2320415
## N.T -0.6828586
Save model object for future reference:
# Write the fitted model object to an .RData file; the path is relative, so the
# file is created in the current working directory.
save(model, file = "HowToFitMononucleotideModel.Result.RData")