Skip to content

Commit 78253bd

Browse files
committed
de done
1 parent 9a35d51 commit 78253bd

2 files changed

Lines changed: 192 additions & 3 deletions

File tree

data/samples.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
sample condition
2+
HBR_Rep1 HBR
3+
HBR_Rep2 HBR
4+
HBR_Rep3 HBR
5+
UHR_Rep1 UHR
6+
UHR_Rep2 UHR
7+
UHR_Rep3 UHR

rna_seq.md

Lines changed: 185 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,190 @@ now we can import the salmon quantification:
100100

101101
```R
102102
samples <- read.table("samples.txt", header = TRUE)
103-
104-
files <- file.path("salmon", samples$quant, "quant.sf")
105-
names(files) <- paste0("sample", 1:6)
103+
files <- file.path("quant", samples$quant, "quant.sf")
104+
names(files) <- paste0(samples$sample)
106105
txi.salmon <- tximport(files, type = "salmon", tx2gene = tx2gene, reader = read_tsv)
107106
```
107+
108+
take a look at the data:
109+
110+
```R
111+
head(txi.salmon$counts)
112+
```
113+
114+
## differential expression using DeSeq2
115+
116+
install the necessary package
117+
118+
```R
119+
biocLite('DESeq2')
120+
```
121+
122+
then load it:
123+
124+
```R
125+
library(DESeq2)
126+
```
127+
128+
Instantiate the DESeqDataSet and generate result table. See ?DESeqDataSetFromTximport and ?DESeq for more information about the steps performed by the program.
129+
130+
131+
```R
132+
dds <- DESeqDataSetFromTximport(txi.salmon, samples, ~condition)
133+
dds <- DESeq(dds)
134+
res <- results(dds)
135+
```
136+
137+
run the `summary` command to have an idea of how many genes are up and down-regulated between the two conditions
138+
139+
`summary(res)`
140+
141+
DESeq uses a negative binomial distribution. Such distribution has two parameters: mean and dispersion. The dispersion is a parameter describing how much the variance deviates from the mean.
142+
143+
You can read more about the methods used by DESeq2 in the [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0550-8) or the [vignette](https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq/inst/doc/DESeq.pdf)
144+
145+
Plot dispersions:
146+
147+
```R
148+
plotDispEsts(dds, main="Dispersion plot")
149+
```
150+
151+
For clustering and heatmaps, we need to log transform our data:
152+
153+
```R
154+
rld <- rlogTransformation(dds)
155+
head(assay(rld))
156+
```
157+
158+
Then, we create a sample distance heatmap:
159+
160+
```R
161+
library(RColorBrewer)
162+
library(gplots)
163+
164+
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(samples$condition))])
165+
sampleDists <- as.matrix(dist(t(assay(rld))))
166+
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
167+
col=colorpanel(100, "black", "white"),
168+
ColSideColors=mycols[samples$condition],
169+
RowSideColors=mycols[samples$condition],
170+
margin=c(10, 10), main="Sample Distance Matrix")
171+
```
172+
173+
We can also plot a PCA:
174+
175+
```R
176+
DESeq2::plotPCA(rld, intgroup="condition")
177+
```
178+
179+
It is time to look at some p-values:
180+
181+
```R
182+
table(res$padj<0.05)
183+
res <- res[order(res$padj), ]
184+
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
185+
names(resdata)[1] <- "Gene"
186+
head(resdata)
187+
```
188+
189+
Examine plot of p-values, the MA plot and the Volcano Plot:
190+
191+
```R
192+
hist(res$pvalue, breaks=50, col="grey")
193+
DESeq2::plotMA(dds, ylim=c(-1,1), cex=1)
194+
195+
# Volcano plot
196+
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2)))
197+
with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
198+
```
199+
200+
## KEGG pathway analysis
201+
202+
As always, install and load the necessary packages:
203+
204+
```R
205+
biocLite("AnnotationDbi")
206+
biocLite("org.Hs.eg.db")
207+
biocLite("pathview")
208+
biocLite("gage")
209+
biocLite("gageData")
210+
211+
library(AnnotationDbi)
212+
library(org.Hs.eg.db)
213+
library(pathview)
214+
library(gage)
215+
library(gageData)
216+
```
217+
218+
Let’s use the mapIds function to add more columns to the results. The row.names of our results table has the Ensembl gene ID (our key), so we need to specify keytype=ENSEMBL. The column argument tells the mapIds function which information we want, and the multiVals argument tells the function what to do if there are multiple possible values for a single input value. Here we ask to just give us back the first one that occurs in the database. Let’s get the Entrez IDs, gene symbols, and full gene names.
219+
220+
```R
221+
res$symbol = mapIds(org.Hs.eg.db,
222+
keys=row.names(res),
223+
column="SYMBOL",
224+
keytype="ENSEMBL",
225+
multiVals="first")
226+
res$entrez = mapIds(org.Hs.eg.db,
227+
keys=row.names(res),
228+
column="ENTREZID",
229+
keytype="ENSEMBL",
230+
multiVals="first")
231+
res$name = mapIds(org.Hs.eg.db,
232+
keys=row.names(res),
233+
column="GENENAME",
234+
keytype="ENSEMBL",
235+
multiVals="first")
236+
237+
head(res)
238+
```
239+
240+
```R
241+
data(kegg.sets.hs)
242+
data(sigmet.idx.hs)
243+
kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs]
244+
head(kegg.sets.hs, 3)
245+
```
246+
247+
```R
248+
foldchanges = res$log2FoldChange
249+
names(foldchanges) = res$entrez
250+
head(foldchanges)
251+
```
252+
253+
```R
254+
# Get the results
255+
keggres = gage(foldchanges, gsets=kegg.sets.hs, same.dir=TRUE)
256+
257+
# Look at both up (greater), down (less), and statatistics.
258+
lapply(keggres, head)
259+
260+
# Get the pathways
261+
keggrespathways = data.frame(id=rownames(keggres$greater), keggres$greater) %>%
262+
tbl_df() %>%
263+
filter(row_number()<=5) %>%
264+
.$id %>%
265+
as.character()
266+
keggrespathways
267+
268+
# Get the IDs.
269+
keggresids = substr(keggrespathways, start=1, stop=8)
270+
keggresids
271+
```
272+
273+
```R
274+
# Define plotting function for applying later
275+
plot_pathway = function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="hsa", new.signature=FALSE)
276+
277+
# plot multiple pathways (plots saved to disk and returns a throwaway list object)
278+
tmp = sapply(keggresids, function(pid) pathview(gene.data=foldchanges, pathway.id=pid, species="hsa"))
279+
```
280+
281+
```R
282+
data(go.sets.hs)
283+
data(go.subs.hs)
284+
gobpsets = go.sets.hs[go.subs.hs$BP]
285+
286+
gobpres = gage(foldchanges, gsets=gobpsets, same.dir=TRUE)
287+
288+
lapply(gobpres, head)
289+
```

0 commit comments

Comments
 (0)