李祥春
2016年6月3号
For toolkits contributed by me visit https://github.com/lixiangchun.
The R codes in the PPT have been tested with R 3.2.2. You may need to install R 3.2 or newer to proceed with this tutorial.
$ mkdir work
$ cd work
$ R
> q() ## 退出
> help(lm)
> ?lm
> library(survival)
> help(package=survival) ## 查看survival包的整体概要
> x = c(1,2,3)
> ls() ## 查看有哪些变量
> rm(x, y) ## 删除指定变量
> x <- c(10.4, 5.6, 3.1, 6.4, 21.7)
> x = c(10.4, 5.6, 3.1, 6.4, 21.7)
> assign("x", c(10.4, 5.6, 3.1, 6.4, 21.7))
> c(10.4, 5.6, 3.1, 6.4, 21.7) -> x
> y <- c(x, 0, x) # 使用已有向量构建新的向量
> v <- 2*x + y + 1
> cv <- sd(v) / mean(v) ## 变异系数
> seq(-5, 5, by=.2) -> s3
> s4 <- seq(length=51, from=-5, by=.2)
> ?seq ## more information about seq(...)
> s5 <- rep(x, times=5) ## five copies of x end-to-end in s4
> s6 <- rep(x, each=5) ## repeat each element in x five times
x <- c(1,2,5,8,20)
temp <- x > 13
print(temp)
[1] FALSE FALSE FALSE FALSE TRUE
print( (7==9) | (7>0) )
[1] TRUE
print(( 7==9) & (7>0) )
[1] FALSE
## 产检的运算符:<, <=, >, >=, ==, &(与), |(或), !(非)
z <- c(1:3,NA) # NA: Not Available
ind <- is.na(z)
print(z)
[1] 1 2 3 NA
print(ind)
[1] FALSE FALSE FALSE TRUE
print(0/0) # NaN: Not a Number
[1] NaN
print(Inf - Inf)
[1] NaN
Z <- c("green", "blue", "red")
print(Z)
[1] "green" "blue" "red"
labs <- paste(LETTERS[1:8], letters[1:8], sep="->")
print(labs)
[1] "A->a" "B->b" "C->c" "D->d" "E->e" "F->f" "G->g" "H->h"
n <- length(labs) # 获取元素个数
print(n)
[1] 8
a <- c("green","green","red","blue","green","red")
a <- factor(a)
print(a)
[1] green green red blue green red
Levels: blue green red
a <- factor(a, levels = c("red","green","blue"))
print(a)
[1] green green red blue green red
Levels: red green blue
b <- c(1,1,4,4,3,2,3,2,4)
b <- factor(b)
print(b)
[1] 1 1 4 4 3 2 3 2 4
Levels: 1 2 3 4
e <- numeric()
print(e)
numeric(0)
print(length(e))
[1] 0
e[1] <- 1
e[3] <- 3
e[8] <- 8
print(e)
[1] 1 NA 3 NA NA NA NA 8
a1 <- c(5,6,8)
a2 <- 1:3
a3 <- c("a","b","c")
d <- data.frame(a1, a2, a3)
print(d)
a1 a2 a3
1 5 1 a
2 6 2 b
3 8 3 c
d$a4 <- c("A","B","C") # 给数据框添加新变量
d$a5 <- c(3, NA, 5)
print(d)
a1 a2 a3 a4 a5
1 5 1 a A 3
2 6 2 b B NA
3 8 3 c C 5
read.table(...)
read.csv(...) ## 读取csv文件
read.xls(...) ## gdata包,读取excel文件
fread(...) ## data.table包,读取大文件,速度非常快
data(...) ## 加载指定数据集
## Installing required R packages
## On MacOS or Linux
install.packages("devtools")
library(devtools)
## On Windows, you need to install additional packages
#install.packages("RCurl")
#install.packages("httr")
#library(RCurl)
#library(httr)
#set_config( config( ssl_verifypeer = 0L ) )
#options(download.file.method = "wininet") ## Windows only
install_github("lixiangchun/lxctk")
install_github("lixiangchun/plotr")
## Loading all required package in this section
## lxctk is my personal toolkit that has been tested and widely used in my group. It can be installed from Github
library(lxctk)
# Misc R codes with my modification
library(plotr)
# Loading required data set in my R package.
# This dataset includes clinicopathological information for regular-mutated gastric cancer and mutation status of SMGs. It is used in this tutorial.
data("RegularMutatedGC")
MutSigCV Implemented in MATLAB, use 8Gb memory.
MutSigCL - MutSig by clustered mutations
mutsigclfn(bkgrSQLiteDB, obs_data, outfile='out.txt', genes=c(), type="CL", hotspot.alg=c('hclust','ratio'), min.cl=0.2, nperm=1000, mc.cores=4, bkgr_data=dbConnect(dbDriver('SQLite'), bkgrSQLiteDB))
## See
mutsigclfn(bkgrSQLiteDB, obs_data, outfile='out.txt', genes=c(), type='FN', hotspot.alg=c('hclust','ratio'), min.cl=0.2, nperm=1000, mc.cores=4, bkgr_data=dbConnect(dbDriver('SQLite'), bkgrSQLiteDB))
# Define a legend panel, different integer represents different mutation types.
# E.g. 2 - Synonymous, 3 - Missense, ..., 7 - Nonsense
# '0' is used by default to represent synonymous mutation.
legend.panel <- data.frame(V1=c(7,6,5,4,3,2),
V2=c("Nonsense","Splice site","Frame shift","Inframe indel","Missense","Syn."))
# The molecular subtype defined by SMGs identified in my Cancer Research paper.
nmfsubtypes <- RegularMutatedGC$nmf_clustid
# Mutation matrix of SMGs across samples. Rows are samples whereas columns are SMGs.
x <- RegularMutatedGC[, 15:45]
# Sort x column-by-column for better visualization
x <- sortDataFrame(x, decreasing = TRUE)
# Color codes will be used in visualizing matrix 'x'.
cols <- c('white','grey88','#644B39','forestgreen',
'#FF8B00','#9867CC','#DB1C00')
# Passed processed inputs to oncoprinter(...)
# Parameters:
# x: matrix recording SMG mutation category (column) in tumor samples (row).
# cols: colors used for mutation categories
# legend.panel: legend panel mapping mutation category to mutation type (or name)
oncoprinter(x, cols, legend.panel)
## Sorting by specified subtypes.
## Parameters:
## x: initial mutation matrix of SMGs.
## idx: sorting x according to subtypes.
## col.names: columns selected for sorting, use all by default.
## decreasing: if TRUE sort x in decreasing order.
##
## For more info about this function, check its man page.
xx <- sort.data.frame.by.index(x, nmfsubtypes)
## Visualization
oncoprinter(xx, cols, legend.panel)
## oncoprinter(...) can be easily extended to other types of alterations. However, you must provide a legend panel that is consistent with input matrix.
# Assign to new variable
d <- RegularMutatedGC
# Select specific items
d <- d[d$Sex!="" & d$Stage!="" & (d$Lauren_subtype == "diffuse" | d$Lauren_subtype == 'intestinal'),]
# Make sure that the data type is appropriate
# Use F as reference group
d$Sex <- factor(d$Sex, levels=c('F','M'))
# Use intestinal as reference group
d$Lauren_subtype <- factor(d$Lauren_subtype, levels=c('intestinal', 'diffuse'))
d$Stage <- factor(d$Stage, levels=c('Early','Late'))
d$nmf_clustid <- factor(d$nmf_clustid, levels=c(1,2))
d$cohort <- d$Study
# Only the TCGA and Tianjin cohorts have both Surv_time and vital_status. The other 3 cohorts did not contribute to prognosis analysis.
d$cohort[d$cohort!='TCGA'] <- 'Asian'
d$cohort <- factor(d$cohort,levels=c('Asian','TCGA'))
## Colors used for C1/2
C1.col <- '#BE5C81'
C2.col <- '#495456'
## Survival analysis
plot.survfit.lixc(Surv(Surv_time, vital_status) ~ nmf_clustid, d)
## More controls on KM survival analysis
## Parameters:
## legend.labels: legends for survival curves
## lwd: line width
## legend.xycoord: coordinates of the legend
## ci.lab.xycoord: coordinates of p-value
## col: colors
## For more info, please read its manpage.
plot.survfit.lixc(Surv(Surv_time, vital_status) ~ nmf_clustid, d, legend.labels = c("C1","C2"), ci.lab.xycoord=c(30,0.15), legend.xycoord=c(15,0.4), no.ci.lab = TRUE, lwd=3, col = c(C1.col, C2.col), xlim=c(0,80), ylim=c(0,1.1))
## Survival fit
fit <- survfit(Surv(Surv_time, vital_status) ~ nmf_clustid, d)
## ggsurv
ggsurv(fit)
## More controls on ggsurv
## legend.labels: legend labels
## col.surv: colors for survival curves
## confin: remove survival curve confidence intervals
## pval: coordinate to place p-value
## pval.size: font size of p-value
## ggdefault: use ggplot default color scheme
## grid: add grid
ggsurv(fit, legend.labels = c("C1","C2"), col.surv = c(C1.col, C2.col), confin = FALSE, pval = c(9, 0.3), pval.size = 4, ggdefault = TRUE, grid = TRUE)
## Univariate Cox analysis in a single function. A forest plot is produced to exhibit hazards ratio, associated 95% CIs and p-values.
unicoxph(Surv(Surv_time, vital_status) ~ Age + Sex + Lauren_subtype + Stage + nmf_clustid, d)
## More control in unicoxph
## Variable names to be displayed on the forest plot
variables_to_display <- c("Age", "Sex (Male vs. Female)",
"Lauren's subtype (D vs. I)",
"Stage (III/IV vs. I/II",
"Mol. subtype (C2 vs. C1)")
unicoxph(Surv(Surv_time, vital_status) ~ Age + Sex + Lauren_subtype + Stage + nmf_clustid, d, variables_to_display)
## Use coxph to perform multivariate Cox regression analysis
mcox <- coxph(Surv(Surv_time, vital_status) ~ Age + Sex + Lauren_subtype + Stage + nmf_clustid, d)
## Use forest plot to visualize mcox
plot.coxph(coxObj = mcox)
## Use user-specified variable names
plot.coxph(coxObj = mcox, variable.display = variables_to_display)
## For more info about coxph, see its manpage in R survival package.
An example of mutational signature deciphering can be found at http://lixiangchun.github.io/msig/
msig_cols <- c('#702E33','#BE5C81','#836E8D','#EC8C70','#596C22','#A7B762')
data('plot.mutation.signature.ex')
# If you want to save figure to file, supply filename to `figpdf`.
plot.mutation.signature(df, figpdf=NULL, color=msig_cols)
#
plot.mutational.exposures(NULL,
system.file('data/Rank_eq_9.exposures.txt', package = 'lxctk'))
# legoplot representation of 96 mutational categories
legoplot(system.file('data/stad_regular_GC_originalGenomes.txt'))