I
am working on human colon cancer recently. I want to see different expression
gene in colon cancer cells lines. I downloaded GSE46549 dataset from NCBI (gene
expression analysis of 6 colon cancer cell lines (HCT116, HT-29, RKO, SW480,
LIM1512 and CaCo2), http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE46549).
Each sample has two time points. The platforms of these microarray is GPL6244
[HuGene-1_0-st] Affymetrix Human Gene 1.0 ST Array [transcript (gene) version].
In R/Bioconductor, we can use package affy
to read these microarray raw data. Here is what I did using R.
# using “affy” to read Affymetrix microarray raw
data
# using
method in “simpleaffy” to see if the
RNA samples degrade or not
> library (affy)
> library (simpleaffy)
# Opens
file browser to select specific CEL files.
> colon_data <- ReadAffy (widget=TRUE)
# show data
information
> colon_data
AffyBatch object
size of arrays=1050x1050 features (20 kb)
cdf=HuGene-1_0-st-v1 (32321 affyids)
number of samples=12
number of genes=32321
annotation=hugene10stv1
notes=
# define 12 colors,
> col <- rainbow(12)
# Assessing degree of RNA
degradation
> RNA_deg <- AffyRNAdeg (colon_data)
# plotting
map
> plotAffyRNAdeg(RNA_deg, col=col)
> legend(legend=sampleNames(colon_data),
x="topright", lty=1, cex=0.55, col=col)
# Check if
the slopes and profiles of the lines are similar for all the arrays.
# See the
distribution of gene expression value of raw data with boxplot. The distribution
of raw value show difference
> boxplot(colon_data, col=col)
> lab <- sampleNames (colon_data)
> text(x =
seq_along(lab), y = par("usr")[3]-0.2, srt = 35, adj = 1,
labels = lab, xpd = TRUE)
#
normalizing data with rma method
> colon_data_normal <- rma(colon_data)
Background correcting
Normalizing
Calculating Expression
# get the expression value of samples
> colon_data_exp <- exprs(colon_data_normal)
# See the
distribution of gene expression value of normalinzed data with boxplot.
> boxplot (colon_data_exp,xaxt="n", col=col)
> text(x =
seq_along(lab), y = par("usr")[3]-0.2, srt = 35, adj = 1,
labels = lab, xpd = TRUE)
Hierarchical
Clustering
# calculate
the distance of samples
> distance <- dist(t(colon_data_exp))
# Do sample
cluster and plote
> hc <- hclust(distance, method="ave")
> plot(hc, hang=-1)
# cut tree
into 4 clusters
> rect.hclust(hc, k=4)
# change matrix data type to data.frame
> colon_data_exp <- as.data.frame(colon_data_exp)
# rename data.frame column name
> names (colon_data_exp) <- c("CaCo2_0h",
"CaCo2_48h", "HCT116_0h", "HCT116_48h",
"HT29_0h", "HT29_48h", "LIM1215_0h",
"LIM1215_48h", "RKO_0h", "RKO_48h", "SW480_0h",
"SW480_48h" )
# show first 6 rows of expression data
> head(colon_data_exp)
CaCo2_0h
CaCo2_48h HCT116_0h HCT116_48h HT29_0h
HT29_48h LIM1215_0h LIM1215_48h
RKO_0h RKO_48h SW480_0h SW480_48h
7892501 6.752377
6.592593 7.576409 6.685253 6.006243 6.271173 4.511232
5.958222 5.371307 6.156128 5.779157
5.976742
7892502 6.846297
6.166110 6.173140 5.301924 5.895528 6.652065 5.657238
5.858828 6.478357 6.632395 5.883862
5.483908
7892503 3.592670
4.089133 3.595432 3.983314 3.468884 3.329923 3.598740
4.242956 4.065588 3.548275 3.785297
3.397205
7892504 9.180533
9.308925 8.655324 8.786714 8.101120 8.157576 9.133334
8.804358 8.933208 9.067537 8.494151
8.202562
7892505 3.966066
4.963173 4.181195 4.357158 4.238690 4.334786 3.705270
4.698990 3.530189 4.019916 4.057861
3.633473
7892506 6.161011
5.537104 5.048375 5.354080 6.311650 6.777239 6.325180
6.337767 5.204783 6.191189 6.470301
5.762595
#using
corrgram method in corrgram package to see the samples correlation
> library (corrgram)
> corrgram(cor(colon_data_exp),
lower.panel=panel.conf, upper.panel=panel.pie, col.regions=cm.colors,
main="Sample Correlation")
# loading
limma package, using linear model to do gene different expression analysis.
> library (limma)
# separate
samples into 6 group
> groups <- c("CaCo2", "CaCo2",
"HCT116", "HCT116", "HT29", "HT29",
"LIM1215", "LIM1215", "RKO", "RKO",
"SW480", "SW480")
# change
groups to factor
> groups <- as.factor(groups)
# using
model.matrix to b
uild the
model matrix
> des <- model.matrix(~groups)
# show
matrix design
> des
(Intercept)
groupsHCT116 groupsHT29 groupsLIM1215 groupsRKO groupsSW480
1 1 0 0 0 0 0
2 1 0 0 0 0 0
3 1 1 0 0 0 0
4 1 1 0 0 0 0
5 1 0 1 0 0 0
6 1 0 1 0 0
0
7 1 0 0 1 0 0
8 1 0 0 1 0 0
9 1 0 0 0 1 0
10 1 0 0 0 1 0
11 1 0 0 0 0 1
12 1 0 0 0 0 1
attr(,"assign")
[1] 0 1 1 1 1 1
attr(,"contrasts")
attr(,"contrasts")$groups
[1] "contr.treatment"
# Do analysis using a linear model
# using limFit() followed by eBayes() method
> fit <- lmFit(colon_data_exp, des)
> fit <- eBayes(fit)
#
using toptable to see the different expression gene
> toptable(fit, coef=2)
> toptable(fit, coef=3)
> toptable(fit, coef=4)
> toptable(fit, coef=5)
> toptable(fit, coef=6)
logFC t
P.Value adj.P.Val B
8139212 6.388442 65.60093 1.119720e-13 3.619048e-09 19.99287
7918694 5.064495 44.59762 3.955354e-12 4.240144e-08 17.78992
7963427 6.126554 43.93864 4.537555e-12 4.240144e-08 17.69051
7906079 -5.324317 -42.03273 6.830113e-12 4.240144e-08 17.38849
7923547 5.250144 41.45834 7.753910e-12 4.240144e-08 17.29302
8168557 -5.885267 -41.27382 8.079409e-12 4.240144e-08 17.26189
8166611 -6.039658 -40.70440 9.183197e-12 4.240144e-08 17.16440
8088180 4.370571 38.74243 1.447748e-11 4.913646e-08 16.81113
7922337 -5.164845 -38.60976 1.494225e-11 4.913646e-08 16.78623
7991070 4.059239 38.27807 1.617830e-11 4.913646e-08 16.72337
# get 5000
differential gene each group
> diff_2 <- toptable(fit, coef=2, number=5000)
> diff_3 <- toptable(fit, coef=3, number=5000)
> diff_4 <- toptable(fit, coef=4, number=5000)
> diff_5 <- toptable(fit, coef=5, number=5000)
> diff_6 <- toptable(fit, coef=6, number=5000)
# get
differential gene that there is 1.5 fold changes and adjust p value < 0.01
of each group
> diff_2 <- diff_2[abs(diff_2$logFC) >= 1.5
& diff_2$adj.P.Val < 0.01,]
> diff_3 <- diff_3[abs(diff_2$logFC) >= 1.5
& diff_3$adj.P.Val < 0.01,]
> diff_3 <- diff_3[abs(diff_3$logFC) >= 1.5
& diff_3$adj.P.Val < 0.01,]
> diff_4 <- diff_4[abs(diff_4$logFC) >= 1.5
& diff_4$adj.P.Val < 0.01,]
> diff_5 <- diff_5[abs(diff_5$logFC) >= 1.5
& diff_5$adj.P.Val < 0.01,]
> diff_6 <- diff_6[abs(diff_6$logFC) >= 1.5
& diff_6$adj.P.Val < 0.01,]
# get the
row names of differential gene.
> row_2 <- rownames(diff_2)
> row_3 <- rownames(diff_3)
> row_4 <- rownames(diff_4)
> row_5 <- rownames(diff_5)
> row_6 <- rownames(diff_6)
# merge all
the gene names to one dataset.
> diff_row_name <- union(row_2, row_3)
> diff_row_name <- union(diff_row_name, row_4)
> diff_row_name <- union(diff_row_name, row_5)
> diff_row_name <- union(diff_row_name, row_6)
# get all
the differential expression gene value from gene expression dataset. Now
colon_diff_exp contain all differential gene.
> colon_diff_exp <- colon_data_exp[match(diff_row_name,
rownames(colon_data_exp)),]
# show the
differential expression gene number
> nrow(colon_diff_exp)
[1] 3184
# plot heatmap to see clusters of differential gene.
> library (gplots)
> heatmap.2(as.matrix(colon_diff_exp), trace="none", col=bluered, density.info="none", labRow=F, srtCol=40)
# k-means
cluster for differential gene. Differential gene was classed into 10 group
> km <- kmeans(colon_diff_exp, 10)
> kmeansclus <- km$cluster
> names(kmeansclus) <- NULL
# merge
k-means group number to gene expression dataset
> km_colon_diff <- cbind (colon_diff_exp,
kmeansclus)
# sorting
gene using k-means group number
> km_colon_diff <-
km_colon_diff[order(km_colon_diff$kmeansclus),]
> library (lattice)
> levelplot (as.matrix (t(km_colon_diff)), col.regions=cm.colors, aspect=1.2, xlab="", ylab="")
# using the
mean data to do dot plot
> RKO <-rowMeans(colon_diff_exp[,9:10])
> CaCo2 <-rowMeans(colon_diff_exp[,1:2])
> HCT116 <-rowMeans(colon_diff_exp[,3:4])
> HT29 <-rowMeans(colon_diff_exp[,5:6])
> LIM1215 <-rowMeans(colon_diff_exp[,7:8])
> SW480 <-rowMeans(colon_diff_exp[,11:12])
> mean_diff_exp <- cbind(RKO, CaCo2, HCT116, HT29,
LIM1215, SW480)
> pairs(mean_diff_exp,col=kmeansclus, pch=19, cex=0.6)
# Show the
different cluster
by Changlong