Sunday, January 18, 2015

Different Expression Gene in colon cancer cell lines with R

I am working on human colon cancer recently. I want to see different expression gene in colon cancer cells lines. I downloaded GSE46549 dataset from NCBI (gene expression analysis of 6 colon cancer cell lines (HCT116, HT-29, RKO, SW480, LIM1512 and CaCo2), http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE46549). Each sample has two time points. The platforms of these microarray is GPL6244 [HuGene-1_0-st] Affymetrix Human Gene 1.0 ST Array [transcript (gene) version]. In R/Bioconductor, we can use package affy to read these microarray raw data. Here is what I did using R.

# using “affy” to read Affymetrix microarray raw data
# using method in “simpleaffy” to see if the RNA samples degrade or not
> library (affy)
> library (simpleaffy)

# Opens file browser to select specific CEL files.
> colon_data <- ReadAffy (widget=TRUE)

# show data information
> colon_data
AffyBatch object
size of arrays=1050x1050 features (20 kb)
cdf=HuGene-1_0-st-v1 (32321 affyids)
number of samples=12
number of genes=32321
annotation=hugene10stv1
notes=

# define 12 colors,
> col <- rainbow(12)

# Assessing degree of RNA degradation
> RNA_deg <- AffyRNAdeg (colon_data)

# plotting map
> plotAffyRNAdeg(RNA_deg, col=col)
> legend(legend=sampleNames(colon_data), x="topright", lty=1, cex=0.55, col=col)

# Check if the slopes and profiles of the lines are similar for all the arrays.



# See the distribution of gene expression value of raw data with boxplot. The distribution of raw value show difference
> boxplot(colon_data, col=col)
> lab <- sampleNames (colon_data)
> text(x =  seq_along(lab), y = par("usr")[3]-0.2, srt = 35, adj = 1, labels = lab, xpd = TRUE)



 # normalizing data with rma method
 > colon_data_normal <- rma(colon_data)
Background correcting
Normalizing
Calculating Expression

# get the expression value of samples
> colon_data_exp <- exprs(colon_data_normal)
  
# See the distribution of gene expression value of normalinzed data with boxplot.
> boxplot (colon_data_exp,xaxt="n", col=col)
> text(x =  seq_along(lab), y = par("usr")[3]-0.2, srt = 35, adj = 1, labels = lab, xpd = TRUE)
  




Hierarchical Clustering

# calculate the distance of samples
> distance <- dist(t(colon_data_exp))

# Do sample cluster and plote
> hc <- hclust(distance, method="ave")
> plot(hc, hang=-1)

# cut tree into 4 clusters
> rect.hclust(hc, k=4)




# change matrix data type to data.frame
> colon_data_exp <- as.data.frame(colon_data_exp)

# rename data.frame column name
> names (colon_data_exp) <- c("CaCo2_0h", "CaCo2_48h", "HCT116_0h", "HCT116_48h", "HT29_0h", "HT29_48h", "LIM1215_0h", "LIM1215_48h", "RKO_0h", "RKO_48h", "SW480_0h", "SW480_48h" )


# show first 6 rows of expression data
> head(colon_data_exp)

  CaCo2_0h CaCo2_48h HCT116_0h HCT116_48h  HT29_0h HT29_48h LIM1215_0h LIM1215_48h   RKO_0h  RKO_48h SW480_0h SW480_48h
7892501 6.752377  6.592593  7.576409   6.685253 6.006243 6.271173   4.511232    5.958222 5.371307 6.156128 5.779157  5.976742
7892502 6.846297  6.166110  6.173140   5.301924 5.895528 6.652065   5.657238    5.858828 6.478357 6.632395 5.883862  5.483908
7892503 3.592670  4.089133  3.595432   3.983314 3.468884 3.329923   3.598740    4.242956 4.065588 3.548275 3.785297  3.397205
7892504 9.180533  9.308925  8.655324   8.786714 8.101120 8.157576   9.133334    8.804358 8.933208 9.067537 8.494151  8.202562
7892505 3.966066  4.963173  4.181195   4.357158 4.238690 4.334786   3.705270    4.698990 3.530189 4.019916 4.057861  3.633473
7892506 6.161011  5.537104  5.048375   5.354080 6.311650 6.777239   6.325180    6.337767 5.204783 6.191189 6.470301  5.762595

#using corrgram method in corrgram package to see the samples correlation

> library (corrgram)
> corrgram(cor(colon_data_exp), lower.panel=panel.conf, upper.panel=panel.pie, col.regions=cm.colors, main="Sample Correlation")



# loading limma package, using linear model to do gene different expression analysis.

> library (limma)

# separate samples into 6 group
> groups <- c("CaCo2", "CaCo2", "HCT116", "HCT116", "HT29", "HT29", "LIM1215", "LIM1215", "RKO", "RKO", "SW480", "SW480")

# change groups to factor
> groups <- as.factor(groups)

# using model.matrix to b
uild the model matrix
> des <- model.matrix(~groups)
# show matrix design
> des
   (Intercept) groupsHCT116 groupsHT29 groupsLIM1215 groupsRKO groupsSW480
1            1            0          0             0         0           0
2            1            0          0             0         0           0
3            1            1          0             0         0           0
4            1            1          0             0         0           0
5            1            0          1             0         0           0
6            1            0          1             0         0           0
7            1            0          0             1         0           0
8            1            0          0             1         0           0
9            1            0          0             0         1           0
10           1            0          0             0         1           0
11           1            0          0             0         0           1
12           1            0          0             0         0           1
attr(,"assign")
[1] 0 1 1 1 1 1
attr(,"contrasts")
attr(,"contrasts")$groups
[1] "contr.treatment"


# Do analysis using a linear model
# using limFit() followed by eBayes() method

> fit <- lmFit(colon_data_exp, des)
> fit <- eBayes(fit)
# using toptable to see the different expression gene
> toptable(fit, coef=2)
> toptable(fit, coef=3)
> toptable(fit, coef=4)
> toptable(fit, coef=5)

> toptable(fit, coef=6)

            logFC         t      P.Value    adj.P.Val        B
8139212  6.388442  65.60093 1.119720e-13 3.619048e-09 19.99287
7918694  5.064495  44.59762 3.955354e-12 4.240144e-08 17.78992
7963427  6.126554  43.93864 4.537555e-12 4.240144e-08 17.69051
7906079 -5.324317 -42.03273 6.830113e-12 4.240144e-08 17.38849
7923547  5.250144  41.45834 7.753910e-12 4.240144e-08 17.29302
8168557 -5.885267 -41.27382 8.079409e-12 4.240144e-08 17.26189
8166611 -6.039658 -40.70440 9.183197e-12 4.240144e-08 17.16440
8088180  4.370571  38.74243 1.447748e-11 4.913646e-08 16.81113
7922337 -5.164845 -38.60976 1.494225e-11 4.913646e-08 16.78623
7991070  4.059239  38.27807 1.617830e-11 4.913646e-08 16.72337

# get 5000 differential gene each group
> diff_2 <- toptable(fit, coef=2, number=5000)
> diff_3 <- toptable(fit, coef=3, number=5000)
> diff_4 <- toptable(fit, coef=4, number=5000)
> diff_5 <- toptable(fit, coef=5, number=5000)
> diff_6 <- toptable(fit, coef=6, number=5000)

# get differential gene that there is 1.5 fold changes and adjust p value < 0.01 of each group

> diff_2 <- diff_2[abs(diff_2$logFC) >= 1.5 & diff_2$adj.P.Val < 0.01,]
> diff_3 <- diff_3[abs(diff_2$logFC) >= 1.5 & diff_3$adj.P.Val < 0.01,]
> diff_3 <- diff_3[abs(diff_3$logFC) >= 1.5 & diff_3$adj.P.Val < 0.01,]
> diff_4 <- diff_4[abs(diff_4$logFC) >= 1.5 & diff_4$adj.P.Val < 0.01,]
> diff_5 <- diff_5[abs(diff_5$logFC) >= 1.5 & diff_5$adj.P.Val < 0.01,]
> diff_6 <- diff_6[abs(diff_6$logFC) >= 1.5 & diff_6$adj.P.Val < 0.01,]

# get the row names of differential gene.

> row_2 <- rownames(diff_2)
> row_3 <- rownames(diff_3)
> row_4 <- rownames(diff_4)
> row_5 <- rownames(diff_5)
> row_6 <- rownames(diff_6)

# merge all the gene names to one dataset.
> diff_row_name <- union(row_2, row_3)
> diff_row_name <- union(diff_row_name, row_4)
> diff_row_name <- union(diff_row_name, row_5)
> diff_row_name <- union(diff_row_name, row_6)

# get all the differential expression gene value from gene expression dataset. Now colon_diff_exp contain all differential gene.

> colon_diff_exp <- colon_data_exp[match(diff_row_name, rownames(colon_data_exp)),]

# show the differential expression gene number
> nrow(colon_diff_exp)
[1] 3184

# plot heatmap to see clusters of differential gene.
> library (gplots)

> heatmap.2(as.matrix(colon_diff_exp), trace="none", col=bluered, density.info="none", labRow=F, srtCol=40)





 # k-means cluster for differential gene. Differential gene was classed into 10 group

> km <- kmeans(colon_diff_exp, 10)
> kmeansclus <- km$cluster
> names(kmeansclus) <- NULL

# merge k-means group number to gene expression dataset
> km_colon_diff <- cbind (colon_diff_exp, kmeansclus)

# sorting gene using k-means group number
> km_colon_diff <- km_colon_diff[order(km_colon_diff$kmeansclus),]
> library (lattice)
>  levelplot (as.matrix (t(km_colon_diff)), col.regions=cm.colors, aspect=1.2, xlab="", ylab="")



# using the mean data to do dot plot

> RKO <-rowMeans(colon_diff_exp[,9:10])
> CaCo2 <-rowMeans(colon_diff_exp[,1:2])
> HCT116 <-rowMeans(colon_diff_exp[,3:4])
> HT29 <-rowMeans(colon_diff_exp[,5:6])
> LIM1215 <-rowMeans(colon_diff_exp[,7:8])
> SW480 <-rowMeans(colon_diff_exp[,11:12])
> mean_diff_exp <- cbind(RKO, CaCo2, HCT116, HT29, LIM1215, SW480)
> pairs(mean_diff_exp,col=kmeansclus, pch=19, cex=0.6)


# Show the different cluster


by Changlong

Saturday, January 17, 2015

Principal Component Analysis with R


Principal component analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. The number of principal components is less than or equal to the number of original variables. There are two methods in R to perform PCA. One is princomp, another is prcomp. I think these two methods are almost same.

#Use data iris as example
> myiris <- iris[,-5]
> pca1 <- princomp (myiris)
> pca2 <- prcomp (myiris)
> pca1

Call:
princomp(x = myiris)

Standard deviations:
   Comp.1    Comp.2    Comp.3    Comp.4
2.0494032 0.4909714 0.2787259 0.1538707

 4  variables and  150 observations.

> pca2
Standard deviations:
[1] 2.0562689 0.4926162 0.2796596 0.1543862

Rotation:
                     PC1         PC2         PC3        PC4
Sepal.Length  0.36138659 -0.65658877  0.58202985  0.3154872
Sepal.Width  -0.08452251 -0.73016143 -0.59791083 -0.3197231
Petal.Length  0.85667061  0.17337266 -0.07623608 -0.4798390
Petal.Width   0.35828920  0.07548102 -0.54583143  0.7536574

#show the variables of pca result
> names (pca1)
[1] "sdev"     "loadings" "center"   "scale"    "n.obs"    "scores"   "call"   
> names (pca2)
[1] "sdev"     "rotation" "center"   "scale"    "x"      
> col <- rainbow(4, alpha=0.5)
> plot (pca1$loadings, col=col, pch=16, cex=4) # plot PCA1 and PCA2
> plot (pca2$rotation, col=col, pch=16, cex=4) # plot PCA1 and PCA2



#show loadings value
> loadings(pca1)
Loadings:
             Comp.1 Comp.2 Comp.3 Comp.4
Sepal.Length  0.361 -0.657 -0.582  0.315
Sepal.Width         -0.730  0.598 -0.320
Petal.Length  0.857  0.173        -0.480
Petal.Width   0.358         0.546  0.754

               Comp.1 Comp.2 Comp.3 Comp.4
SS loadings      1.00   1.00   1.00   1.00
Proportion Var   0.25   0.25   0.25   0.25
Cumulative Var   0.25   0.50   0.75   1.00

# show rotation value
> pca2$rotation
                     PC1         PC2         PC3        PC4
Sepal.Length  0.36138659 -0.65658877  0.58202985  0.3154872
Sepal.Width  -0.08452251 -0.73016143 -0.59791083 -0.3197231
Petal.Length  0.85667061  0.17337266 -0.07623608 -0.4798390
Petal.Width   0.35828920  0.07548102 -0.54583143  0.7536574

#also show PCA1 and PCA2

> plot (pca1$loadings[1:4, 1],pca1$loadings[1:4,2], col=col, pch=16, cex=4)

#plot PCA2 and PCA3
> plot (pca1$loadings[1:4, 2],pca1$loadings[1:4,3], col=col, pch=16, cex=4)
> plot (pca2$rotation[1:4, 2],pca2$rotation[1:4,3], col=col, pch=16, cex=4)


# plot PCA3 and PCA4
> plot (pca2$rotation[1:4, 3],pca2$rotation[1:4,4], col=col, pch=16, cex=4)
> plot (pca1$loadings[1:4, 3],pca1$loadings[1:4,4], col=col, pch=16, cex=4)



> pairs (pca1$loadings, col=col, pch=18, cex=3)

# plot all component




> pairs (pca2$rotation, col=col, pch=18, cex=3)































See almost the same.

Monday, January 12, 2015

Chipseq track analysis and visualization with “chipseq” and “Gviz” package

# packages ‘Gviz’ and ‘chipseq’
> library (Gviz)
> library (chipseq)

# Load aligned chipseq reads sample ‘cstest’ to workspace.
> data(cstest)
#use names function to see samples included in ‘cstest’
#two sample in this dataset—‘ctcf’ and ‘gfp’

> names(cstest)
[1] "ctcf" "gfp"
# display the data of cstest dataset
> cstest
GRangesList of length 2:
$ctcf
GRanges with 450096 ranges and 0 metadata columns:
           seqnames                 ranges strand
              <Rle>              <IRanges>  <Rle>
       [1]    chr10     [3012936, 3012959]      +
       [2]    chr10     [3012941, 3012964]      +
       [3]    chr10     [3012944, 3012967]      +
       [4]    chr10     [3012955, 3012978]      +
       [5]    chr10     [3012963, 3012986]      +
       ...      ...                    ...    ...
  [450092]    chr12 [121239376, 121239399]      -
  [450093]    chr12 [121245849, 121245872]      -
  [450094]    chr12 [121245895, 121245918]      -
  [450095]    chr12 [121246344, 121246367]      -
  [450096]    chr12 [121253499, 121253522]      -

...
<1 more element>
---
seqlengths:
         chr1         chr2 ...  chrY_random chrUn_random
197195432    181748087 ...     58682461      5900358

# Get ‘ctcf’ data for further analysis
> ctcf <- cstest$ctcf
# display the data of ctcf
> ctcf
GRanges with 450096 ranges and 0 metadata columns:
           seqnames                 ranges strand
              <Rle>              <IRanges>  <Rle>
       [1]    chr10     [3012936, 3012959]      +
       [2]    chr10     [3012941, 3012964]      +
       [3]    chr10     [3012944, 3012967]      +
       [4]    chr10     [3012955, 3012978]      +
       [5]    chr10     [3012963, 3012986]      +
       ...      ...                    ...    ...
  [450092]    chr12 [121239376, 121239399]      -
  [450093]    chr12 [121245849, 121245872]      -
  [450094]    chr12 [121245895, 121245918]      -
  [450095]    chr12 [121246344, 121246367]      -
  [450096]    chr12 [121253499, 121253522]      -
  ---
  seqlengths:
           chr1         chr2 ...  chrY_random chrUn_random
      197195432    181748087 ...     58682461      5900358


#calculate coverage of ctcf
> ctcf.cov <- coverage(ctcf)
#get the island of ctcf at chromosome 10
> island <- slice (ctcf.cov$chr10, lower=1)
#display the island range
> range(width(island))
[1]  24 371
# get the largest island in binding region
> largeisland <- island[width(island) == 371,]
#display the largest island information
> largeisland
Views on a 129993255-length Rle subject

views:
       start      end width
[1] 67475840 67476210   371 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 ...]
# get the track information of largest island in chromosome 10 including 5000 bp upstream of largest island and 5000 bp downstream of largest island
> chiptrack <- window (ctcf.cov$chr10, start=start(largeisland) - 5000, end=end(largeisland) + 5000)
#display the information of track
> chiptrack
integer-Rle of length 371 with 99 runs
  Lengths: 17  7  5  6  6 12  5  1 11  5  9  7 ...  7  5 23  1  6  5 10  2  1 11 10
  Values :  1  2  1  2  3  2  1  2  1  2  3  4 ...  3  2  1  2  3  2  3  4  3  2  1
#transform the RLE format data to numeric format data
> chiptrack <- as.numeric (chiptrack)

#prepare DataTrack of this island for track ploting
>  genome <- "hg19"
> dtrack <- DataTrack (data=chiptrack, start=(start(largeisland) -5000):(end(largeisland) +5000), end=(start(largeisland) -5000):(end(largeisland) +5000), chromosome="chr10", genome=genome, name="Density", background.title="brown", cex.title=1.2, background.title="white", fontcolor.title="darkblue", col.axis="darkblue", cex.axis=1)

#prepare ideogramTrack of chromosome 10, human hg19 genome
> itrack <- IdeogramTrack (genome=genome, chromosome="chr10")

#prepare GenomeAxisTrack
> atrack <- GenomeAxisTrack ()

# plot the tracks of largest binding domain of ctcf at chromosome 10

> plotTracks(list(itrack, atrack, dtrack), type="histogram", col.histogram="darkblue", fill.histogram="darkblue")



plotting result: