http://www.dataschool.io/dplyr-tutorial-for-faster-data-manipulation-in-r/
http://stat545-ubc.github.io/block009_dplyr-intro.html
http://stat545-ubc.github.io/bit001_dplyr-cheatsheet.html
http://gettinggeneticsdone.blogspot.com/2014/08/do-your-data-janitor-work-like-boss.html
I followed the example in the early edition of Bioinformatics Data Skills on the dplyr part and put a gist below. The example used dplyr to manipulate and summarize the human annotation gff file downloaded here http://useast.ensembl.org/info/data/ftp/index.html.
I downloaded the ensemble 74 build which is different from the 75 build in the example of the book.
I believe the gencode v19 annotation file is based on ensemble 74.
I also included comments in the R code describing how to do the same job with linux command line.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
setwd("/home/tommy/annotations/human/ensemble/") | |
# set the colClasses for faster reading in the data | |
gtf_cols <- c(seqname="factor", source="factor", feature="factor", | |
start="integer", end="integer", score="character", | |
strand="factor", frame="factor", attribute="character") | |
hs_gtf <- read.delim('Homo_sapiens.GRCh37.74.gtf.gz', header=FALSE, | |
col.names=names(gtf_cols), comment.char="#") | |
# load all the data into memory, this gz file is around 28Mb, if gunzip it will be around 900Mb | |
# very slow, it took me 2-3 mins to finish loading.....try data.table. usually I work with gtf file | |
# on command line. of course, specialized tools such as python library HTSeq and gffutils, bioawk etc are | |
# also existed there for more complicated analysis. | |
head(hs_gtf) | |
dim(hs_gtf) # how many rows and columns 2 million rows | |
hs_df<- tbl_df(hs_gtf) #convert to a local dataframe, when you print, it will not | |
# print out all the rows. similar to head() | |
head(hs_df$attribute, n=3) | |
extractGeneID<- function(x) gsub("gene_id ([^;]+);.*", "\\1", x) | |
# on linux command (it is all about writing regular expression!) | |
# generally,awk does not support back reference http://ubuntuforums.org/showthread.php?t=2198616 | |
# but gensub function supports in gawk (Gnu awk) | |
# ex. $ echo 'noon' | awk '{print gensub(/(.)(.)/,"\\2\\1","g")}' output:onno | |
# zcat Homo_sapiens.GRCh37.74.gtf.gz| head |awk -F'\t' '{ a=gensub(/gene_id "([^;]+)";.*/, "\\1", "g",$9); print $0"\t"a}' | |
# the gene_id has quotes | |
# use sed instead: | |
# zcat Homo_sapiens.GRCh37.74.gtf.gz | awk -F '\t' '{print $9}' | sed 's/gene_id "\([^;]\+\)";.*/\1/' | |
# append this column to the original file: | |
# paste <(zcat Homo_sapiens.GRCh37.74.gtf.gz) <(zcat Homo_sapiens.GRCh37.74.gtf.gz | awk -F '\t' '{print $9}' | sed 's/gene_id "\([^;]\+\)";.*/\1/') | |
# this is a long linux command, I constructed bit by bit, make sure every step works. | |
# use mutate from dplyr to add a column based on original column | |
# add a column with GeneID | |
hs_df<- mutate(hs_df, gene_id = extractGeneID(attribute)) # takes several seconds | |
# select only the columns we are interested | |
select(hs_df, seqname, feature, start, end, gene_id) | |
select(hs_df, seqname, source, feature, start, end, gene_id) | |
table(hs_df$source) # on linux command zcat Homo_sapiens.GRCh37.74.gtf.gz | datamash -s -g 2 count 2 | |
# use filer to keep the rows with source == "protein_coding" | |
pc_df<- filter(hs_df, source == "protein_coding") #fast | |
pc_df # zcat Homo_sapiens.GRCh37.74.gtf.gz | awk '$2=="protein_coding"'| wc -l returns 1667560 | |
# multiple filter rules | |
filter(hs_df, feature == "exon", strand == "+") #ensemble 74 is different from ensemble 75 in which has a feature "gene" | |
# and comment lines | |
# use %>% (pronounced as "then") to get the exon number of each gene | |
# SQL can do the same job, see my previous post here http://crazyhottommy.blogspot.com/2013/11/mysql-rocks.html | |
# extract exon number | |
extractExonNumber <- function(x) as.integer(gsub(".*exon_number (\\d+);.*", "\\1", x)) | |
pce_df <- filter(hs_df, source == "protein_coding", feature == "exon") %>% | |
mutate(exon_number=extractExonNumber(attribute)) | |
pce_df %>% filter(feature=="exon") %>% | |
group_by(gene_id) %>% | |
summarise(num_exons = max(exon_number)) %>% | |
arrange(desc(num_exons)) | |
# Source: local data frame [22,642 x 2] | |
# gene_id num_exons | |
# 1 ENSG00000155657 363 | |
# 2 ENSG00000183091 182 | |
# 3 ENSG00000131018 147 | |
# 4 ENSG00000114270 118 | |
# 5 ENSG00000054654 116 | |
# 6 ENSG00000154358 116 | |
# 7 ENSG00000203832 111 | |
# 8 ENSG00000127481 107 | |
# 9 ENSG00000143341 107 | |
# 10 ENSG00000198626 107 |