Diving into Genetics and Genomics: read in all the files in a folder to R

Sunday, February 7, 2016

read in all the files in a folder to R

Ming Tang

February 7, 2016

read in all the files in a folder with a loop

see a post
I downloaded some broadpeak data from UCSC and put them in a folder /Users/mtang1/projects/ENCODE_ChIPseq/data/

I cut only first four columns of those files and How can I read in all the bed files as GRanges?

# the base of the path
cancer.base<- "/Users/mtang1/projects/ENCODE_ChIPseq/data"

# list.files will print out all the files in the folder ending with broadPeak
cancer.lines.peak.files<- list.files(cancer.base, pattern="*.broadPeak$", full.names=TRUE)
cancer.lines.peak.files

## [1] "/Users/mtang1/projects/ENCODE_ChIPseq/data/wgEncodeHaibTfbsEcc1Pol2V0416102Dm002p1hPkRep2.broadPeak"
## [2] "/Users/mtang1/projects/ENCODE_ChIPseq/data/wgEncodeHaibTfbsHct116Pol24h8V0416101PkRep2.broadPeak"   
## [3] "/Users/mtang1/projects/ENCODE_ChIPseq/data/wgEncodeHaibTfbsHelas3Pol2Pcr1xPkRep2.broadPeak"         
## [4] "/Users/mtang1/projects/ENCODE_ChIPseq/data/wgEncodeHaibTfbsHepg2Pol2Pcr2xPkRep2.broadPeak"          
## [5] "/Users/mtang1/projects/ENCODE_ChIPseq/data/wgEncodeHaibTfbsK562Pol2V0416101PkRep2.broadPeak"        
## [6] "/Users/mtang1/projects/ENCODE_ChIPseq/data/wgEncodeHaibTfbsPanc1Pol24h8V0416101PkRep1.broadPeak"    
## [7] "/Users/mtang1/projects/ENCODE_ChIPseq/data/wgEncodeHaibTfbsU87Pol24h8V0416101PkRep2.broadPeak"

# using a for loop to read in all the files and assign the cell-line name as the varaible name of the GRanges
# import function in rtracklayer will read in bed file as GRanges

library(rtracklayer)
for (file in cancer.lines.peak.files){
        # extract the cell line name
        cell.line.name <- gsub(".+wgEncodeHaibTfbs(.+)Pol2.+", "\\1", file)
        print (sprintf("reading in %s peak file", cell.line.name))
        file.handler<- file
        print (cell.line.name)
        # assign the cell-line name as the varaible name of the GRanges
        assign(cell.line.name, import(file.handler, format = "BED"))
        
}

## [1] "reading in Ecc1 peak file"
## [1] "Ecc1"
## [1] "reading in Hct116 peak file"
## [1] "Hct116"
## [1] "reading in Helas3 peak file"
## [1] "Helas3"
## [1] "reading in Hepg2 peak file"
## [1] "Hepg2"
## [1] "reading in K562 peak file"
## [1] "K562"
## [1] "reading in Panc1 peak file"
## [1] "Panc1"
## [1] "reading in U87 peak file"
## [1] "U87"

Ecc1

## GRanges object with 35977 ranges and 1 metadata column:
##           seqnames                 ranges strand   |        name
##              <Rle>              <IRanges>  <Rle>   | <character>
##       [1]     chr1       [713816, 714479]      *   |       peak1
##       [2]     chr1       [762384, 763181]      *   |       peak2
##       [3]     chr1       [859112, 859462]      *   |       peak3
##       [4]     chr1       [878092, 878373]      *   |       peak4
##       [5]     chr1       [878472, 878807]      *   |       peak5
##       ...      ...                    ...    ... ...         ...
##   [35973]     chrX [154299651, 154300068]      *   |   peak35973
##   [35974]     chrX [154444181, 154444945]      *   |   peak35974
##   [35975]     chrX [154493584, 154493918]      *   |   peak35975
##   [35976]     chrX [154841987, 154842828]      *   |   peak35976
##   [35977]     chrX [155110716, 155111326]      *   |   peak35977
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths

U87

## GRanges object with 42355 ranges and 1 metadata column:
##           seqnames                 ranges strand   |        name
##              <Rle>              <IRanges>  <Rle>   | <character>
##       [1]     chr1       [ 10081,  10449]      *   |       peak1
##       [2]     chr1       [713834, 714370]      *   |       peak2
##       [3]     chr1       [752419, 752810]      *   |       peak3
##       [4]     chr1       [762601, 763171]      *   |       peak4
##       [5]     chr1       [842211, 842561]      *   |       peak5
##       ...      ...                    ...    ... ...         ...
##   [42351]     chrX [154996980, 154997243]      *   |   peak42351
##   [42352]     chrX [154997284, 154997491]      *   |   peak42352
##   [42353]     chrX [155110793, 155111216]      *   |   peak42353
##   [42354]     chrX [155117874, 155118167]      *   |   peak42354
##   [42355]     chrX [155259636, 155260008]      *   |   peak42355
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths

Use lapply

It is awkward to use for loop in R, let’s just use lapply function. see a post

# full path to every file

## lapply will use import to read in every bed file and keep them in a list.
data<- lapply(cancer.lines.peak.files, import, format = "BED")
names(data)<- gsub(".+wgEncodeHaibTfbs(.+)Pol2.+", "\\1", cancer.lines.peak.files)

data$U87

## GRanges object with 42355 ranges and 1 metadata column:
##           seqnames                 ranges strand   |        name
##              <Rle>              <IRanges>  <Rle>   | <character>
##       [1]     chr1       [ 10081,  10449]      *   |       peak1
##       [2]     chr1       [713834, 714370]      *   |       peak2
##       [3]     chr1       [752419, 752810]      *   |       peak3
##       [4]     chr1       [762601, 763171]      *   |       peak4
##       [5]     chr1       [842211, 842561]      *   |       peak5
##       ...      ...                    ...    ... ...         ...
##   [42351]     chrX [154996980, 154997243]      *   |   peak42351
##   [42352]     chrX [154997284, 154997491]      *   |   peak42352
##   [42353]     chrX [155110793, 155111216]      *   |   peak42353
##   [42354]     chrX [155117874, 155118167]      *   |   peak42354
##   [42355]     chrX [155259636, 155260008]      *   |   peak42355
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths

Diving into Genetics and Genomics

My github papge

Sunday, February 7, 2016

read in all the files in a folder to R

Ming Tang

February 7, 2016

read in all the files in a folder with a loop

Use lapply

No comments:

Post a Comment

Labels

My Blog List