#annotate_cgi.r # The methods in this file provide the functionality of # annotating genomic intervals with evolutionary classification # of CpG islands #load bioconductor library source("http://bioconductor.org/biocLite.R") library(IRanges) # Function: annotate_cg # # Input: # data - dataframe containing the columns: Chrom, Loc # Chrom - the chromosome number (1-22) # Loc - the location of the CpG # file - CpG classification file location # # Output: data including an additional column of classification. annotate_cg <- function(data, file="T1.txt") { #validate data format if (dim(data)[1]==0) { print("Empty data file") return() } if (length(data$Chrom)==0) { print("Missing Chrom column in data") return() } if (length(data$Loc)==0) { print("Missing Loc column in data") return() } print(paste("Data file format ok, processing", dim(data)[1], "intervals...")) #load CpG classification file (T1.txt) t1 <- read.delim(file) if (dim(t1)[1]==0) { print("Error loading CpG classification file") return() } print("Loaded CpG classification file successfully") processed_data <- c() #iterating over intervals in data file for (i in 1:dim(data)[1]) { interval <- data[i,] cgi <- t1[(t1$Chrom==interval$Chrom & t1$Start<=interval$Loc & t1$End>=interval$Loc),] if (dim(cgi)[1]==0) { #no classification interval$CpG.classification <- NA } else { interval$CpG.classification <- cgi$CpG.classification } processed_data <- rbind(processed_data, interval) } return(as.data.frame(processed_data)) } # Function: filter_cg_intervals # # Input: # data - dataframe containing the columns: Chrom, Start, End # Chrom - the chromosome number (1-22) # Start - the location of the first loci of the interval # End - the location of the last loci of the interval # filter - select (0) or remove (1) # cpg_class - the CpG classification to select or remove (according to filter), # 0 - Hypo-deaminated annotated CpGs # 1 - BGC annotated CpGs # file - CpG classification file location # # Output: new dataframe containing all interval segments that overlap # with the filtered classification # # Notes: Requires boiconductor library IRanges # Number of intervals may increase. filter_cg_intervals <- function(data, filter=0, cpg_class=0, file="T1.txt") { #validate data format if (dim(data)[1]==0) { print("Empty data file") return() } if (length(data$Chrom)==0) { print("Missing Chrom column in data") return() } if (length(data$Start)==0) { print("Missing Start column in data") return() } if (length(data$End)==0) { print("Missing End column in data") return() } print(paste("Data file format ok, processing", dim(data)[1], "intervals...")) #load CpG classification file (T1.txt) t1 <- read.delim(file) if (dim(t1)[1]==0) { print("Error loading CpG classification file") return() } print("Loaded CpG classification file successfully") #checking filter if (cpg_class==0) { class_name = "hypo-deaminated island" } else { class_name = "BGC island" } if (filter==0) { print(paste("Selecting intervals with CpG classification:",class_name)) } else { print(paste("Filtering out intervals with CpG classification:",class_name)) } processed_data <- c() #iterating over all chromosomes for (chr in 1:22) { data_chr <- data[data$Chrom==chr,] t1_chr <- t1[t1$Chrom==chr & t1$CpG.classification==class_name,] #converting to IRanges format d_ranges <- IRanges(start=data_chr$Start, end=data_chr$End, width=NULL) t_ranges <- IRanges(start=t1_chr$Start, end=t1_chr$End, width=NULL) if (filter==0) { result <- as.data.frame(intersect(d_ranges, t_ranges)) } else { result <- as.data.frame(setdiff(d_ranges, t_ranges)) } if (dim(result)[1]>0) { result_chr <- cbind(Chrom=c(chr),Start=result$start, End=result$end) processed_data <- rbind(processed_data, result_chr) } } return(as.data.frame(processed_data)) }