#annotate_cgi.r

# The methods in this file provide the functionality of 
# annotating genomic intervals with evolutionary classification
# of CpG islands

#load bioconductor library
source("http://bioconductor.org/biocLite.R")
library(IRanges)


# Function: annotate_cg
#
# Input: 
#   data - dataframe containing the columns: Chrom, Loc 
#	Chrom - the chromosome number (1-22)
#	Loc - the location of the CpG
#   file - CpG classification file location
#
# Output: data including an additional column of classification.

annotate_cg <- function(data, file="T1.txt") {

#validate data format
  if (dim(data)[1]==0) {
	print("Empty data file")
	return()
  }
  if (length(data$Chrom)==0) {
	print("Missing Chrom column in data")
	return()
  }
  if (length(data$Loc)==0) {
	print("Missing Loc column in data")
	return()
  }
  print(paste("Data file format ok, processing", dim(data)[1], "intervals..."))

#load CpG classification file (T1.txt)
  t1 <- read.delim(file)
  if (dim(t1)[1]==0) {
	print("Error loading CpG classification file")
	return()
  }
  print("Loaded CpG classification file successfully")

 processed_data <- c()
#iterating over intervals in data file
 for (i in 1:dim(data)[1]) {
 	interval <- data[i,]
	cgi <- t1[(t1$Chrom==interval$Chrom & 
		   t1$Start<=interval$Loc & t1$End>=interval$Loc),]
	if (dim(cgi)[1]==0) { 
		#no classification
		interval$CpG.classification <- NA
	} else {
		interval$CpG.classification <- cgi$CpG.classification	
	}
	processed_data <- rbind(processed_data, interval)
 }
 return(as.data.frame(processed_data))
}


# Function: filter_cg_intervals
#
# Input: 
#   data - dataframe containing the columns: Chrom, Start, End
#       Chrom - the chromosome number (1-22)
#       Start - the location of the first loci of the interval
#       End - the location of the last loci of the interval
#   filter - select (0) or remove (1)
#   cpg_class - the CpG classification to select or remove (according to filter), 
#	0 - Hypo-deaminated annotated CpGs
#	1 - BGC annotated CpGs
#   file - CpG classification file location
#
# Output: new dataframe containing all interval segments that overlap
#         with the filtered classification
#
# Notes: Requires boiconductor library IRanges
#        Number of intervals may increase.


filter_cg_intervals <- function(data, filter=0, cpg_class=0, file="T1.txt") {
#validate data format
  if (dim(data)[1]==0) {
        print("Empty data file")
        return()
  }
  if (length(data$Chrom)==0) {
        print("Missing Chrom column in data")
        return()
  }
  if (length(data$Start)==0) {
        print("Missing Start column in data")
        return()
  }
  if (length(data$End)==0) {
        print("Missing End column in data")
        return()
  }
  print(paste("Data file format ok, processing", dim(data)[1], "intervals..."))

#load CpG classification file (T1.txt)
  t1 <- read.delim(file)
  if (dim(t1)[1]==0) {
        print("Error loading CpG classification file")
        return()
  }
  print("Loaded CpG classification file successfully")
 #checking filter
 if (cpg_class==0) {
	class_name = "hypo-deaminated island"
 } else {
	class_name = "BGC island"
 }
 if (filter==0) {
	 print(paste("Selecting intervals with CpG classification:",class_name))
 } else {
	 print(paste("Filtering out intervals with CpG classification:",class_name))
 } 
 processed_data <- c()
#iterating over all chromosomes
 for (chr in 1:22) {
        data_chr <- data[data$Chrom==chr,]
	t1_chr <- t1[t1$Chrom==chr & t1$CpG.classification==class_name,]
	#converting to IRanges format
	d_ranges <- IRanges(start=data_chr$Start, end=data_chr$End, width=NULL)
	t_ranges <- IRanges(start=t1_chr$Start, end=t1_chr$End, width=NULL)
	if (filter==0) {
		result <- as.data.frame(intersect(d_ranges, t_ranges))
	} else {
		result <- as.data.frame(setdiff(d_ranges, t_ranges))
	}
	if (dim(result)[1]>0) {
		result_chr <- cbind(Chrom=c(chr),Start=result$start, End=result$end)
        	processed_data <- rbind(processed_data, result_chr)
	}
 }
 return(as.data.frame(processed_data))
}