事情是這樣的,好不容易從文獻中找到一手SNPs數(shù)據(jù),喜大普奔……
定睛一看,沒有eaf值啊,這可咋整,后續(xù)需要用到 read_outcome_data
函數(shù),eaf值是必須的呢!
在這里停滯了好久,準(zhǔn)備放棄這部分數(shù)據(jù)了,但是又覺得很可惜,數(shù)次徘徊……
于是開始搜搜搜,然后B站還真的給我推了,柳暗花明又一村啊——
1 方法1:snp_add_eaf [孟德爾隨機化之代碼生成eaf_嗶哩嗶哩_bilibili https://www.bilibili.com/video/BV1kR4y1f7Pz/?spm_id_from=333.788.recommend_more_video.-1&vd_source=e5e36ce10569a7a5d647d18bdb42e4b5
更驚喜的事情是,原作者閃現(xiàn)評論區(qū)提供了代碼出處:
[(https://github.com/linjf15/MR_tricks/tree/main/GWAS_preprocessing)
snp_add_eaf <- function (dat, build = '37' , pop = 'EUR' ) { stopifnot(build %in % c('37' ,'38' )) stopifnot('SNP' %in % names(dat)) # Create and get a url server <- ifelse(build == '37' ,'http://grch37.rest.' ,'http://rest.' ) pop <- paste0('1000GENOMES:phase_3:' ,pop) snp_reverse_base <- function (x) { x <- stringr::str_to_upper(x) stopifnot(x %in % c('A' ,'T' ,'C' ,'G' )) switch (x,'A' ='T' ,'T' ='A' ,'C' ='G' ,'G' ='C' ) } res_tab <- lapply(1 :nrow(dat), function (i) { print(paste0('seaching for No.' , i, ' SNP' )) dat_i <- dat[i,] ext <- paste0('/variation/Homo_sapiens/' ,dat_i$SNP, '?content-type=application/json;pops=1' ) url <- paste(server, ext, sep = '' ) res <- httr::GET(url) # Converts http errors to R errors or warnings httr::stop_for_status(res) # Convert R objects from JSON res <- httr::content(res) res_pop <- jsonlite::fromJSON(jsonlite::toJSON(res))$populations # Filter query results based on population set res_pop <- try (res_pop[res_pop$population == pop,]) if ('try-error' %in % class(res_pop)) { print(paste0('There is not information for population ' ,pop)) queried_effect_allele <- 'NR' queried_other_allele <- 'NR' queried_eaf <- -1 } else { if (nrow(res_pop)==0 ) { print(paste0('There is not information for population ' ,pop)) queried_effect_allele <- 'NR' queried_other_allele <- 'NR' queried_eaf <- -1 } else { queried_effect_allele <- res_pop[1 ,'allele' ][[1 ]] queried_other_allele <- res_pop[2 ,'allele' ][[1 ]] queried_eaf <- res_pop[1 ,'frequency' ][[1 ]] } } effect_allele <- ifelse('effect_allele.exposure' %in % names(dat), dat_i$effect_allele.exposure, dat_i$effect_allele) other_allele <- ifelse('effect_allele.exposure' %in % names(dat), dat_i$other_allele.exposure, dat_i$other_allele) if ('effect_allele.exposure' %in % names(dat)) { name_output <- unique(c(names(dat), 'eaf.exposure' ,'reliability.exposure' )) } else { name_output <- unique(c(names(dat), 'eaf' ,'reliability.exposure' )) } len_effect_allele <- nchar(effect_allele) len_other_allele <- nchar(other_allele) if (len_effect_allele==1 &len_other_allele==1 ) { if ((queried_effect_allele==effect_allele & queried_other_allele==other_allele)| (queried_effect_allele==other_allele & queried_other_allele==effect_allele)) { dat_i$eaf.exposure <- ifelse(effect_allele == queried_effect_allele, queried_eaf, 1 -queried_eaf) dat_i$eaf <- dat_i$eaf.exposure dat_i$reliability.exposure <- 'high' } else { r_queried_effect_allele <- snp_reverse_base(queried_effect_allele) r_queried_other_allele <- snp_reverse_base(queried_other_allele) if ((r_queried_effect_allele==effect_allele & r_queried_other_allele==other_allele)| (r_queried_effect_allele==other_allele & r_queried_other_allele==effect_allele)) { dat_i$eaf.exposure <- ifelse(effect_allele == r_queried_effect_allele, queried_eaf, 1 -queried_eaf) dat_i$eaf <- dat_i$eaf.exposure dat_i$reliability.exposure <- 'high' } else { dat_i$eaf.exposure <- ifelse(effect_allele == queried_effect_allele, queried_eaf, 1 -queried_eaf) dat_i$eaf <- dat_i$eaf.exposure dat_i$reliability.exposure <- 'low' } } } else { # To identify the potential DEL/ INS short_allele <- ifelse(len_effect_allele==1 , effect_allele, other_allele) short_allele_eaf <- ifelse(short_allele == queried_effect_allele, queried_eaf, 1 -queried_eaf) dat_i$eaf.exposure <- ifelse(effect_allele == short_allele, short_allele_eaf, 1 -short_allele_eaf) dat_i$eaf <- dat_i$eaf.exposure dat_i$reliability.exposure <- 'low' } dat_i[name_output] }) return (do.call(rbind, res_tab)) }
如果能確定數(shù)據(jù)的參考基因組版本,還可以自定義哦~
2 方法2:get_eaf_from_1000G [孟德爾隨機化, 補eaf, 本地與在線_嗶哩嗶哩_bilibili]
[(https://www.bilibili.com/video/BV1Dh411j7yP/?spm_id_from=333.999.0.0&vd_source=e5e36ce10569a7a5d647d18bdb42e4b5)
這個up主介紹了兩種方法,第一種如上所述,第二種來自于另外一個團隊【代碼作者:廣州醫(yī)科大學(xué)第一臨床學(xué)院周浩彬 ,第二臨床學(xué)院謝治鑫】,用起來都非常方便。
說明文檔:[Get_MR/Get_MR1.0 help.md at main · HaobinZhou/Get_MR · GitHub]
[(https://github.com/HaobinZhou/Get_MR/blob/main/Get_MR1.0 help.md)
get_eaf_from_1000G<-function (dat,path,type='exposure' ){ corrected_eaf_expo<-function (data_MAF){ effect=data_MAF$effect_allele.exposure other=data_MAF$other_allele.exposure A1=data_MAF$A1 A2=data_MAF$A2 MAF_num=data_MAF$MAF EAF_num=1 -MAF_num harna<-is.na(data_MAF$A1) harna<-data_MAF$SNP[which(harna==T )] cor1<-which(data_MAF$effect_allele.exposure !=data_MAF$A1) data_MAF$eaf.exposure=data_MAF$MAF data_MAF$type='raw' data_MAF$eaf.exposure[cor1]=EAF_num[cor1] data_MAF$type[cor1]='corrected' cor2<-which(data_MAF$other_allele.exposure ==data_MAF$A1) cor21<-setdiff(cor2,cor1) cor12<-setdiff(cor1,cor2) error<-c(cor12,cor21) data_MAF$eaf.exposure[error]=NA data_MAF$type[error]='error' data_MAF<-list(data_MAF=data_MAF,cor1=cor1,harna=harna,error=error) return (data_MAF) } corrected_eaf_out<-function (data_MAF){ effect=data_MAF$effect_allele.outcome other=data_MAF$other_allele.outcome A1=data_MAF$A1 A2=data_MAF$A2 MAF_num=data_MAF$MAF EAF_num=1 -MAF_num harna<-is.na(data_MAF$A1) harna<-data_MAF$SNP[which(harna==T )] cor1<-which(data_MAF$effect_allele.outcome !=data_MAF$A1) data_MAF$eaf.outcome=data_MAF$MAF data_MAF$type='raw' data_MAF$eaf.outcome[cor1]=EAF_num[cor1] data_MAF$type[cor1]='corrected' cor2<-which(data_MAF$other_allele.outcome ==data_MAF$A1) cor21<-setdiff(cor2,cor1) cor12<-setdiff(cor1,cor2) error<-c(cor12,cor21) data_MAF$eaf.outcome[error]=NA data_MAF$type[error]='error' data_MAF<-list(data_MAF=data_MAF,cor1=cor1,harna=harna,error=error) return (data_MAF) } if (type=='exposure' && (is.na(dat$eaf.exposure[1 ])==T || is.null(dat$eaf.exposure)==T )){ r<-nrow(dat) setwd(path) MAF<-fread('fileFrequency.frq' ,header = T ) dat<-merge(dat,MAF,by.x = 'SNP' ,by.y = 'SNP' ,all.x = T ) dat<-corrected_eaf_expo(dat) cor1<-dat$cor1 harna<-dat$harna error<-dat$error dat<-dat$data_MAF print(paste0('一共有' ,(r-length(harna)-length(error)),'個SNP成功匹配EAF,占比' ,(r-length(harna)-length(error))/r*100 ,'%' )) print(paste0('一共有' ,length(cor1),'個SNP是major allele,EAF被計算為1-MAF,在成功匹配數(shù)目中占比' ,length(cor1)/(r-length(harna)-length(error))*100 ,'%' )) print(paste0('一共有' ,length(harna),'個SNP在1000G中找不到,占比' ,length(harna)/r*100 ,'%' )) print(paste0('一共有' ,length(error),'個SNP在輸入數(shù)據(jù)與1000G中效應(yīng)列與參照列,將剔除eaf,占比' ,length(error)/r*100 ,'%' )) print('輸出數(shù)據(jù)中的type列說明:' ) print('raw:EAF直接等于1000G里的MAF數(shù)值,因為效應(yīng)列是minor allele' ) print('corrected:EAF等于1000G中1-MAF,因為效應(yīng)列是major allele' ) print('error:輸入數(shù)據(jù)與1000G里面提供的數(shù)據(jù)完全不一致,比如這個SNP輸入的效應(yīng)列是C,參照列是G,但是1000G提供的是A-T,這種情況下,EAF會被清空(NA),當(dāng)成匹配失敗' ) return (dat) } if (type=='outcome' && (is.na(dat$eaf.outcome[1 ])==T || is.null(dat$eaf.outcome)==T )){ r<-nrow(dat) setwd(path) MAF<-fread('fileFrequency.frq' ,header = T ) dat<-merge(dat,MAF,by.x = 'SNP' ,by.y = 'SNP' ,all.x = T ) dat<-corrected_eaf_out(dat) cor1<-dat$cor1 harna<-dat$harna error<-dat$error dat<-dat$data_MAF print(paste0('一共有' ,(r-length(harna)-length(error)),'個SNP成功匹配EAF,占比' ,(r-length(harna)-length(error))/r*100 ,'%' )) print(paste0('一共有' ,length(cor1),'個SNP是major allele,EAF被計算為1-MAF,在成功匹配數(shù)目中占比' ,length(cor1)/(r-length(harna)-length(error))*100 ,'%' )) print(paste0('一共有' ,length(harna),'個SNP在1000G找不到,占比' ,length(harna)/r*100 ,'%' )) print(paste0('一共有' ,length(error),'個SNP在輸入數(shù)據(jù)與1000G中效應(yīng)列與參照列,將剔除eaf,占比' ,length(error)/r*100 ,'%' )) print('輸出數(shù)據(jù)中的type列說明:' ) print('raw:EAF直接等于1000G里的MAF數(shù)值,因為效應(yīng)列是minor allele' ) print('corrected:EAF等于1000G中1-MAF,因為效應(yīng)列是major allele' ) print('error:輸入數(shù)據(jù)與1000G里面提供的數(shù)據(jù)完全不一致,比如這個SNP輸入的效應(yīng)列是C,參照列是G,但是1000G提供的是A-T,這種情況下,EAF會被清空(NA),當(dāng)成匹配失敗' ) return (dat) } else {return (dat)} }
運行這個函數(shù)需要注意,如果你的數(shù)據(jù)是自己整理的本地數(shù)據(jù),那就要提前將數(shù)據(jù)整理一下:
exp_dat <- read_exposure_data( filename = './Mediators/newcytokines_exposure.txt' , ##你的數(shù)據(jù) clump = FALSE , sep= '\t' , snp_col = 'rsid' , beta_col = 'BETA_exposure' , se_col = 'SE_exposure' , effect_allele_col ='EA_exposure' , other_allele_col = 'NEA_exposure' , eaf_col = 'eaf.exposure' , pval_col = 'P' ) head(exp_dat)# 從1000G的MAF文件中提取EAF并將其與輸入數(shù)據(jù)匹配 newdat <- get_eaf_from_1000G(exp_dat, 'D:/09-MY_try/MR_Aanalyses/' , type = 'exposure' ) ##這里的路徑是包含fileFrequency.frq這個文件的文件夾!
fileFrequency.frq
文件來源: