R语言做基因表达量和变异位点的关联分析eQTL

用户7010445

发布于 2024-05-09 14:28:28

810

发布于 2024-05-09 14:28:28

文章被收录于专栏：小明的数据分析笔记本小明的数据分析笔记本

参考链接

http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/runit.html

表达数据来源于论文

Population level gene expression can repeatedly link genes to functions in maize

https://www.biorxiv.org/content/10.1101/2023.10.31.565032v1

数据下载链接 https://doi.org/10.6084/m9.figshare.24470758.v1

变异数据来源于论文

A common resequencing-based genetic marker data set for global maize diversity

https://onlinelibrary.wiley.com/doi/10.1111/tpj.16123

数据下载链接 https://datadryad.org/stash/dataset/doi:10.5061/dryad.bnzs7h4f1

这篇论文对应的代码 https://github.com/mgrzy/Maize_Genetic_Variants_v5/tree/main

论文用到的参考基因组

B73_RefGen_V5 maize reference genome

De novo assembly, annotation, and comparative analysis of 26 diverse maize genomes

https://www.science.org/doi/full/10.1126/science.abg5289

参考基因组下载链接https://download.maizegdb.org/Zm-B73-REFERENCE-NAM-5.0/

变异数据的处理

只下载了8 9 10 号染色体数据，只保留插入缺失变异，只保留了100个样本，最小等位基因频率0.05

使用 VCF2PCACluster 这个软件计算PCA , 这个软件只计算snp ，需要自己写脚本editRefAlt.py修改vcf文件里的ref和alt列

自己写脚本convertVcfTo012Matrix.py把vcf文件转换成 0 1 2 矩阵

表达量数据处理

8 9 10 号染色体的基因，只保留100个样本。在>=80个样本中 TPM > 0.05 的基因保留，最后只保留了4000多个基因，标准化，然后peer 计算隐藏因子 run_peer.R

最终的输入数据

R语言里的代码

library(MatrixEQTL)
SNP_file_name<-"eQTL_maize/chr.onlyInDels.impute.maf.recode.012.matrix"
snps = SlicedData$new();
snps$fileDelimiter = "\t";      # the TAB character
snps$fileOmitCharacters = "NA"; # denote missing values;
snps$fileSkipRows = 1;          # one row of column labels
snps$fileSkipColumns = 1;       # one column of row labels
snps$fileSliceSize = 2000;      # read file in slices of 2,000 rows
snps$LoadFile(SNP_file_name);

expression_file_name<-"eQTL_maize/quant.norm.tpm.tsv"

gene = SlicedData$new();
gene$fileDelimiter = "\t";      # the TAB character
gene$fileOmitCharacters = "NA"; # denote missing values;
gene$fileSkipRows = 1;          # one row of column labels
gene$fileSkipColumns = 1;       # one column of row labels
gene$fileSliceSize = 2000;      # read file in slices of 2,000 rows
gene$LoadFile(expression_file_name);

covariates_file_name<-"eQTL_maize/covariates.tsv"

cvrt = SlicedData$new();
cvrt$fileDelimiter = "\t";      # the TAB character
cvrt$fileOmitCharacters = "NA"; # denote missing values;
cvrt$fileSkipRows = 1;          # one row of column labels
cvrt$fileSkipColumns = 1;       # one column of row labels
cvrt$LoadFile(covariates_file_name);

snps_location_file_name<-"eQTL_maize/snpPos.tsv"
gene_location_file_name<-"eQTL_maize/genePos.tsv"

snpspos = read.table(snps_location_file_name, header = TRUE, stringsAsFactors = FALSE);
genepos = read.table(gene_location_file_name, header = TRUE, stringsAsFactors = FALSE);

genepos %>% head()

cisDist<-5000
pvOutputThreshold_cis<-0.1
pvOutputThreshold_tra<-0.00000000001

errorCovariance<-numeric()

useModel<-modelLINEAR

output_file_name_cis = tempfile();
output_file_name_tra = tempfile();

me = Matrix_eQTL_main(
  snps = snps,
  gene = gene,
  cvrt = cvrt,
  output_file_name     = output_file_name_tra,
  pvOutputThreshold     = pvOutputThreshold_tra,
  useModel = useModel,
  errorCovariance = errorCovariance,
  verbose = TRUE,
  output_file_name.cis = output_file_name_cis,
  pvOutputThreshold.cis = pvOutputThreshold_cis,
  snpspos = snpspos,
  genepos = genepos,
  cisDist = cisDist,
  pvalue.hist = "qqplot",
  min.pv.by.genesnp = FALSE,
  noFDRsaveMemory = FALSE);


unlink(output_file_name_tra);
unlink(output_file_name_cis);

me$cis$eqtls %>% 
  left_join(snpspos,by=c("snps"="svid")) %>% 
  ggplot(aes(x=pos,y=-log10(FDR)))+
  geom_point(aes(color=chr))+
  facet_wrap(~chr,nrow = 1)+
  theme_bw()+
  theme(panel.grid = element_blank(),
        panel.border = element_blank())

me$cis$eqtls %>% head()
me$cis$eqtls %>% pull(gene) %>% unique() %>% length()
genepos %>% dim()
read_tsv("eQTL_maize/quant.norm.tpm.tsv") %>% dim()

me$cis$eqtls %>% 
  filter(-log10(FDR)>=25)
genepos %>% filter(ID=="Zm00001eb347950")