# 使用 Annovar 注释 VCF 文件
变异注释是基因组学研究中的关键步骤,帮助研究人员理解变异的功能意义。Annovar (ANNOtate VARiation) 是一个广泛使用的工具,用于对变异进行功能注释。
# 安装 Annovar
下载 Annovar 需要在 此处 用教育邮箱注册,然后邮箱中会收到下载链接。
如果你没有教育邮箱
这里 或许是一个可以下载的链接
解压压缩包
tar -zxvf annovar.latest.tar.gz |
根据自己的习惯看要不要配置环境变量,此处以不配置环境变量为例
Annovar 的可执行文件存放在 ~/annovar/
下,因此后续使用都要带上这个前缀,为绝对路径调用
# 准备数据
我的目录结构如下:
. | |
└── ── 16794B/ | |
└── ── clean_data/ | |
└── ── gvcf/ | |
└── ── result_alignment/ | |
└── ── result_variation/ | |
└── ── cnv/ | |
└── ── indel/ | |
└── ── snp/ | |
└── ── sv/ | |
└── ── 16794N/ | |
└── ── clean_data/ | |
└── ── gvcf/ | |
└── ── result_alignment/ | |
└── ── result_variation/ | |
└── ── cnv/ | |
└── ── indel/ | |
└── ── snp/ | |
└── ── sv/ | |
... |
其中需要注释的 VCF 文件存在于每个样本的 gvcf/
目录下,是一个 .g.vcf.gz
文件
# 简单的示例
# 步骤 1:转换 VCF 文件
Annovar 使用其专有的输入格式,因此首先需要将 VCF 文件转换为 Annovar 的输入格式。例如对 16794N.g.vcf.gz
文件进行转换:
perl ~/annovar/convert2annovar.pl -format vcf4 16794N/gvcf/16794N.g.vcf.gz > 16794N.avinput |
这将生成一个 sample.avinput
文件,作为后续注释的输入。
# 步骤 2:下载 Annovar 数据库
在进行注释之前,需要下载相关的数据库。所有支持的数据库可以在 官网 上找到:
例如,我使用 RefGene, cytoBand, 1000g2015aug, dbnsfp42c, cg69, ljb26_all, snp138 数据库进行注释:
RefGene 数据库已经在 humandb/
目录下,不需要下载
perl ~/annovar/annotate_variation.pl --downdb --buildver hg19 cytoBand humandb/ | |
perl ~/annovar/annotate_variation.pl --downdb --webfrom annovar --buildver hg19 1000g2015aug humandb/ | |
perl ~/annovar/annotate_variation.pl --downdb --webfrom annovar --buildver hg19 dbnsfp42c humandb/ | |
perl ~/annovar/annotate_variation.pl --downdb --webfrom annovar --buildver hg19 cg69 humandb/ | |
perl ~/annovar/annotate_variation.pl --downdb --webfrom annovar --buildver hg19 ljb26_all humandb/ | |
perl ~/annovar/annotate_variation.pl --downdb --webfrom annovar --buildver hg19 snp138 humandb/ |
# 步骤 3:变异注释
下载完数据库后,可以开始注释 VCF 文件中的变异。例如对样本 16794N
进行注释:
perl ~/annovar/table_annovar.pl 16794N.avinput ~/annovar/humandb/ -buildver hg19 -out 16794N -remove -protocol refGene,cytoBand,1000g2015aug_all,1000g2015aug_eur,1000g2015aug_afr,dbnsfp42c,cg69,ljb26_all,snp138 -operation g,r,f,f,f,f,f,f,f -nastring . -csvout |
在这个命令中:
-buildver hg19
指定参考基因组版本。-out sample
指定输出文件的前缀。-remove
表示注释完成后删除中间文件。-protocol
指定使用的数据库,这里使用了refGene
、cytoBand
、1000g2015aug_all
、1000g2015aug_eur
、1000g2015aug_afr
、dbnsfp42c
、cg69
、ljb26_all
、snp138
。-operation
指定每个数据库的操作类型,g
表示基因注释,r
表示区域注释,f
表示过滤数据库。-nastring .
指定将缺失值表示为 "."-csvout
表示输出文件为 CSV 格式。
# 步骤 4:查看结果
由于我们删除了中间文件,注释完成后我们将只得到 1 个 csv 输出文件 16794B.hg19_multianno.csv
# 批量操作
现在我们对 22 个样本进行批量注释:
# 使用循环
如果服务器比较拥挤,可以使用循环:
nano process_vcfs.sh |
写入以下代码:
#!/bin/bash | |
# 定义样本文件夹的根目录 | |
root_dir=~/WGS_SD/result | |
# 循环处理每个样本文件夹 | |
for sample_dir in "$root_dir"/*; do | |
if [ -d "$sample_dir" ]; then | |
sample_name=$(basename "$sample_dir") | |
vcf_file="$sample_dir/gvcf/${sample_name}.g.vcf.gz" | |
avinput_file="$sample_dir/gvcf/${sample_name}.avinput" | |
output_prefix="$sample_dir/gvcf/${sample_name}.annovar" | |
# 检查 VCF 文件是否存在 | |
if [ -f "$vcf_file" ]; then | |
echo "Processing $vcf_file" | |
# 检查 AVINPUT 文件是否存在 | |
if [ ! -f "$avinput_file" ]; then | |
# 将 VCF 文件转换为 ANNOVAR 输入格式 | |
echo "Converting $vcf_file to $avinput_file" | |
perl ~/annovar/convert2annovar.pl -format vcf4 "$vcf_file" -outfile "$avinput_file" | |
else | |
echo "$avinput_file already exists, skipping conversion." | |
fi | |
# 进行注释 | |
perl ~/annovar/table_annovar.pl "$avinput_file" ~/annovar/humandb/ -buildver hg19 -out "$output_prefix" -remove -protocol refGene,cytoBand,1000g2015aug_all,1000g2015aug_eur,1000g2015aug_afr,dbnsfp42c,cg69,ljb26_all,snp138 -operation g,r,f,f,f,f,f,f,f -nastring . -csvout | |
else | |
echo "VCF file $vcf_file not found, skipping." | |
fi | |
fi | |
done |
赋予执行权限,并使用 nohup 命令后台运行:
chmod +x process_vcfs.sh | |
nohup ./process_vcfs.sh > process_vcfs.log 2>&1 & |
# 同时进行
当我们的 vcf 文件很大的时候,转换和注释通常非常耗时,因此在服务器空闲的情况下我们可以同时进行:
nano run_all_samples.sh |
写入以下代码:
#!/bin/bash | |
# 定义样本文件夹的根目录 | |
root_dir=~/WGS_SD/result | |
# 循环处理每个样本文件夹 | |
for sample_dir in "$root_dir"/*; do | |
if [ -d "$sample_dir" ]; then | |
sample_name=$(basename "$sample_dir") | |
vcf_file="$sample_dir/gvcf/${sample_name}.g.vcf.gz" | |
avinput_file="$sample_dir/gvcf/${sample_name}.avinput" | |
output_prefix="$sample_dir/gvcf/${sample_name}.annovar" | |
output_file="$sample_dir/gvcf/${sample_name}.annovar.hg19_multianno.csv" | |
script_file="${sample_name}_process.sh" | |
# 检查是否已经完成注释 | |
if [ -f "$output_file" ]; then | |
echo "Annotation for $sample_name already completed, skipping." | |
continue | |
fi | |
# 检查 VCF 文件是否存在 | |
if [ -f "$vcf_file" ]; then | |
echo "Processing $vcf_file" | |
# 生成处理脚本 | |
cat <<EOT > $script_file | |
#!/bin/bash | |
# 定义变量 | |
vcf_file="$vcf_file" | |
avinput_file="$avinput_file" | |
output_prefix="$output_prefix" | |
# 检查 AVINPUT 文件是否存在 | |
if [ ! -f "\$avinput_file" ]; then | |
# 将 VCF 文件转换为 ANNOVAR 输入格式 | |
echo "Converting \$vcf_file to \$avinput_file" | |
perl ~/annovar/convert2annovar.pl -format vcf4 "\$vcf_file" -outfile "\$avinput_file" | |
else | |
echo "\$avinput_file already exists, skipping conversion." | |
fi | |
# 进行注释 | |
perl ~/annovar/table_annovar.pl "\$avinput_file" ~/annovar/humandb/ -buildver hg19 -out "\$output_prefix" -remove -protocol refGene,cytoBand,1000g2015aug_all,1000g2015aug_eur,1000g2015aug_afr,dbnsfp42c,cg69,ljb26_all,snp138 -operation g,r,f,f,f,f,f,f,f -nastring . -csvout | |
EOT | |
# 确保脚本具有可执行权限 | |
chmod +x $script_file | |
# 使用 nohup 运行处理脚本 | |
nohup ./$script_file > ${sample_name}_process.log 2>&1 & | |
else | |
echo "VCF file $vcf_file not found, skipping." | |
fi | |
fi | |
done |
赋予执行权限,并使用 nohup 命令后台运行:
chmod +x run_all_samples.sh | |
nohup ./run_all_samples.sh > run_all_samples.log 2>&1 & |
# 结果处理
查看注释结果
[limin@localhost result]$ head -n 3 16794B/gvcf/16794B.annovar.hg19_multianno.csv | |
Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,cytoBand,1000g2015aug_all,1000g2015aug_eur,1000g2015aug_afr,SIFT_score,SIFT_converted_rankscore,SIFT_pred,SIFT4G_score,SIFT4G_converted_rankscore,SIFT4G_pred,LRT_score,LRT_converted_rankscore,LRT_pred,MutationTaster_score,MutationTaster_converted_rankscore,MutationTaster_pred,MutationAssessor_score,MutationAssessor_rankscore,MutationAssessor_pred,FATHMM_score,FATHMM_converted_rankscore,FATHMM_pred,PROVEAN_score,PROVEAN_converted_rankscore,PROVEAN_pred,MetaSVM_score,MetaSVM_rankscore,MetaSVM_pred,MetaLR_score,MetaLR_rankscore,MetaLR_pred,MetaRNN_score,MetaRNN_rankscore,MetaRNN_pred,M-CAP_score,M-CAP_rankscore,M-CAP_pred,MutPred_score,MutPred_rankscore,MVP_score,MVP_rankscore,MPC_score,MPC_rankscore,PrimateAI_score,PrimateAI_rankscore,PrimateAI_pred,DEOGEN2_score,DEOGEN2_rankscore,DEOGEN2_pred,BayesDel_addAF_score,BayesDel_addAF_rankscore,BayesDel_addAF_pred,BayesDel_noAF_score,BayesDel_noAF_rankscore,BayesDel_noAF_pred,ClinPred_score,ClinPred_rankscore,ClinPred_pred,LIST-S2_score,LIST-S2_rankscore,LIST-S2_pred,Aloft_pred,Aloft_Confidence,DANN_score,DANN_rankscore,fathmm-MKL_coding_score,fathmm-MKL_coding_rankscore,fathmm-MKL_coding_pred,fathmm-XF_coding_score,fathmm-XF_coding_rankscore,fathmm-XF_coding_pred,Eigen-raw_coding,Eigen-raw_coding_rankscore,Eigen-PC-raw_coding,Eigen-PC-raw_coding_rankscore,integrated_fitCons_score,integrated_fitCons_rankscore,integrated_confidence_value,GERP++_NR,GERP++_RS,GERP++_RS_rankscore,phyloP100way_vertebrate,phyloP100way_vertebrate_rankscore,phyloP30way_mammalian,phyloP30way_mammalian_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons30way_mammalian,phastCons30way_mammalian_rankscore,SiPhy_29way_logOdds,SiPhy_29way_logOdds_rankscore,Interpro_domain,GTEx_V8_gene,GTEx_V8_tissue,cg69,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,RadialSVM_score,RadialSVM_pred,LR_score,LR_pred,VEST3_score,CADD_raw,CADD_phred,GERP++_RS,phyloP46way_placental,phyloP100way_vertebrate,SiPhy_29way_logOdds,snp138 | |
chrM,150,150,T,C,"upstream","RNR1","dist=500",.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"rs62581312" | |
chrM,195,195,C,T,"upstream","RNR1","dist=455",.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,"rs2857291" |
发现有些值两侧有引号,这会对后面的处理产生影响,需要去除引号
nano remove_quotes.sh |
写入以下代码:
#!/bin/bash | |
# 定义样本文件夹的根目录 | |
root_dir="/home/limin/WGS_SD/result" | |
# 遍历每个样本文件夹 | |
for sample_dir in "$root_dir"/*; do | |
# 检查是否为目录 | |
if [ -d "$sample_dir" ]; then | |
# 定义原始 CSV 文件路径 | |
csv_file="$sample_dir/gvcf/$(basename "$sample_dir").annovar.hg19_multianno.csv" | |
# 定义去除引号后的新文件路径 | |
new_csv_file="$sample_dir/gvcf/$(basename "$sample_dir").annovar.hg19_multianno_no_quotes.csv" | |
# 检查原始 CSV 文件是否存在 | |
if [ -f "$csv_file" ]; then | |
# 去除 CSV 文件中的引号,并保存到新文件 | |
tail -n +1 "$csv_file" | sed 's/"//g' > "$new_csv_file" | |
echo "Processed $csv_file -> $new_csv_file" | |
else | |
echo "File $csv_file does not exist." | |
fi | |
fi | |
done |
赋予执行权限,并使用 nohup 命令后台运行:
chmod +x remove_quotes.sh | |
nohup ./remove_quotes.sh > remove_quotes.log 2>&1 & |
# 筛选并合并所有样本的注释
nano merge_annotations.sh |
写入以下代码:
#!/bin/bash | |
# 定义样本文件夹的根目录 | |
root_dir=~/WGS_SD/result | |
output_file="merged_annotations.csv" | |
# 创建合并文件并添加表头 | |
echo "Chr,Start,End,Ref,Alt,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,Func.refGene,cytoBand,1000g2015aug_all,Tumor_Sample_Barcode" > $output_file | |
# 遍历所有样本文件夹 | |
for sample_dir in "$root_dir"/*; do | |
if [ -d "$sample_dir" ]; then | |
# 查找注释 CSV 文件 | |
sample_file="$sample_dir/gvcf/$(basename "$sample_dir").annovar.hg19_multianno_no_quotes.csv" | |
if [ -f "$sample_file" ]; then | |
sample_name=$(basename "$sample_file" | cut -d. -f1) | |
# 使用 awk 处理文件 | |
awk -v sample="$sample_name" -F',' ' | |
BEGIN { OFS="," } | |
NR == 1 { next } # 跳过表头 | |
$6 != "intergenic" && $6 != "intronic" && $12 < 0.001 && $12 != "." { | |
print $1, $2, $3, $4, $5, $7, $8, $9, $10, $6, $11, $12, sample | |
} | |
' "$sample_file" >> $output_file | |
fi | |
fi | |
done |
赋予执行权限,并使用 nohup 命令后台运行:
chmod +x merge_annotations.sh | |
nohup ./merge_annotations.sh > merge_annotations.log 2>&1 & |
# Annovar 注释结果与 MAF 文件的转换
MAF 文件是 Mutation Annotation Format 文件的缩写,是一种用于存储基因突变注释信息的标准文件格式。
# 如果你的 Annovar 注释结果比较多
可以使用 annovar2maf
这是一个 python 编写的脚本,不过他是针对 txt 文件的,由于我们的注释结果保存为了 csv 文件,需要将脚本中的 line.split("\t")
替换为 line.split(",")
以下是替换后的脚本
#!/usr/bin/env python | |
# File: annovar2maf.py | |
# Author: Anand Mayakonda [https://github.com/PoisonAlien] | |
# Created: July 19, 2023 | |
# Description: This script converts annovar annotations and bcftools csq output to MAF | |
# MIT License | |
# Copyright (c) [2023] [Anand Mayakonda] | |
import argparse | |
import os.path | |
import re | |
def get_variant_type(ref, alt, vc): | |
""" | |
Estimate Variany Type based on ref and alt alleles | |
""" | |
variant_type_mappings = { | |
"Frameshift_INDEL": "INS" if len(alt) > len(ref) else "DEL", | |
"Inframe_INDEL": "INS" if len(alt) > len(ref) else "DEL", | |
"Missense_Mutation": "SNP" | |
} | |
ref_alt = f"{ref}>{alt}" | |
ref_alt_len = len(ref) + len(alt) | |
ref_alt_diff = len(ref) - len(alt) | |
if vc in variant_type_mappings: | |
return variant_type_mappings[vc] | |
elif ref_alt_diff < 0: | |
return "INS" | |
elif ref_alt_diff > 0: | |
return "DEL" | |
elif ref_alt in ["->A", "->C", "->T", "->G"]: | |
return "INS" | |
elif ref_alt in ["A>-", "C>-", "T>-", "G>-"]: | |
return "DEL" | |
elif ref_alt_len == 2: | |
return "SNP" | |
elif ref_alt_len == 4: | |
return "DNP" | |
elif ref_alt_len == 6: | |
return "TNP" | |
elif ref_alt_len > 6: | |
return "ONP" | |
else: | |
return "NA" | |
def reformat_aachange(input_string): | |
""" | |
Reformat bcftools csq aminio acid change info to standard HGVSp format | |
Example: | |
55HC>55HG to HC55HG | |
253Y to Y253Y | |
215E>215* to E215* | |
""" | |
spl = input_string.split(">") | |
if len(spl) == 2: | |
hgvsp = re.sub('[^A-Za-z]+', '', spl[0]) + re.sub('[^0-9]+', '', spl[0]) + re.sub('[0-9]+', '', spl[1]) | |
else: | |
hgvsp = re.sub('[^A-Za-z]+', '', spl[0]) + re.sub('[^0-9]+', '', spl[0]) + re.sub('[^A-Za-z]+', '', spl[0]) | |
return hgvsp | |
def csq2maf(csq, tsb): | |
""" | |
Takes bcftools csq formatted output and converts to maf | |
bcftools example command: | |
bcftools norm -f GRCh37.fa -m -both -Oz foo.vcf.gz | bcftools csq -c CSQ -f GRCh37.fa -g Homo_sapiens.GRCh37.82.gff3.gz -p a | bcftools +split-vep /dev/stdin -Oz -o foo.csq.vcf.gz -c - -s worst | |
bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%gene\t%transcript\t%Consequence\t%amino_acid_change\t%dna_change\n' foo.csq.vcf.gz > foo.csq.tsv | |
foo.csq.tsv would be the input | |
""" | |
csq_to_vc_dict = {"synonymous": "Silent", | |
"missense": "Missense_Mutation", | |
"stop_lost": "Nonstop_Mutation", | |
"stop_gained": "Nonsense_Mutation", | |
"inframe_deletion": "In_Frame_Del", | |
"inframe_insertion": "In_Frame_Ins", | |
"frameshift": "INDEL", | |
"splice_acceptor": "Splice_Site", | |
"splice_donor": "Splice_Site", | |
"start_lost": "Translation_Start_Site", | |
"splice_region": "Splice_Region", | |
"stop_retained": "Silent", | |
"5_prime_utr": "5'UTR", | |
"3_prime_utr": "3'UTR", | |
"non_coding": "RNA", | |
"intron": "Intron", | |
"intergenic": "IGR", | |
"inframe_altering": "", | |
"coding_sequence": "Missense_Mutation", | |
"feature_elongation": "Targeted_Region", | |
"start_retained": "Silent", | |
".": "IGR", | |
"NA": "NA"} | |
with open(csq, 'r') as csq_file: | |
for line in csq_file: | |
line_spl = line.strip().split("\t") | |
variant_classification = csq_to_vc_dict.get(line_spl[6], "NA") | |
# Add Variant-type annotations based on the difference between ref and alt alleles | |
variant_type = get_variant_type( | |
line_spl[2], line_spl[3], variant_classification) | |
# Reformat amino acid change to HGVSp convention | |
if line_spl[7] == ".": | |
aachange = "NA" | |
else: | |
aachange = reformat_aachange(line_spl[7]) | |
# Refgene in Unknonw if IGR or missing | |
if line_spl[4] in ["NA", "NONE", "."]: | |
refgene = "Unknown" | |
else: | |
refgene = line_spl[4] | |
maf = [meta, refgene, line_spl[0], line_spl[1], | |
line_spl[1], variant_classification, variant_type, line_spl[2], line_spl[3], line_spl[5], line_spl[8], line_spl[7], aachange] | |
print('\t'.join([str(x) for x in maf])) | |
def parse_record(line, col_idx, col_idx_non): | |
""" | |
Convert an annovar line to MAF. Retain all other columns as is | |
""" | |
# Annovar to MAF mappings (http://annovar.openbioinformatics.org/en/latest/user-guide/gene/) | |
annovar_values = { | |
'exonic': 'RNA', | |
'splicing': 'Splice_Site', | |
'UTR5': "5'UTR", | |
'UTR3': "3'UTR", | |
'intronic': 'Intron', | |
'upstream': "5'Flank", | |
'downstream': "3'Flank", | |
'intergenic': 'IGR', | |
'frameshift insertion': 'Frame_Shift_Ins', | |
'frameshift deletion': 'Frame_Shift_Del', | |
'frameshift block substitution': 'Frameshift_INDEL', | |
'frameshift substitution': 'Frameshift_INDEL', | |
'stopgain': 'Nonsense_Mutation', | |
'stoploss': 'Nonstop_Mutation', | |
'startloss': 'Translation_Start_Site', | |
'startgain': 'Unknown', | |
'nonframeshift insertion': 'In_Frame_Ins', | |
'nonframeshift deletion': 'In_Frame_Del', | |
'nonframeshift block substitution': 'Inframe_INDEL', | |
'nonframeshift substitution': 'Inframe_INDEL', | |
'nonsynonymous SNV': 'Missense_Mutation', | |
'synonymous SNV': 'Silent', | |
'unknown': 'Unknown', | |
'ncRNA_exonic': 'RNA', | |
'ncRNA_intronic': 'RNA', | |
'ncRNA_UTR3': 'RNA', | |
'ncRNA_UTR5': 'RNA', | |
'ncRNA': 'RNA', | |
'ncRNA_splicing': 'RNA', | |
'NA': 'NA', | |
'.': 'NA' | |
} | |
# Take the fisrt functional entry | |
linespl = line.split(",") | |
func_refgene = linespl[col_idx.get("Func.refGene")] | |
func_refgene = func_refgene.split(";")[0] | |
#Take the fisrt functional entry | |
exonicFunc_refgene = linespl[col_idx.get("ExonicFunc.refGene")] | |
exonicFunc_refgene = exonicFunc_refgene.split(";")[0] | |
# Refgene in Unknonw if IGR or missing | |
refgene = linespl[col_idx.get("Gene.refGene")] | |
refgene = refgene.split(";")[0] | |
if refgene in ["NA", "NONE"]: | |
refgene = "Unknown" | |
#Use first transcript changes by default | |
aa_change = linespl[col_idx.get("AAChange.refGene")] | |
aa_change = aa_change.split(",")[0] | |
aa_change = aa_change.split(":") | |
# "Transcript_ID", "Exon_Number", "HGVSc", "HGVSp" | |
if len(aa_change) == 5: | |
aa_change = [aa_change[1], aa_change[2], aa_change[3], aa_change[4]] | |
else: | |
aa_change = ["NA", "NA", "NA", "NA"] | |
if exonicFunc_refgene == "NA": | |
variant_classification = annovar_values[func_refgene] | |
else: | |
variant_classification = annovar_values[exonicFunc_refgene] | |
# Add Variant-type annotations based on the difference between ref and alt alleles | |
variant_type = get_variant_type(linespl[col_idx.get("Ref")], linespl[col_idx.get("Alt")], variant_classification) | |
res = [refgene, linespl[col_idx.get("Chr")], linespl[col_idx.get("Start")], linespl[col_idx.get("End")], | |
variant_classification, variant_type, linespl[col_idx.get("Ref")], linespl[col_idx.get("Alt")]] + aa_change | |
for idx in list(col_idx_non.values()): | |
if len(linespl)-1 >= idx: | |
res.append(linespl[idx]) | |
else: | |
res.append("NA") | |
return '\t'.join([str(x) for x in res]) | |
def Diff(li1, li2): | |
""" | |
Tiny function to diff two lists. From internet, dont remeber where unfortunately :\ | |
""" | |
li_dif = [i for i in li1 + li2 if i not in li1 or i not in li2] | |
return li_dif | |
def read_annovar_file(file_path, meta, protocol): | |
with open(file_path, 'r') as annovar_file: | |
first_line = next(annovar_file).strip() | |
if protocol == "refGene": | |
essential_cols = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene', | |
'ExonicFunc.refGene', 'AAChange.refGene'] | |
else: | |
essential_cols = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.ensGene', 'Gene.ensGene', 'GeneDetail.ensGene', | |
'ExonicFunc.ensGene', 'AAChange.ensGene'] | |
nonessential_cols = Diff(first_line.split(","), essential_cols) | |
essential_cols_dict = {col: idx for idx, col in enumerate(first_line.split(",")) if col in essential_cols} | |
nonessential_cols_dict = {col: idx for idx, col in enumerate(first_line.split(",")) if col in nonessential_cols} | |
# In case ensGene is used as a protocol, rename the keys to refGene to harmonise the input | |
if protocol == "ensGene": | |
key_mapping = {'Chr': 'Chr', 'Start': 'Start', 'End': 'End', 'Func.ensGene': 'Func.refGene', | |
'Gene.ensGene': 'Gene.refGene', 'GeneDetail.ensGene': 'GeneDetail.refGene', | |
'ExonicFunc.ensGene': 'ExonicFunc.refGene', 'AAChange.ensGene': 'AAChange.refGene'} | |
for old_key, new_key in key_mapping.items(): | |
essential_cols_dict[new_key] = essential_cols_dict.pop(old_key) | |
hdr = ["Tumor_Sample_Barcode", "NCBI_Build", "Center", "Hugo_Symbol", "Chromosome", "Start_Position", "End_Position", "Variant_Classification", "Variant_Type", | |
"Reference_Allele", "Tumor_Seq_Allele2","Transcript_ID", "Exon_Number", "HGVSc", "HGVSp"] + nonessential_cols | |
yield '\t'.join(hdr) | |
for line in annovar_file: | |
maf_line = [meta, parse_record(line.strip(), essential_cols_dict, nonessential_cols_dict)] | |
print('\t'.join([str(x) for x in maf_line])) | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser( | |
description="Convert annovar and bcftools-csq annotations to MAF", prog="annovar2maf") | |
parser.add_argument( | |
'input', help="Annovar anotations file [Ex: myanno.hg19_multianno.txt] or a csq formatted file.") | |
parser.add_argument("-t", "--tsb", help="Sample name. Default parses from the file name") | |
parser.add_argument( | |
"-b", "--build", help="Reference genome build [Default: hg38]", default="hg38") | |
parser.add_argument( | |
"-p", "--protocol", help="Protocol used to generate annovar annotations [Default: refGene]", default="refGene", choices=["refGene", "ensGene"]) | |
parser.add_argument( | |
"-c", "--csq", help="Input file is a bcftools csq formatted output", action='store_true') | |
args = parser.parse_args() | |
if args.tsb is None: | |
tsb = os.path.basename(args.input).split(".")[0] | |
else: | |
tsb = args.tsb | |
meta = '\t'.join([str(x) for x in [tsb, args.build, "NA"]]) | |
if args.csq == True: | |
hdr = ["Tumor_Sample_Barcode", "NCBI_Build", "Center", "Hugo_Symbol", "Chromosome", "Start_Position", "End_Position", "Variant_Classification", "Variant_Type", | |
"Reference_Allele", "Tumor_Seq_Allele2", "Transcript_ID", "HGVSc", "amino_acid_change", "HGVSp"] | |
print('\t'.join(hdr)) | |
csq2maf(args.input, tsb) | |
else: | |
for output_line in read_annovar_file(args.input, meta, args.protocol): | |
print(output_line) |
此脚本会在第一列添加一列脚本自定义的 Tumor_Sample_Barcode
,使用以下命令去除:
cut -f2- merged_annotations.maf > merged_annotations_python.maf |
生成的 merged_annotations_python.maf
文件可以使用 R 语言的 maftools
包来直接进行分析和绘图
# 如果你的 Annovar 注释结果比较少
我们可以使用 R 语言的 maftools
包来直接进行转换、分析和绘图:
library(maftools) | |
# 转换 | |
var.annovar.maf <- annovarToMaf(annovar = ("merged_annotations.csv"), | |
refBuild = 'hg19', | |
tsbCol = 'Sample', | |
table = 'refGene', | |
sep = ",") | |
write.table(var.annovar.maf, file = "merged_annotations_R.maf", quote = F, sep = "\t", row.names = F) | |
# 绘图 | |
var_maf <- read.maf(maf = "merged_annotations_R.maf") | |
png(filename = "WES_maf_summary.png", width = 10, height = 6, units = "in", res = 300) | |
plotmafSummary(maf = var_maf, rmOutlier = TRUE, addStat = 'median') | |
dev.off() | |
png(filename = "WES_oncoplot.png", width = 15, height = 10, units = "in", res = 300) | |
oncoplot(maf = var_maf, top = 30) | |
dev.off() |
使用 R 包转换的文件会稍大一点,但是它们的绘图结果是一致的。