5 Variant Annotation
5.1 Variant Effect Predictor
We annotate the variants with VEP for molecular consequence, allele frequency, clinical variant databases, i.e. Clinvar, Mastermind, and a set of pathogenicity prediction tools, i.e., CADD, PrimateAI, SpliceAI, EVE, MutationTaster, REVEL.
apptainer exec -B vep_data:/opt/vep/.vep vep.sif /opt/vep/src/ensembl-vep/vep \
\
--dir_cache /opt/vep/.vep/ \
--dir_plugins /opt/vep/.vep/Plugins/ \
-i /opt/vep/.vep/output_vqsr_snps_hard_indels.vcf.gz \
-o /opt/vep/.vep/ensembl_annotated.txt \
--individual EC27_proband \
--everything \
--check_existing \
--xref_refseq \
--fasta /opt/vep/.vep/Homo_sapiens.GRCh38.fa \
--hgvs \
--hgvsg \
--plugin CADD,/opt/vep/.vep/Plugins/data/grch38/whole_genome_SNVs_grch38.tsv.gz,/opt/vep/.vep/Plugins/data/grch38/gnomad.genomes.r3.0.indel_grch38.tsv.gz \
--plugin PrimateAI,/opt/vep/.vep/Plugins/data/grch38/PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz \
--plugin SpliceAI,snv=/opt/vep/.vep/Plugins/data/grch38/spliceai_scores.raw.snv.hg38.vcf.gz,indel=/opt/vep/.vep/Plugins/data/grch38/spliceai_scores.raw.indel.hg38.vcf.gz \
--plugin EVE,file=/opt/vep/.vep/Plugins/data/grch38/eve_merged.vcf.gz \
--plugin Mastermind,/opt/vep/.vep/Plugins/data/grch38/mastermind_cited_variants_reference-2023.04.02-grch38.vcf.gz \
--plugin dbNSFP,/opt/vep/.vep/Plugins/data/grch38/dbNSFP4.3a_grch38.gz,MutationTaster_score,MutationTaster_pred \
--plugin REVEL,/opt/vep/.vep/Plugins/data/grch38/new_tabbed_revel_grch38.tsv.gz \
--custom /opt/vep/.vep/Plugins/data/grch38/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN \
--cache \
--merged \
--offline \
--use_given_ref \
--force_overwrite \
--assembly GRCh38 \
--fork 2 \
--buffer_size 1000 --tab
5.2 Filtration
We perform a first round of filtering with VEP to reduce the size of the VCF to be loaded into R. In particular, we select variants with high or moderate impact and a gnomAD exome allele frequency smaller than 0.05.
apptainer exec -B vep_data:/opt/vep/.vep vep.sif /opt/vep/src/ensembl-vep/filter_vep \
\
-i /opt/vep/.vep/ensembl_annotated.txt \
-out /opt/vep/.vep/ensembl_ann_vep_filt.txt "(IMPACT is HIGH or IMPACT is MODERATE) and (gnomADe_AF <= 0.05 or not gnomADe_AF)" \
--filter \
--force_overwrite --format tab
We load the phased variants into R and select autosomal variants that have passed all variant calling filters.
library(tidyverse)
library(VariantAnnotation)
# Set sample names
= "HG004"
mother = "HG003"
father = "HG002"
proband # Load WhatsHap phased variants
<- readVcf("data/AJ_phased_whatshap_refined.vcf", "hg38")
wh <- expand(wh)
wh # Variant info
<- rowRanges(wh) %>% as.data.frame() %>% as_tibble()
pos # Genotype
<- geno(wh)$GT %>% as_tibble() %>%
wh_geno ::rename("mother"= sym(mother), "father"= sym(father), "proband"= sym(proband))
dplyr# Phasing block
<- geno(wh)$PS %>% as_tibble() %>%
wh_ps ::rename("PS_mother" = sym(mother), "PS_father"= sym(father), "PS_proband"= sym(proband))
dplyr# Genotype quality
<- geno(wh)$GQ %>% as_tibble() %>%
wh_gq ::rename("GQ_mother" = sym(mother), "GQ_father"= sym(father), "GQ_proband"= sym(proband))
dplyr# Allelic depth
<- geno(wh)$AD %>% as_tibble() %>%
wh_ad ::rename("AD_ref_mother" = paste0(sym(mother),".1"), "AD_ref_father"= paste0(sym(father),".1"), "AD_ref_proband"= paste0(sym(proband),".1")) %>%
dplyr::rename("AD_alt_mother" = paste0(sym(mother),".2"), "AD_alt_father"= paste0(sym(father),".2"), "AD_alt_proband"= paste0(sym(proband),".2"))
dplyr# Join variant info, genotype, phasing block, genotype quality and allelic depth
<- bind_cols(pos,wh_geno,wh_gq,wh_ps,wh_ad)
wh_geno0 # Identify alternate alleles with number
<- wh_geno0 %>%
wh_geno01 group_by(seqnames,start,end,width,REF) %>%
mutate(counter = row_number()) %>%
ungroup() %>%
::filter(FILTER=="PASS")
dplyr# Convert variant to VEP format. Add 1 to indel start site.
<- wh_geno01 %>%
wh_geno01 mutate(Location = ifelse(
==1, ifelse(
widthnchar(ALT)==1, paste(seqnames,start,sep=":"), paste0(seqnames,":",start,"-",start+1)), ifelse(
+1==end, paste0(seqnames,":",start+1), paste0(seqnames,":",start+1,"-",end)))) %>%
startmutate(Allele = ifelse(substr(REF,1,1)==substr(ALT,1,1), substr(ALT,2,nchar(ALT)), ALT))
# Convert variant to gnomAD format.
<- function(a1,a2) {
trim <- str_sub(a1,2,-1)
b1 <- str_sub(a2,2,-1)
b2 repeat {
if (nchar(b1)==0 | nchar(b2)==0) break
if (str_sub(b1,-1,-1)!=str_sub(b2,-1,-1)) break
<- str_sub(b1,1,-2)
b1 <- str_sub(b2,1,-2)
b2
}paste(str_sub(a1,1,1),b1,"-",str_sub(a2,1,1),b2,sep="")
}<- wh_geno01 %>% rowwise %>%
wh_geno01 ::mutate(gnomADid = paste(substr(seqnames,4,5),start,trim(REF,ALT),sep="-"))
dplyr$Allele[wh_geno01$Allele==""] <- "-"
wh_geno01# Overlapping deletions
<- wh_geno01 %>% dplyr::filter(ALT=="")
wh_geno_overlap <- wh_geno01 %>% dplyr::filter(ALT!="")
wh_geno1 # Select columns
<- wh_geno1[c(27,28,11:25,26,29)]
wh_geno1 # Select X chromosome
<- wh_geno1 %>% dplyr::filter(str_detect(Location,"chrX"))
wh_geno_X # Select autosomal variants
<- wh_geno1 %>%
wh_geno_auto ::filter(!str_detect(Location,"chrX")) %>%
dplyr::filter(!str_detect(Location,"chrY")) %>%
dplyr::filter(!str_detect(Location,"M")) dplyr
We load the VEP annotated variants into R and select variants with an allele frequency <0.01 that fall in RefSeq transcripts.
<- read_table("data/AJ_ensembl_ann_vep_filt_09_01_23.txt", comment = "##", na="-", col_types = list(
result `#Uploaded_variation`=col_factor(), Gene=col_factor(), Feature=col_factor(), MANE_PLUS_CLINICAL = col_character(),
MOTIF_NAME = col_character(), HIGH_INF_POS = col_character(), TRANSCRIPTION_FACTORS = col_character(),
PUBMED=col_character(), MOTIF_POS = col_character())) %>%
::rename(Uploaded_variation=`#Uploaded_variation`)
dplyr$Allele[is.na(result$Allele)] <- "-"
result<- result %>% as_tibble %>% left_join(wh_geno1, by = c("Location","Allele"))
result1 <- result1 %>%
refseq0 ::filter(SOURCE == "RefSeq") %>%
dplyr::filter(gnomADe_AF<=0.01|is.na(gnomADe_AF)) %>%
dplyr::filter(gnomADg_AF<=0.01|is.na(gnomADg_AF)) %>%
dplyr::filter(!(str_detect(Feature,"XM"))) %>%
dplyrselect_if(~any(!is.na(.)))
We add gene-level information from OMIM, Human Phenotype Ontology (HPO), ClinGen, GenCC, Orphanet. We select HPO terms for each gene that are associated with a particular phenotype, in this case, kidney disease.
library(ontologyIndex)
# OMIM
<- read_tsv("data/genemap2_edit.txt", name_repair="universal", col_types=list(Entrez.Gene.ID = col_factor())) %>%
omim ::rename(OMIM_Phenotypes = Phenotypes) %>%
dplyr::select(Entrez.Gene.ID,OMIM_Phenotypes)
dplyr# Human Phenotype Ontology
<- read_tsv("data/genes_to_phenotype.txt", col_types=list(entrez_gene_id = col_factor()))
hpo_data <- get_ontology(file = 'data/hp_obo.txt', propagate_relationships = "is_a", extract_tags = "minimal")
hpo <- get_descendants(hpo, roots="HP:0000077") #Abnormality of the kidney
hpo_kidney <- hpo_data %>%
gen_hpo_kidney ::filter(hpo_term_id %in% hpo_kidney) %>%
dplyr::arrange(frequency_hpo) %>%
dplyr::mutate(frequency = case_when(
dplyr== "HP:0040284" ~ " (very rare)", frequency_hpo == "HP:0040283" ~ " (occasional)",
frequency_hpo == "HP:0040282" ~ " (frequent)", frequency_hpo == "HP:0040281" ~ " (very frequent)",
frequency_hpo == "HP:0040280" ~ " (obligate)", TRUE ~ "")) %>%
frequency_hpo unite("renal_hpo_term", c(hpo_term_name, frequency), remove = FALSE, sep = "") %>%
::group_by(entrez_gene_id) %>%
dplyr::summarise(renal_hpo_term = toString(unique(renal_hpo_term)))
dplyr<- hpo_data %>%
gen_hpo_extra ::filter(frequency_hpo %in% c("HP:0040281", "HP:0040280")) %>%
dplyr::arrange(frequency_hpo) %>%
dplyr::filter(!(hpo_term_id %in% hpo_kidney)) %>%
dplyr::group_by(entrez_gene_id) %>%
dplyr::summarise(extrarenal_hpo_term = toString(unique(hpo_term_name)))
dplyr# ClinGen
<- read_csv("data/Clingen-Curation.csv", skip = 3) %>%
clingen ::mutate(assertion=gsub("\\s*\\([^\\)]+\\)","", gene_disease_validity_assertion_classifications)) %>%
dplyr::mutate(assertion=paste("(",assertion,")",sep="")) %>%
dplyr::mutate(assertion=if_else(assertion=="(NA)",NA,assertion)) %>%
dplyrunite("disease", c(disease_label, assertion), remove = FALSE, na.rm=TRUE, sep = " ") %>%
::group_by(hgnc_id) %>%
dplyr::summarise(clingen_disease_label = paste(disease,collapse="; "))
dplyr# GenCC including Orphanet data
<- read_tsv("data/gencc-submissions.tsv")
gencc <- gencc %>%
gencc_def ::filter(submitter_title != "Orphanet") %>%
dplyr::filter(classification_title == "Definitive") %>%
dplyr::group_by(gene_symbol, gene_curie, disease_title) %>%
dplyr::summarise(
dplyrsubmitter_title = toString(unique(submitter_title)), moi_title = toString(unique(moi_title)),
disease_curie = toString(unique(disease_curie)), disease_original_curie = toString(unique(disease_original_curie)),
classification_title = toString(classification_title), submitted_as_pmids = toString(unique(submitted_as_pmids))) %>%
::mutate(submitter = paste("(", submitter_title, ")", sep = "")) %>%
dplyr::unite("gencc_disease_definitive", c(disease_title, submitter), remove = FALSE, sep = " ") %>%
tidyr::select(gene_curie, gencc_disease_definitive) %>%
dplyr::group_by(gene_curie) %>%
dplyr::summarise(gencc_disease_definitive = paste(unique(gencc_disease_definitive), collapse="; "))
dplyr<- gencc %>%
gencc_strong ::filter(submitter_title != "Orphanet") %>%
dplyr::filter(classification_title == "Strong") %>%
dplyr::group_by(gene_symbol, gene_curie, disease_title) %>%
dplyr::summarise(
dplyrsubmitter_title = toString(unique(submitter_title)), moi_title = toString(unique(moi_title)),
disease_curie = toString(unique(disease_curie)), disease_original_curie = toString(unique(disease_original_curie)),
classification_title = toString(classification_title), submitted_as_pmids = toString(unique(submitted_as_pmids))) %>%
::mutate(submitter = paste("(", submitter_title, ")", sep = "")) %>%
dplyr::unite("gencc_disease_strong", c(disease_title, submitter), remove = FALSE, sep = " ") %>%
tidyr::select(gene_curie, gencc_disease_strong) %>%
dplyr::group_by(gene_curie) %>%
dplyr::summarise(gencc_disease_strong = paste(unique(gencc_disease_strong), collapse="; "))
dplyr<- gencc %>%
orphanet ::filter(submitter_title == "Orphanet") %>%
dplyr::group_by(gene_curie) %>%
dplyr::summarise(orphanet_disease = toString(disease_title), orphanet_moi = toString(unique(moi_title)))
dplyr# Join
<- refseq0 %>%
refseq01 left_join(omim, by=join_by(Gene==Entrez.Gene.ID)) %>%
left_join(gen_hpo_kidney, by=join_by(Gene==entrez_gene_id)) %>%
left_join(gen_hpo_extra, by=join_by(Gene==entrez_gene_id)) %>%
left_join(clingen, by=join_by(HGNC_ID==hgnc_id)) %>%
left_join(gencc_def, by=join_by(HGNC_ID==gene_curie)) %>%
left_join(gencc_strong, by=join_by(HGNC_ID==gene_curie)) %>%
left_join(orphanet, by=join_by(HGNC_ID==gene_curie)) %>%
::select(SYMBOL, Uploaded_variation, Location, Feature, HGVSg, HGVSc, HGVSp, Consequence, ZYG, EXON, INTRON, gnomADe_AF, gnomADg_AF, MAX_AF, MAX_AF_POPS, ClinVar, ClinVar_CLNSIG, OMIM_Phenotypes, renal_hpo_term, extrarenal_hpo_term, REVEL, SIFT, PolyPhen, CADD_PHRED, PrimateAI, SpliceAI_pred, EVE_CLASS, EVE_SCORE, MutationTaster_pred, MutationTaster_score, orphanet_disease, orphanet_moi, Mastermind_MMID3, Existing_variation, PUBMED, mother, father, proband, Allele, counter, AD_ref_mother, AD_ref_father, AD_ref_proband, AD_alt_mother, AD_alt_father, AD_alt_proband, GQ_mother, GQ_father, GQ_proband, PS_mother, PS_father, PS_proband, CANONICAL, MANE_SELECT, MANE_PLUS_CLINICAL, IMPACT, Gene, HGNC_ID, Amino_acids, Codons, AF, AFR_AF, AMR_AF, EAS_AF, EUR_AF, SAS_AF, gnomADe_AFR_AF, gnomADe_AMR_AF, gnomADe_ASJ_AF, gnomADe_EAS_AF, gnomADe_FIN_AF, gnomADe_NFE_AF, gnomADe_OTH_AF, gnomADe_SAS_AF, gnomADg_AFR_AF, gnomADg_AMI_AF, gnomADg_AMR_AF, gnomADg_ASJ_AF, gnomADg_EAS_AF, gnomADg_FIN_AF, gnomADg_MID_AF, gnomADg_NFE_AF, gnomADg_OTH_AF, gnomADg_SAS_AF, VARIANT_CLASS, gnomADid)
dplyr# X chromosome
<- refseq01 %>% dplyr::filter(str_detect(Uploaded_variation,"X"))
refseq_X # Autosomal chromosomes
<- refseq01 %>%
refseq ::filter(!str_detect(Uploaded_variation,"X")) %>%
dplyr::filter(!str_detect(Uploaded_variation,"Y")) %>%
dplyr::filter(!str_detect(Uploaded_variation,"M")) dplyr
5.3 De Novo Variants
We select de novo variants in the proband.
<- wh_geno_auto %>%
geno_novo_all ::filter(! ( ( ( str_detect(mother, substr(proband,1,1)) | str_detect(mother, "\\.") ) &
dplyrstr_detect(father, substr(proband,3,3)) | str_detect(father, "\\.") ) ) |
(str_detect(father, substr(proband,1,1)) | str_detect(father, "\\.") ) &
( ( str_detect(mother, substr(proband,3,3)) | str_detect(mother, "\\.") ) ) ) ) %>%
(::filter(!(str_detect(mother,as.character(counter)) | str_detect(father,as.character(counter))) |
dplyrstr_detect(mother, substr(proband,1,1)) & str_detect(mother, substr(proband,3,3))) |
(str_detect(father, substr(proband,1,1)) & str_detect(father, substr(proband,3,3)))
( )
We discard de novo mutations back to the reference allele (e.g. 1/1, 1/1 –> 0/1) since they are unlikely to be pathogenic.
<- wh_geno_auto %>%
geno_novo_not_ref ::filter(! ( ( ( str_detect(mother, substr(proband,1,1)) | str_detect(mother, "\\.") | substr(proband,1,1)==0 ) &
dplyrstr_detect(father, substr(proband,3,3)) | str_detect(father, "\\.") | substr(proband,3,3)==0 ) ) |
(str_detect(father, substr(proband,1,1)) | str_detect(father, "\\.") | substr(proband,1,1)==0 ) &
( ( str_detect(mother, substr(proband,3,3)) | str_detect(mother, "\\.") | substr(proband,3,3)==0 ) ) ) ) %>%
(::filter(!(str_detect(mother,as.character(counter)) | str_detect(father,as.character(counter))) |
dplyrstr_detect(mother, substr(proband,1,1)) & str_detect(mother, substr(proband,3,3))) |
(str_detect(father, substr(proband,1,1)) & str_detect(father, substr(proband,3,3)))
( )
We increase the cutoff for genotype quality to ensure that the the de novo variants are not a result of incorrect variant calling. Set GQ>=30, i.e, the probability that the call is incorrect is =<0.1%.
<- geno_novo_not_ref %>% dplyr::filter(GQ_mother>=30) %>% dplyr::filter(GQ_father>=30) %>% dplyr::filter(GQ_proband>=30) geno_novo_not_ref_high_qual
We select de novo variants found in the filtered VEP annotation. For each variant, we chose a single gene transcript that contains the variant, prioritizing canonical and curated transcripts.
<- refseq %>% inner_join(geno_novo_not_ref)
prob_novo <- prob_novo %>% group_by(Uploaded_variation) %>%
prob_novo arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
::filter(Feature == Feature[1])
dplyr<- refseq %>% inner_join(geno_novo_not_ref_high_qual)
prob_novo_high_qual <- prob_novo_high_qual %>% group_by(Uploaded_variation) %>%
prob_novo_high_qual arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
::filter(Feature == Feature[1]) dplyr
5.4 Homozygous Variants
We select the homozygous variants in the proband.
<- wh_geno_auto %>%
geno_hom ::filter(substr(proband,1,1)==substr(proband,3,3) & substr(proband,1,1) != 0 & substr(proband,1,1) != ".") dplyr
We discard the homozygous variants for which one of the parents is also homozygous (e.g. 1/1, 0/1 -> 1/1) since these variants are unlikely to be pathogenic.
<- geno_hom %>%
geno_hom2 ::filter(! ( (substr(mother,1,1)==substr(mother,3,3) & substr(mother,1,1)==substr(proband,1,1)) |
dplyrsubstr(father,1,1)==substr(father,3,3) & substr(father,1,1)==substr(proband,1,1)) )
( )
We select the homozygous variants found in the filtered VEP annotation.
<- refseq %>% inner_join(geno_hom2)
prob_hom <- prob_hom %>% group_by(Uploaded_variation) %>%
prob_hom arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
::filter(Feature == Feature[1]) dplyr
For each of these variants, we query the gnomAD database for the number of individuals that are homozygous for that variant.
library(ghql)
library(jsonlite)
<- Sys.getenv("GNOMAD_TOKEN")
token <- GraphqlClient$new(
con url = "https://gnomad.broadinstitute.org/api",
headers = list(Authorization = paste0("Bearer ", token))
)<- Query$new()
qry $query('homozygous',
qry'query Variant($var: String!){
variant(variantId: $var, dataset: gnomad_r4) {
exome {
homozygote_count
}
genome {
homozygote_count
}
}
}')
<- prob_hom$gnomADid
gnomADid <- tibble(gnomADid=NA, gnomADe_hom=NA, gnomADg_hom=NA)
s for (i in gnomADid) {
<- list(var = i)
variables <- 1
tries repeat {
<- try(con$exec(qry$queries$homozygous, variables), silent = TRUE)
x if (!is(x, 'try-error')) {
<- jsonlite::fromJSON(x)$data$variant$exome$homozygote_count
x1 <- jsonlite::fromJSON(x)$data$variant$genome$homozygote_count
x2 message(paste(tries,"success", collapse=" "))
break
}Sys.sleep(2*tries)
message(tries)
<- tries + 1
tries
}if (is.null(x1)) x1 <- NA
if (is.null(x2)) x2 <- NA
<- s %>% add_row(gnomADid=variables$var,gnomADe_hom=x1, gnomADg_hom=x2)
s
}<- left_join(prob_hom, s) %>%
prob_hom ::relocate(c(gnomADe_hom,gnomADg_hom),.after=MAX_AF_POPS) dplyr
5.5 Compound Heterozygous Variants
We select the unphased compound heterozygous variants.
<- refseq %>% dplyr::filter(substr(proband,1,1)!=substr(proband,3,3))
prob_2plus_het <- prob_2plus_het %>% group_by(Feature) %>%
prob_2plus_het ::filter(length(unique(PS_proband))>=2 | length(is.na(PS_proband)[is.na(PS_proband)==TRUE])>=2 ) %>%
dplyrungroup()
<- prob_2plus_het %>% group_by(Gene) %>%
prob_2plus_het2 arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
::filter(Feature == Feature[1]) dplyr
We select the phased compound heterozygous variants.
<- refseq %>% group_by(Feature, PS_proband) %>%
prob_comp_het ::filter(any(substr(proband,1,1)>=1 & substr(proband,2,2)=="|") & any(substr(proband,3,3)>=1 & substr(proband,2,2)=="|"))
dplyr<- prob_comp_het %>% group_by(Gene) %>%
prob_comp_het2 arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
::filter(Feature == Feature[1]) dplyr
For two variants within a gene that cannot be phased with trio data, we can estimate phase using EM-based haplotype estimation from the gnomAD database.
# Obtain the GRCh37 variant ID to query gnomAD
<- prob_2plus_het2$gnomADid
grch38id $query('liftover',
qry'query liftOver($varID: String!){
liftover(liftover_variant_id: $varID, reference_genome:GRCh38) {
source {
variant_id
}
}
}')
<- c()
grch37id for (i in grch38id) {
<- list(varID = i)
variables <- 1
tries repeat {
<- try(con$exec(qry$queries$liftover, variables), silent = TRUE)
x if (!is(x, 'try-error')) {
<- jsonlite::fromJSON(x)$data$liftover$source$variant_id
x message(paste(tries,"success", collapse=" "))
break
}Sys.sleep(2*tries)
message(tries)
<- tries + 1
tries
}if (is.null(x)) x <- NA
<- append(grch37id, x)
grch37id
}<- tibble(gnomADid=grch38id, grch37id)
liftover <- left_join(prob_2plus_het2, liftover)
prob_2plus_het2 # Obtain probability that two variants within a gene are compound heterozygous and phase variants using cutoff of <0.2 for cis and >0.55 for trans
$query('coocurrence',
qry'query Coocurrence($var1: String!, $var2: String!){
variant_cooccurrence(dataset: gnomad_r2_1, variants: [$var1, $var2]) {
p_compound_heterozygous
}
}'
)<- prob_2plus_het2 %>%
prob_2plus_het_gnom ::filter(!is.na(grch37id)) %>%
dplyrgroup_by(SYMBOL) %>%
::filter(n()!=1)
dplyr<- tibble(grch37id=NA, gnomAD_A=NA, gnomAD_B=NA, gnomAD_PS=NA)
s <- tibble(grch37id=NA, gnomAD_cis=NA, gnomAD_trans=NA)
r for (a in unique(prob_2plus_het_gnom$SYMBOL)) {
<- prob_2plus_het_gnom[prob_2plus_het_gnom$SYMBOL==a,]$grch37id
var <- tibble(grch37id=var, gnomAD_A=NA, gnomAD_B=NA, gnomAD_PS=NA)
sub $gnomAD_A[1] <- 1
sub$gnomAD_PS[1] <- 1
subfor (i in 1:(length(var)-1)) {
<- c()
c <- c()
t for (j in (i+1):length(var)) {
<- list(var1 = var[i], var2 = var[j])
variables <- 1
tries repeat {
<- try(con$exec(qry$queries$coocurrence, variables), silent = TRUE)
y if (!is(y, 'try-error')) {
<- jsonlite::fromJSON(y)$data$variant_cooccurrence$p_compound_heterozygous
y message(paste(tries,"success", collapse=" "))
break
}Sys.sleep(2*tries)
message(tries)
<- tries + 1
tries
}if (is.null(y)) y <- "NA"
if (y<0.2) c <- append(c, paste0(variables$var2," (",round(y,digits=2),")"))
if (y>0.55 & y!="NA") t <- append(t, paste0(variables$var2," (",round(y,digits=2),")"))
if (!is.na(sub[[i,"gnomAD_A"]]) & is.na(sub[[i,"gnomAD_B"]])) {
if (y<0.2) {
$gnomAD_A[j] <- i
sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
sub
}if (y>0.55 & y!="NA") {
$gnomAD_B[j] <- i
sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
sub
}
}if (!is.na(sub[[i,"gnomAD_B"]]) & is.na(sub[[i,"gnomAD_A"]])) {
if (y<0.2) {
$gnomAD_B[j] <- i
sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
sub
}if (y>0.55 & y!="NA") {
$gnomAD_A[j] <- i
sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
sub
}
}if (is.na(sub[[i,"gnomAD_A"]]) & is.na(sub[[i,"gnomAD_B"]])) {
$gnomAD_A[i] <- i
sub$gnomAD_PS[i] <- i
subif (y<0.2) {
$gnomAD_A[j] <- i
sub$gnomAD_PS[j] <- i
sub
}if (y>0.55 & y!="NA") {
$gnomAD_B[j] <- i
sub$gnomAD_PS[j] <- i
sub
}
}
}<- r %>% add_row(grch37id=var[i], gnomAD_cis=str_flatten_comma(c), gnomAD_trans=str_flatten_comma(t))
r
}<- rbind(s,sub)
s
}<- prob_2plus_het2 %>% left_join(r) %>% left_join(s) %>%
prob_2plus_het2 ::relocate(c(gnomAD_A,gnomAD_B,gnomAD_PS), .after=PS_proband) dplyr
5.6 X Chromosome
We remove X chromosome calls that are “heterozygous” for male individuals, i.e., proband and father. However, some of these calls may be correct if they lie in the pseudoautosomal region.
<- wh_geno_X %>%
wh_geno_X_filt ::filter(( substr(proband,1,1)==substr(proband,3,3) & substr(father,1,1)==substr(father,3,3)))
dplyr<- inner_join(refseq_X, wh_geno_X_filt) %>% group_by(Gene) %>%
prob_X arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
::filter(Feature == Feature[1]) dplyr
5.7 Disease Genes
We select one entry for each variant in the refseq table, prioritizing canonical and curated transcripts.
<- refseq01 %>% group_by(Uploaded_variation) %>%
refseq2 arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
::filter(Feature == Feature[1]) dplyr
As an example, we select variants that fall in kidney disease genes, i.e., green genes from the UK PanelApp renal broad superpanel and all genes from the Australia PanelApp kidneyome superpanel.
# UK Panel App
<- read_tsv("data/renal_superpanel_broad.tsv")
panel <- panel %>% dplyr::filter(GEL_Status==3)
panel_green <- refseq2 %>% dplyr::filter(HGNC_ID %in% panel_green$HGNC)
refseq_green #Australia Panel App
<- read_tsv("data/Kidneyome_SuperPanel.tsv") %>%
panel_au ::filter(!duplicated(HGNC)) %>%
dplyr::mutate(Status=case_when(
dplyr==4|GEL_Status==3 ~ "Green",
GEL_Status==2 ~ "Amber",
GEL_Status==1|GEL_Status==0 ~ "Red")) %>%
GEL_Status::select(HGNC,Status)
dplyr<- refseq2 %>% inner_join(panel_au, join_by(HGNC_ID==HGNC)) refseq_au
We combine annotations into a single table.
<- refseq01 %>% group_by(Gene) %>%
refseq3 arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>% ungroup()
<- refseq3 %>%
refseq4 left_join(add_column(geno_novo_not_ref,de_novo="de_novo")) %>%
left_join(add_column(geno_novo_not_ref_high_qual,de_novo_gq="de_novo_gq>=30")) %>%
left_join(add_column(geno_hom2, homozygote="homozygote")) %>%
left_join(add_column(prob_2plus_het, unphased_compound_heterozygote="unphased_compound_heterozygote")) %>%
left_join(add_column(prob_comp_het, phased_compound_heterozygote="phased_compound_heterozygote")) %>%
mutate(zygosity=coalesce(de_novo_gq,de_novo,homozygote,phased_compound_heterozygote,unphased_compound_heterozygote)) %>%
::select(-c(de_novo_gq,de_novo,homozygote,phased_compound_heterozygote,unphased_compound_heterozygote)) %>%
dplyr::relocate(zygosity,.after=counter)
dplyr<- refseq4 %>% group_by(Feature) %>%
refseq_sum summarize(z=paste(c(Uploaded_variation,Location,Allele,Consequence,Amino_acids,Codons), collapse = " ")) %>%
distinct(z,.keep_all = T)
<- refseq4 %>% dplyr::filter(Feature %in% refseq_sum$Feature) refseq5