5 Variant Annotation
5.1 Variant Effect Predictor
We annotate the variants with VEP for molecular consequence, allele frequency, clinical variant databases, i.e. Clinvar, Mastermind, and a set of pathogenicity prediction tools, i.e., CADD, PrimateAI, SpliceAI, EVE, MutationTaster, REVEL.
apptainer exec -B vep_data:/opt/vep/.vep vep.sif /opt/vep/src/ensembl-vep/vep \
--dir_cache /opt/vep/.vep/ \
--dir_plugins /opt/vep/.vep/Plugins/ \
-i /opt/vep/.vep/output_vqsr_snps_hard_indels.vcf.gz \
-o /opt/vep/.vep/ensembl_annotated.txt \
--individual EC27_proband \
--everything \
--check_existing \
--xref_refseq \
--fasta /opt/vep/.vep/Homo_sapiens.GRCh38.fa \
--hgvs \
--hgvsg \
--plugin CADD,/opt/vep/.vep/Plugins/data/grch38/whole_genome_SNVs_grch38.tsv.gz,/opt/vep/.vep/Plugins/data/grch38/gnomad.genomes.r3.0.indel_grch38.tsv.gz \
--plugin PrimateAI,/opt/vep/.vep/Plugins/data/grch38/PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz \
--plugin SpliceAI,snv=/opt/vep/.vep/Plugins/data/grch38/spliceai_scores.raw.snv.hg38.vcf.gz,indel=/opt/vep/.vep/Plugins/data/grch38/spliceai_scores.raw.indel.hg38.vcf.gz \
--plugin EVE,file=/opt/vep/.vep/Plugins/data/grch38/eve_merged.vcf.gz \
--plugin Mastermind,/opt/vep/.vep/Plugins/data/grch38/mastermind_cited_variants_reference-2023.04.02-grch38.vcf.gz \
--plugin dbNSFP,/opt/vep/.vep/Plugins/data/grch38/dbNSFP4.3a_grch38.gz,MutationTaster_score,MutationTaster_pred \
--plugin REVEL,/opt/vep/.vep/Plugins/data/grch38/new_tabbed_revel_grch38.tsv.gz \
--custom /opt/vep/.vep/Plugins/data/grch38/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN \
--cache \
--merged \
--offline \
--use_given_ref \
--force_overwrite \
--assembly GRCh38 \
--fork 2 \
--buffer_size 1000 \
--tab5.2 Filtration
We perform a first round of filtering with VEP to reduce the size of the VCF to be loaded into R. In particular, we select variants with high or moderate impact and a gnomAD exome allele frequency smaller than 0.05.
apptainer exec -B vep_data:/opt/vep/.vep vep.sif /opt/vep/src/ensembl-vep/filter_vep \
-i /opt/vep/.vep/ensembl_annotated.txt \
-out /opt/vep/.vep/ensembl_ann_vep_filt.txt \
--filter "(IMPACT is HIGH or IMPACT is MODERATE) and (gnomADe_AF <= 0.05 or not gnomADe_AF)" \
--force_overwrite \
--format tab We load the phased variants into R and select autosomal variants that have passed all variant calling filters.
library(tidyverse)
library(VariantAnnotation)
# Set sample names
mother = "HG004"
father = "HG003"
proband = "HG002"
# Load WhatsHap phased variants
wh <- readVcf("data/AJ_phased_whatshap_refined.vcf", "hg38")
wh <- expand(wh)
# Variant info
pos <- rowRanges(wh) %>% as.data.frame() %>% as_tibble()
# Genotype
wh_geno <- geno(wh)$GT %>% as_tibble() %>%
dplyr::rename("mother"= sym(mother), "father"= sym(father), "proband"= sym(proband))
# Phasing block
wh_ps <- geno(wh)$PS %>% as_tibble() %>%
dplyr::rename("PS_mother" = sym(mother), "PS_father"= sym(father), "PS_proband"= sym(proband))
# Genotype quality
wh_gq <- geno(wh)$GQ %>% as_tibble() %>%
dplyr::rename("GQ_mother" = sym(mother), "GQ_father"= sym(father), "GQ_proband"= sym(proband))
# Allelic depth
wh_ad <- geno(wh)$AD %>% as_tibble() %>%
dplyr::rename("AD_ref_mother" = paste0(sym(mother),".1"), "AD_ref_father"= paste0(sym(father),".1"), "AD_ref_proband"= paste0(sym(proband),".1")) %>%
dplyr::rename("AD_alt_mother" = paste0(sym(mother),".2"), "AD_alt_father"= paste0(sym(father),".2"), "AD_alt_proband"= paste0(sym(proband),".2"))
# Join variant info, genotype, phasing block, genotype quality and allelic depth
wh_geno0 <- bind_cols(pos,wh_geno,wh_gq,wh_ps,wh_ad)
# Identify alternate alleles with number
wh_geno01 <- wh_geno0 %>%
group_by(seqnames,start,end,width,REF) %>%
mutate(counter = row_number()) %>%
ungroup() %>%
dplyr::filter(FILTER=="PASS")
# Convert variant to VEP format. Add 1 to indel start site.
wh_geno01 <- wh_geno01 %>%
mutate(Location = ifelse(
width==1, ifelse(
nchar(ALT)==1, paste(seqnames,start,sep=":"), paste0(seqnames,":",start,"-",start+1)), ifelse(
start+1==end, paste0(seqnames,":",start+1), paste0(seqnames,":",start+1,"-",end)))) %>%
mutate(Allele = ifelse(substr(REF,1,1)==substr(ALT,1,1), substr(ALT,2,nchar(ALT)), ALT))
# Convert variant to gnomAD format.
trim <- function(a1,a2) {
b1 <- str_sub(a1,2,-1)
b2 <- str_sub(a2,2,-1)
repeat {
if (nchar(b1)==0 | nchar(b2)==0) break
if (str_sub(b1,-1,-1)!=str_sub(b2,-1,-1)) break
b1 <- str_sub(b1,1,-2)
b2 <- str_sub(b2,1,-2)
}
paste(str_sub(a1,1,1),b1,"-",str_sub(a2,1,1),b2,sep="")
}
wh_geno01 <- wh_geno01 %>% rowwise %>%
dplyr::mutate(gnomADid = paste(substr(seqnames,4,5),start,trim(REF,ALT),sep="-"))
wh_geno01$Allele[wh_geno01$Allele==""] <- "-"
# Overlapping deletions
wh_geno_overlap <- wh_geno01 %>% dplyr::filter(ALT=="")
wh_geno1 <- wh_geno01 %>% dplyr::filter(ALT!="")
# Select columns
wh_geno1 <- wh_geno1[c(27,28,11:25,26,29)]
# Select X chromosome
wh_geno_X <- wh_geno1 %>% dplyr::filter(str_detect(Location,"chrX"))
# Select autosomal variants
wh_geno_auto <- wh_geno1 %>%
dplyr::filter(!str_detect(Location,"chrX")) %>%
dplyr::filter(!str_detect(Location,"chrY")) %>%
dplyr::filter(!str_detect(Location,"M"))We load the VEP annotated variants into R and select variants with an allele frequency <0.01 that fall in RefSeq transcripts.
result <- read_table("data/AJ_ensembl_ann_vep_filt_09_01_23.txt", comment = "##", na="-", col_types = list(
`#Uploaded_variation`=col_factor(), Gene=col_factor(), Feature=col_factor(), MANE_PLUS_CLINICAL = col_character(),
MOTIF_NAME = col_character(), HIGH_INF_POS = col_character(), TRANSCRIPTION_FACTORS = col_character(),
PUBMED=col_character(), MOTIF_POS = col_character())) %>%
dplyr::rename(Uploaded_variation=`#Uploaded_variation`)
result$Allele[is.na(result$Allele)] <- "-"
result1 <- result %>% as_tibble %>% left_join(wh_geno1, by = c("Location","Allele"))
refseq0 <- result1 %>%
dplyr::filter(SOURCE == "RefSeq") %>%
dplyr::filter(gnomADe_AF<=0.01|is.na(gnomADe_AF)) %>%
dplyr::filter(gnomADg_AF<=0.01|is.na(gnomADg_AF)) %>%
dplyr::filter(!(str_detect(Feature,"XM"))) %>%
select_if(~any(!is.na(.)))We add gene-level information from OMIM, Human Phenotype Ontology (HPO), ClinGen, GenCC, Orphanet. We select HPO terms for each gene that are associated with a particular phenotype, in this case, kidney disease.
library(ontologyIndex)
# OMIM
omim <- read_tsv("data/genemap2_edit.txt", name_repair="universal", col_types=list(Entrez.Gene.ID = col_factor())) %>%
dplyr::rename(OMIM_Phenotypes = Phenotypes) %>%
dplyr::select(Entrez.Gene.ID,OMIM_Phenotypes)
# Human Phenotype Ontology
hpo_data <- read_tsv("data/genes_to_phenotype.txt", col_types=list(entrez_gene_id = col_factor()))
hpo <- get_ontology(file = 'data/hp_obo.txt', propagate_relationships = "is_a", extract_tags = "minimal")
hpo_kidney <- get_descendants(hpo, roots="HP:0000077") #Abnormality of the kidney
gen_hpo_kidney <- hpo_data %>%
dplyr::filter(hpo_term_id %in% hpo_kidney) %>%
dplyr::arrange(frequency_hpo) %>%
dplyr::mutate(frequency = case_when(
frequency_hpo == "HP:0040284" ~ " (very rare)", frequency_hpo == "HP:0040283" ~ " (occasional)",
frequency_hpo == "HP:0040282" ~ " (frequent)", frequency_hpo == "HP:0040281" ~ " (very frequent)",
frequency_hpo == "HP:0040280" ~ " (obligate)", TRUE ~ "")) %>%
unite("renal_hpo_term", c(hpo_term_name, frequency), remove = FALSE, sep = "") %>%
dplyr::group_by(entrez_gene_id) %>%
dplyr::summarise(renal_hpo_term = toString(unique(renal_hpo_term)))
gen_hpo_extra <- hpo_data %>%
dplyr::filter(frequency_hpo %in% c("HP:0040281", "HP:0040280")) %>%
dplyr::arrange(frequency_hpo) %>%
dplyr::filter(!(hpo_term_id %in% hpo_kidney)) %>%
dplyr::group_by(entrez_gene_id) %>%
dplyr::summarise(extrarenal_hpo_term = toString(unique(hpo_term_name)))
# ClinGen
clingen <- read_csv("data/Clingen-Curation.csv", skip = 3) %>%
dplyr::mutate(assertion=gsub("\\s*\\([^\\)]+\\)","", gene_disease_validity_assertion_classifications)) %>%
dplyr::mutate(assertion=paste("(",assertion,")",sep="")) %>%
dplyr::mutate(assertion=if_else(assertion=="(NA)",NA,assertion)) %>%
unite("disease", c(disease_label, assertion), remove = FALSE, na.rm=TRUE, sep = " ") %>%
dplyr::group_by(hgnc_id) %>%
dplyr::summarise(clingen_disease_label = paste(disease,collapse="; "))
# GenCC including Orphanet data
gencc <- read_tsv("data/gencc-submissions.tsv")
gencc_def <- gencc %>%
dplyr::filter(submitter_title != "Orphanet") %>%
dplyr::filter(classification_title == "Definitive") %>%
dplyr::group_by(gene_symbol, gene_curie, disease_title) %>%
dplyr::summarise(
submitter_title = toString(unique(submitter_title)), moi_title = toString(unique(moi_title)),
disease_curie = toString(unique(disease_curie)), disease_original_curie = toString(unique(disease_original_curie)),
classification_title = toString(classification_title), submitted_as_pmids = toString(unique(submitted_as_pmids))) %>%
dplyr::mutate(submitter = paste("(", submitter_title, ")", sep = "")) %>%
tidyr::unite("gencc_disease_definitive", c(disease_title, submitter), remove = FALSE, sep = " ") %>%
dplyr::select(gene_curie, gencc_disease_definitive) %>%
dplyr::group_by(gene_curie) %>%
dplyr::summarise(gencc_disease_definitive = paste(unique(gencc_disease_definitive), collapse="; "))
gencc_strong <- gencc %>%
dplyr::filter(submitter_title != "Orphanet") %>%
dplyr::filter(classification_title == "Strong") %>%
dplyr::group_by(gene_symbol, gene_curie, disease_title) %>%
dplyr::summarise(
submitter_title = toString(unique(submitter_title)), moi_title = toString(unique(moi_title)),
disease_curie = toString(unique(disease_curie)), disease_original_curie = toString(unique(disease_original_curie)),
classification_title = toString(classification_title), submitted_as_pmids = toString(unique(submitted_as_pmids))) %>%
dplyr::mutate(submitter = paste("(", submitter_title, ")", sep = "")) %>%
tidyr::unite("gencc_disease_strong", c(disease_title, submitter), remove = FALSE, sep = " ") %>%
dplyr::select(gene_curie, gencc_disease_strong) %>%
dplyr::group_by(gene_curie) %>%
dplyr::summarise(gencc_disease_strong = paste(unique(gencc_disease_strong), collapse="; "))
orphanet <- gencc %>%
dplyr::filter(submitter_title == "Orphanet") %>%
dplyr::group_by(gene_curie) %>%
dplyr::summarise(orphanet_disease = toString(disease_title), orphanet_moi = toString(unique(moi_title)))
# Join
refseq01 <- refseq0 %>%
left_join(omim, by=join_by(Gene==Entrez.Gene.ID)) %>%
left_join(gen_hpo_kidney, by=join_by(Gene==entrez_gene_id)) %>%
left_join(gen_hpo_extra, by=join_by(Gene==entrez_gene_id)) %>%
left_join(clingen, by=join_by(HGNC_ID==hgnc_id)) %>%
left_join(gencc_def, by=join_by(HGNC_ID==gene_curie)) %>%
left_join(gencc_strong, by=join_by(HGNC_ID==gene_curie)) %>%
left_join(orphanet, by=join_by(HGNC_ID==gene_curie)) %>%
dplyr::select(SYMBOL, Uploaded_variation, Location, Feature, HGVSg, HGVSc, HGVSp, Consequence, ZYG, EXON, INTRON, gnomADe_AF, gnomADg_AF, MAX_AF, MAX_AF_POPS, ClinVar, ClinVar_CLNSIG, OMIM_Phenotypes, renal_hpo_term, extrarenal_hpo_term, REVEL, SIFT, PolyPhen, CADD_PHRED, PrimateAI, SpliceAI_pred, EVE_CLASS, EVE_SCORE, MutationTaster_pred, MutationTaster_score, orphanet_disease, orphanet_moi, Mastermind_MMID3, Existing_variation, PUBMED, mother, father, proband, Allele, counter, AD_ref_mother, AD_ref_father, AD_ref_proband, AD_alt_mother, AD_alt_father, AD_alt_proband, GQ_mother, GQ_father, GQ_proband, PS_mother, PS_father, PS_proband, CANONICAL, MANE_SELECT, MANE_PLUS_CLINICAL, IMPACT, Gene, HGNC_ID, Amino_acids, Codons, AF, AFR_AF, AMR_AF, EAS_AF, EUR_AF, SAS_AF, gnomADe_AFR_AF, gnomADe_AMR_AF, gnomADe_ASJ_AF, gnomADe_EAS_AF, gnomADe_FIN_AF, gnomADe_NFE_AF, gnomADe_OTH_AF, gnomADe_SAS_AF, gnomADg_AFR_AF, gnomADg_AMI_AF, gnomADg_AMR_AF, gnomADg_ASJ_AF, gnomADg_EAS_AF, gnomADg_FIN_AF, gnomADg_MID_AF, gnomADg_NFE_AF, gnomADg_OTH_AF, gnomADg_SAS_AF, VARIANT_CLASS, gnomADid)
# X chromosome
refseq_X <- refseq01 %>% dplyr::filter(str_detect(Uploaded_variation,"X"))
# Autosomal chromosomes
refseq <- refseq01 %>%
dplyr::filter(!str_detect(Uploaded_variation,"X")) %>%
dplyr::filter(!str_detect(Uploaded_variation,"Y")) %>%
dplyr::filter(!str_detect(Uploaded_variation,"M"))5.3 De Novo Variants
We select de novo variants in the proband.
geno_novo_all <- wh_geno_auto %>%
dplyr::filter(! ( ( ( str_detect(mother, substr(proband,1,1)) | str_detect(mother, "\\.") ) &
(str_detect(father, substr(proband,3,3)) | str_detect(father, "\\.") ) ) |
( ( str_detect(father, substr(proband,1,1)) | str_detect(father, "\\.") ) &
(str_detect(mother, substr(proband,3,3)) | str_detect(mother, "\\.") ) ) ) ) %>%
dplyr::filter(!(str_detect(mother,as.character(counter)) | str_detect(father,as.character(counter))) |
(str_detect(mother, substr(proband,1,1)) & str_detect(mother, substr(proband,3,3))) |
(str_detect(father, substr(proband,1,1)) & str_detect(father, substr(proband,3,3)))
)We discard de novo mutations back to the reference allele (e.g. 1/1, 1/1 –> 0/1) since they are unlikely to be pathogenic.
geno_novo_not_ref <- wh_geno_auto %>%
dplyr::filter(! ( ( ( str_detect(mother, substr(proband,1,1)) | str_detect(mother, "\\.") | substr(proband,1,1)==0 ) &
(str_detect(father, substr(proband,3,3)) | str_detect(father, "\\.") | substr(proband,3,3)==0 ) ) |
( ( str_detect(father, substr(proband,1,1)) | str_detect(father, "\\.") | substr(proband,1,1)==0 ) &
(str_detect(mother, substr(proband,3,3)) | str_detect(mother, "\\.") | substr(proband,3,3)==0 ) ) ) ) %>%
dplyr::filter(!(str_detect(mother,as.character(counter)) | str_detect(father,as.character(counter))) |
(str_detect(mother, substr(proband,1,1)) & str_detect(mother, substr(proband,3,3))) |
(str_detect(father, substr(proband,1,1)) & str_detect(father, substr(proband,3,3)))
)We increase the cutoff for genotype quality to ensure that the the de novo variants are not a result of incorrect variant calling. Set GQ>=30, i.e, the probability that the call is incorrect is =<0.1%.
geno_novo_not_ref_high_qual <- geno_novo_not_ref %>% dplyr::filter(GQ_mother>=30) %>% dplyr::filter(GQ_father>=30) %>% dplyr::filter(GQ_proband>=30)We select de novo variants found in the filtered VEP annotation. For each variant, we chose a single gene transcript that contains the variant, prioritizing canonical and curated transcripts.
prob_novo <- refseq %>% inner_join(geno_novo_not_ref)
prob_novo <- prob_novo %>% group_by(Uploaded_variation) %>%
arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
dplyr::filter(Feature == Feature[1])
prob_novo_high_qual <- refseq %>% inner_join(geno_novo_not_ref_high_qual)
prob_novo_high_qual <- prob_novo_high_qual %>% group_by(Uploaded_variation) %>%
arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
dplyr::filter(Feature == Feature[1])5.4 Homozygous Variants
We select the homozygous variants in the proband.
geno_hom <- wh_geno_auto %>%
dplyr::filter(substr(proband,1,1)==substr(proband,3,3) & substr(proband,1,1) != 0 & substr(proband,1,1) != ".")We discard the homozygous variants for which one of the parents is also homozygous (e.g. 1/1, 0/1 -> 1/1) since these variants are unlikely to be pathogenic.
geno_hom2 <- geno_hom %>%
dplyr::filter(! ( (substr(mother,1,1)==substr(mother,3,3) & substr(mother,1,1)==substr(proband,1,1)) |
(substr(father,1,1)==substr(father,3,3) & substr(father,1,1)==substr(proband,1,1)) )
)We select the homozygous variants found in the filtered VEP annotation.
prob_hom <- refseq %>% inner_join(geno_hom2)
prob_hom <- prob_hom %>% group_by(Uploaded_variation) %>%
arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
dplyr::filter(Feature == Feature[1])For each of these variants, we query the gnomAD database for the number of individuals that are homozygous for that variant.
library(ghql)
library(jsonlite)
token <- Sys.getenv("GNOMAD_TOKEN")
con <- GraphqlClient$new(
url = "https://gnomad.broadinstitute.org/api",
headers = list(Authorization = paste0("Bearer ", token))
)
qry <- Query$new()
qry$query('homozygous',
'query Variant($var: String!){
variant(variantId: $var, dataset: gnomad_r4) {
exome {
homozygote_count
}
genome {
homozygote_count
}
}
}')
gnomADid <- prob_hom$gnomADid
s <- tibble(gnomADid=NA, gnomADe_hom=NA, gnomADg_hom=NA)
for (i in gnomADid) {
variables <- list(var = i)
tries <- 1
repeat {
x <- try(con$exec(qry$queries$homozygous, variables), silent = TRUE)
if (!is(x, 'try-error')) {
x1 <- jsonlite::fromJSON(x)$data$variant$exome$homozygote_count
x2 <- jsonlite::fromJSON(x)$data$variant$genome$homozygote_count
message(paste(tries,"success", collapse=" "))
break
}
Sys.sleep(2*tries)
message(tries)
tries <- tries + 1
}
if (is.null(x1)) x1 <- NA
if (is.null(x2)) x2 <- NA
s <- s %>% add_row(gnomADid=variables$var,gnomADe_hom=x1, gnomADg_hom=x2)
}
prob_hom <- left_join(prob_hom, s) %>%
dplyr::relocate(c(gnomADe_hom,gnomADg_hom),.after=MAX_AF_POPS)5.5 Compound Heterozygous Variants
We select the unphased compound heterozygous variants.
prob_2plus_het <- refseq %>% dplyr::filter(substr(proband,1,1)!=substr(proband,3,3))
prob_2plus_het <- prob_2plus_het %>% group_by(Feature) %>%
dplyr::filter(length(unique(PS_proband))>=2 | length(is.na(PS_proband)[is.na(PS_proband)==TRUE])>=2 ) %>%
ungroup()
prob_2plus_het2 <- prob_2plus_het %>% group_by(Gene) %>%
arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
dplyr::filter(Feature == Feature[1])We select the phased compound heterozygous variants.
prob_comp_het <- refseq %>% group_by(Feature, PS_proband) %>%
dplyr::filter(any(substr(proband,1,1)>=1 & substr(proband,2,2)=="|") & any(substr(proband,3,3)>=1 & substr(proband,2,2)=="|"))
prob_comp_het2 <- prob_comp_het %>% group_by(Gene) %>%
arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
dplyr::filter(Feature == Feature[1])For two variants within a gene that cannot be phased with trio data, we can estimate phase using EM-based haplotype estimation from the gnomAD database.
# Obtain the GRCh37 variant ID to query gnomAD
grch38id <- prob_2plus_het2$gnomADid
qry$query('liftover',
'query liftOver($varID: String!){
liftover(liftover_variant_id: $varID, reference_genome:GRCh38) {
source {
variant_id
}
}
}')
grch37id <- c()
for (i in grch38id) {
variables <- list(varID = i)
tries <- 1
repeat {
x <- try(con$exec(qry$queries$liftover, variables), silent = TRUE)
if (!is(x, 'try-error')) {
x <- jsonlite::fromJSON(x)$data$liftover$source$variant_id
message(paste(tries,"success", collapse=" "))
break
}
Sys.sleep(2*tries)
message(tries)
tries <- tries + 1
}
if (is.null(x)) x <- NA
grch37id <- append(grch37id, x)
}
liftover <- tibble(gnomADid=grch38id, grch37id)
prob_2plus_het2 <- left_join(prob_2plus_het2, liftover)
# Obtain probability that two variants within a gene are compound heterozygous and phase variants using cutoff of <0.2 for cis and >0.55 for trans
qry$query('coocurrence',
'query Coocurrence($var1: String!, $var2: String!){
variant_cooccurrence(dataset: gnomad_r2_1, variants: [$var1, $var2]) {
p_compound_heterozygous
}
}'
)
prob_2plus_het_gnom <- prob_2plus_het2 %>%
dplyr::filter(!is.na(grch37id)) %>%
group_by(SYMBOL) %>%
dplyr::filter(n()!=1)
s <- tibble(grch37id=NA, gnomAD_A=NA, gnomAD_B=NA, gnomAD_PS=NA)
r <- tibble(grch37id=NA, gnomAD_cis=NA, gnomAD_trans=NA)
for (a in unique(prob_2plus_het_gnom$SYMBOL)) {
var <- prob_2plus_het_gnom[prob_2plus_het_gnom$SYMBOL==a,]$grch37id
sub <- tibble(grch37id=var, gnomAD_A=NA, gnomAD_B=NA, gnomAD_PS=NA)
sub$gnomAD_A[1] <- 1
sub$gnomAD_PS[1] <- 1
for (i in 1:(length(var)-1)) {
c <- c()
t <- c()
for (j in (i+1):length(var)) {
variables <- list(var1 = var[i], var2 = var[j])
tries <- 1
repeat {
y <- try(con$exec(qry$queries$coocurrence, variables), silent = TRUE)
if (!is(y, 'try-error')) {
y <- jsonlite::fromJSON(y)$data$variant_cooccurrence$p_compound_heterozygous
message(paste(tries,"success", collapse=" "))
break
}
Sys.sleep(2*tries)
message(tries)
tries <- tries + 1
}
if (is.null(y)) y <- "NA"
if (y<0.2) c <- append(c, paste0(variables$var2," (",round(y,digits=2),")"))
if (y>0.55 & y!="NA") t <- append(t, paste0(variables$var2," (",round(y,digits=2),")"))
if (!is.na(sub[[i,"gnomAD_A"]]) & is.na(sub[[i,"gnomAD_B"]])) {
if (y<0.2) {
sub$gnomAD_A[j] <- i
sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
}
if (y>0.55 & y!="NA") {
sub$gnomAD_B[j] <- i
sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
}
}
if (!is.na(sub[[i,"gnomAD_B"]]) & is.na(sub[[i,"gnomAD_A"]])) {
if (y<0.2) {
sub$gnomAD_B[j] <- i
sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
}
if (y>0.55 & y!="NA") {
sub$gnomAD_A[j] <- i
sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
}
}
if (is.na(sub[[i,"gnomAD_A"]]) & is.na(sub[[i,"gnomAD_B"]])) {
sub$gnomAD_A[i] <- i
sub$gnomAD_PS[i] <- i
if (y<0.2) {
sub$gnomAD_A[j] <- i
sub$gnomAD_PS[j] <- i
}
if (y>0.55 & y!="NA") {
sub$gnomAD_B[j] <- i
sub$gnomAD_PS[j] <- i
}
}
}
r <- r %>% add_row(grch37id=var[i], gnomAD_cis=str_flatten_comma(c), gnomAD_trans=str_flatten_comma(t))
}
s <- rbind(s,sub)
}
prob_2plus_het2 <- prob_2plus_het2 %>% left_join(r) %>% left_join(s) %>%
dplyr::relocate(c(gnomAD_A,gnomAD_B,gnomAD_PS), .after=PS_proband)5.6 X Chromosome
We remove X chromosome calls that are “heterozygous” for male individuals, i.e., proband and father. However, some of these calls may be correct if they lie in the pseudoautosomal region.
wh_geno_X_filt <- wh_geno_X %>%
dplyr::filter(( substr(proband,1,1)==substr(proband,3,3) & substr(father,1,1)==substr(father,3,3)))
prob_X <- inner_join(refseq_X, wh_geno_X_filt) %>% group_by(Gene) %>%
arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
dplyr::filter(Feature == Feature[1])5.7 Disease Genes
We select one entry for each variant in the refseq table, prioritizing canonical and curated transcripts.
refseq2 <- refseq01 %>% group_by(Uploaded_variation) %>%
arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>%
dplyr::filter(Feature == Feature[1])As an example, we select variants that fall in kidney disease genes, i.e., green genes from the UK PanelApp renal broad superpanel and all genes from the Australia PanelApp kidneyome superpanel.
# UK Panel App
panel <- read_tsv("data/renal_superpanel_broad.tsv")
panel_green <- panel %>% dplyr::filter(GEL_Status==3)
refseq_green <- refseq2 %>% dplyr::filter(HGNC_ID %in% panel_green$HGNC)
#Australia Panel App
panel_au <- read_tsv("data/Kidneyome_SuperPanel.tsv") %>%
dplyr::filter(!duplicated(HGNC)) %>%
dplyr::mutate(Status=case_when(
GEL_Status==4|GEL_Status==3 ~ "Green",
GEL_Status==2 ~ "Amber",
GEL_Status==1|GEL_Status==0 ~ "Red")) %>%
dplyr::select(HGNC,Status)
refseq_au <- refseq2 %>% inner_join(panel_au, join_by(HGNC_ID==HGNC))We combine annotations into a single table.
refseq3 <- refseq01 %>% group_by(Gene) %>%
arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>% ungroup()
refseq4 <- refseq3 %>%
left_join(add_column(geno_novo_not_ref,de_novo="de_novo")) %>%
left_join(add_column(geno_novo_not_ref_high_qual,de_novo_gq="de_novo_gq>=30")) %>%
left_join(add_column(geno_hom2, homozygote="homozygote")) %>%
left_join(add_column(prob_2plus_het, unphased_compound_heterozygote="unphased_compound_heterozygote")) %>%
left_join(add_column(prob_comp_het, phased_compound_heterozygote="phased_compound_heterozygote")) %>%
mutate(zygosity=coalesce(de_novo_gq,de_novo,homozygote,phased_compound_heterozygote,unphased_compound_heterozygote)) %>%
dplyr::select(-c(de_novo_gq,de_novo,homozygote,phased_compound_heterozygote,unphased_compound_heterozygote)) %>%
dplyr::relocate(zygosity,.after=counter)
refseq_sum <- refseq4 %>% group_by(Feature) %>%
summarize(z=paste(c(Uploaded_variation,Location,Allele,Consequence,Amino_acids,Codons), collapse = " ")) %>%
distinct(z,.keep_all = T)
refseq5 <- refseq4 %>% dplyr::filter(Feature %in% refseq_sum$Feature)