5 Variant Annotation

5.1 Variant Effect Predictor

We annotate the variants with VEP for molecular consequence, allele frequency, clinical variant databases, i.e. Clinvar, Mastermind, and a set of pathogenicity prediction tools, i.e., CADD, PrimateAI, SpliceAI, EVE, MutationTaster, REVEL.

apptainer exec -B vep_data:/opt/vep/.vep vep.sif /opt/vep/src/ensembl-vep/vep \
--dir_cache /opt/vep/.vep/ \
--dir_plugins /opt/vep/.vep/Plugins/ \
-i /opt/vep/.vep/output_vqsr_snps_hard_indels.vcf.gz \
-o /opt/vep/.vep/ensembl_annotated.txt \
--individual EC27_proband \
--everything \
--check_existing \
--xref_refseq \
--fasta /opt/vep/.vep/Homo_sapiens.GRCh38.fa \
--hgvs \
--hgvsg \
--plugin CADD,/opt/vep/.vep/Plugins/data/grch38/whole_genome_SNVs_grch38.tsv.gz,/opt/vep/.vep/Plugins/data/grch38/gnomad.genomes.r3.0.indel_grch38.tsv.gz \
--plugin PrimateAI,/opt/vep/.vep/Plugins/data/grch38/PrimateAI_scores_v0.2_GRCh38_sorted.tsv.bgz \
--plugin SpliceAI,snv=/opt/vep/.vep/Plugins/data/grch38/spliceai_scores.raw.snv.hg38.vcf.gz,indel=/opt/vep/.vep/Plugins/data/grch38/spliceai_scores.raw.indel.hg38.vcf.gz \
--plugin EVE,file=/opt/vep/.vep/Plugins/data/grch38/eve_merged.vcf.gz \
--plugin Mastermind,/opt/vep/.vep/Plugins/data/grch38/mastermind_cited_variants_reference-2023.04.02-grch38.vcf.gz \
--plugin dbNSFP,/opt/vep/.vep/Plugins/data/grch38/dbNSFP4.3a_grch38.gz,MutationTaster_score,MutationTaster_pred \
--plugin REVEL,/opt/vep/.vep/Plugins/data/grch38/new_tabbed_revel_grch38.tsv.gz \
--custom /opt/vep/.vep/Plugins/data/grch38/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN \
--cache \
--merged \
--offline \
--use_given_ref \
--force_overwrite \
--assembly GRCh38 \
--fork 2 \
--buffer_size 1000 \
--tab

5.2 Filtration

We perform a first round of filtering with VEP to reduce the size of the VCF to be loaded into R. In particular, we select variants with high or moderate impact and a gnomAD exome allele frequency smaller than 0.05.

apptainer exec -B vep_data:/opt/vep/.vep vep.sif /opt/vep/src/ensembl-vep/filter_vep \
-i /opt/vep/.vep/ensembl_annotated.txt \
-out /opt/vep/.vep/ensembl_ann_vep_filt.txt \
--filter "(IMPACT is HIGH or IMPACT is MODERATE) and (gnomADe_AF <= 0.05 or not gnomADe_AF)" \
--force_overwrite \
--format tab 

We load the phased variants into R and select autosomal variants that have passed all variant calling filters.

library(tidyverse)
library(VariantAnnotation)
# Set sample names
mother = "HG004"
father = "HG003"
proband = "HG002"
# Load WhatsHap phased variants
wh <- readVcf("data/AJ_phased_whatshap_refined.vcf", "hg38")
wh <- expand(wh)
# Variant info
pos <- rowRanges(wh) %>% as.data.frame() %>% as_tibble()
# Genotype
wh_geno <- geno(wh)$GT %>% as_tibble() %>% 
  dplyr::rename("mother"= sym(mother), "father"= sym(father), "proband"= sym(proband))
# Phasing block
wh_ps <- geno(wh)$PS %>% as_tibble() %>% 
  dplyr::rename("PS_mother" = sym(mother), "PS_father"= sym(father), "PS_proband"= sym(proband))
# Genotype quality
wh_gq <- geno(wh)$GQ %>% as_tibble() %>% 
  dplyr::rename("GQ_mother" = sym(mother), "GQ_father"= sym(father), "GQ_proband"= sym(proband))
# Allelic depth
wh_ad <- geno(wh)$AD %>% as_tibble() %>% 
  dplyr::rename("AD_ref_mother" = paste0(sym(mother),".1"), "AD_ref_father"= paste0(sym(father),".1"), "AD_ref_proband"= paste0(sym(proband),".1")) %>%
  dplyr::rename("AD_alt_mother" = paste0(sym(mother),".2"), "AD_alt_father"= paste0(sym(father),".2"), "AD_alt_proband"= paste0(sym(proband),".2"))
# Join variant info, genotype, phasing block, genotype quality and allelic depth
wh_geno0 <- bind_cols(pos,wh_geno,wh_gq,wh_ps,wh_ad)
# Identify alternate alleles with number
wh_geno01 <- wh_geno0 %>% 
  group_by(seqnames,start,end,width,REF) %>% 
  mutate(counter = row_number()) %>%
  ungroup() %>%
  dplyr::filter(FILTER=="PASS")
# Convert variant to VEP format. Add 1 to indel start site.
wh_geno01 <- wh_geno01 %>% 
  mutate(Location = ifelse(
    width==1, ifelse(
      nchar(ALT)==1, paste(seqnames,start,sep=":"), paste0(seqnames,":",start,"-",start+1)), ifelse(
        start+1==end, paste0(seqnames,":",start+1), paste0(seqnames,":",start+1,"-",end)))) %>% 
  mutate(Allele = ifelse(substr(REF,1,1)==substr(ALT,1,1), substr(ALT,2,nchar(ALT)), ALT))
# Convert variant to gnomAD format.
trim <- function(a1,a2) {
  b1 <- str_sub(a1,2,-1)
  b2 <- str_sub(a2,2,-1)
  repeat {
    if (nchar(b1)==0 | nchar(b2)==0) break
    if (str_sub(b1,-1,-1)!=str_sub(b2,-1,-1)) break
    b1 <- str_sub(b1,1,-2)
    b2 <- str_sub(b2,1,-2)
  }
  paste(str_sub(a1,1,1),b1,"-",str_sub(a2,1,1),b2,sep="")
}
wh_geno01 <- wh_geno01 %>% rowwise %>% 
  dplyr::mutate(gnomADid = paste(substr(seqnames,4,5),start,trim(REF,ALT),sep="-"))
wh_geno01$Allele[wh_geno01$Allele==""] <- "-"
# Overlapping deletions
wh_geno_overlap <- wh_geno01 %>% dplyr::filter(ALT=="")
wh_geno1 <- wh_geno01 %>% dplyr::filter(ALT!="")
# Select columns
wh_geno1 <- wh_geno1[c(27,28,11:25,26,29)]
# Select X chromosome
wh_geno_X <- wh_geno1 %>% dplyr::filter(str_detect(Location,"chrX"))
# Select autosomal variants
wh_geno_auto <- wh_geno1 %>%
  dplyr::filter(!str_detect(Location,"chrX")) %>% 
  dplyr::filter(!str_detect(Location,"chrY")) %>%
  dplyr::filter(!str_detect(Location,"M"))

We load the VEP annotated variants into R and select variants with an allele frequency <0.01 that fall in RefSeq transcripts.

result <- read_table("data/AJ_ensembl_ann_vep_filt_09_01_23.txt", comment = "##", na="-", col_types = list(
  `#Uploaded_variation`=col_factor(), Gene=col_factor(), Feature=col_factor(), MANE_PLUS_CLINICAL = col_character(), 
  MOTIF_NAME = col_character(), HIGH_INF_POS = col_character(), TRANSCRIPTION_FACTORS = col_character(), 
  PUBMED=col_character(), MOTIF_POS = col_character())) %>%
  dplyr::rename(Uploaded_variation=`#Uploaded_variation`)
result$Allele[is.na(result$Allele)] <- "-"
result1 <- result %>% as_tibble %>% left_join(wh_geno1, by = c("Location","Allele"))
refseq0 <- result1 %>% 
  dplyr::filter(SOURCE == "RefSeq") %>% 
  dplyr::filter(gnomADe_AF<=0.01|is.na(gnomADe_AF)) %>% 
  dplyr::filter(gnomADg_AF<=0.01|is.na(gnomADg_AF)) %>%
  dplyr::filter(!(str_detect(Feature,"XM"))) %>%
  select_if(~any(!is.na(.)))

We add gene-level information from OMIM, Human Phenotype Ontology (HPO), ClinGen, GenCC, Orphanet. We select HPO terms for each gene that are associated with a particular phenotype, in this case, kidney disease.

library(ontologyIndex)
# OMIM
omim <- read_tsv("data/genemap2_edit.txt", name_repair="universal", col_types=list(Entrez.Gene.ID = col_factor())) %>% 
  dplyr::rename(OMIM_Phenotypes = Phenotypes) %>% 
  dplyr::select(Entrez.Gene.ID,OMIM_Phenotypes)
# Human Phenotype Ontology
hpo_data <- read_tsv("data/genes_to_phenotype.txt", col_types=list(entrez_gene_id = col_factor()))
hpo <- get_ontology(file = 'data/hp_obo.txt', propagate_relationships = "is_a", extract_tags = "minimal")
hpo_kidney <- get_descendants(hpo, roots="HP:0000077") #Abnormality of the kidney
gen_hpo_kidney <- hpo_data %>% 
  dplyr::filter(hpo_term_id %in% hpo_kidney) %>% 
  dplyr::arrange(frequency_hpo) %>% 
  dplyr::mutate(frequency = case_when(
    frequency_hpo == "HP:0040284" ~ " (very rare)", frequency_hpo == "HP:0040283" ~ " (occasional)", 
    frequency_hpo == "HP:0040282" ~ " (frequent)", frequency_hpo == "HP:0040281" ~ " (very frequent)", 
    frequency_hpo == "HP:0040280" ~ " (obligate)", TRUE ~ "")) %>% 
  unite("renal_hpo_term", c(hpo_term_name, frequency), remove = FALSE, sep = "") %>% 
  dplyr::group_by(entrez_gene_id) %>% 
  dplyr::summarise(renal_hpo_term = toString(unique(renal_hpo_term)))
gen_hpo_extra <- hpo_data %>% 
  dplyr::filter(frequency_hpo %in% c("HP:0040281", "HP:0040280")) %>%
  dplyr::arrange(frequency_hpo) %>%
  dplyr::filter(!(hpo_term_id %in% hpo_kidney)) %>% 
  dplyr::group_by(entrez_gene_id) %>% 
  dplyr::summarise(extrarenal_hpo_term = toString(unique(hpo_term_name)))
# ClinGen 
clingen <- read_csv("data/Clingen-Curation.csv", skip = 3) %>% 
  dplyr::mutate(assertion=gsub("\\s*\\([^\\)]+\\)","", gene_disease_validity_assertion_classifications)) %>%
  dplyr::mutate(assertion=paste("(",assertion,")",sep="")) %>%
  dplyr::mutate(assertion=if_else(assertion=="(NA)",NA,assertion)) %>%
  unite("disease", c(disease_label, assertion), remove = FALSE, na.rm=TRUE, sep = " ") %>%
  dplyr::group_by(hgnc_id) %>% 
  dplyr::summarise(clingen_disease_label = paste(disease,collapse="; "))
# GenCC including Orphanet data
gencc <- read_tsv("data/gencc-submissions.tsv")
gencc_def <- gencc %>% 
  dplyr::filter(submitter_title != "Orphanet") %>% 
  dplyr::filter(classification_title == "Definitive") %>% 
  dplyr::group_by(gene_symbol, gene_curie, disease_title) %>% 
  dplyr::summarise(
    submitter_title = toString(unique(submitter_title)), moi_title = toString(unique(moi_title)), 
    disease_curie = toString(unique(disease_curie)), disease_original_curie = toString(unique(disease_original_curie)), 
    classification_title = toString(classification_title), submitted_as_pmids = toString(unique(submitted_as_pmids))) %>%
  dplyr::mutate(submitter = paste("(", submitter_title, ")", sep = "")) %>% 
  tidyr::unite("gencc_disease_definitive", c(disease_title, submitter), remove = FALSE, sep = " ") %>% 
  dplyr::select(gene_curie, gencc_disease_definitive) %>% 
  dplyr::group_by(gene_curie) %>% 
  dplyr::summarise(gencc_disease_definitive = paste(unique(gencc_disease_definitive), collapse="; "))
gencc_strong <- gencc %>% 
  dplyr::filter(submitter_title != "Orphanet") %>% 
  dplyr::filter(classification_title == "Strong") %>% 
  dplyr::group_by(gene_symbol, gene_curie, disease_title) %>% 
  dplyr::summarise(
    submitter_title = toString(unique(submitter_title)), moi_title = toString(unique(moi_title)), 
    disease_curie = toString(unique(disease_curie)), disease_original_curie = toString(unique(disease_original_curie)), 
    classification_title = toString(classification_title), submitted_as_pmids = toString(unique(submitted_as_pmids))) %>%
  dplyr::mutate(submitter = paste("(", submitter_title, ")", sep = "")) %>% 
  tidyr::unite("gencc_disease_strong", c(disease_title, submitter), remove = FALSE, sep = " ") %>% 
  dplyr::select(gene_curie, gencc_disease_strong) %>% 
  dplyr::group_by(gene_curie) %>% 
  dplyr::summarise(gencc_disease_strong = paste(unique(gencc_disease_strong), collapse="; "))
orphanet <- gencc %>% 
  dplyr::filter(submitter_title == "Orphanet") %>% 
  dplyr::group_by(gene_curie) %>% 
  dplyr::summarise(orphanet_disease = toString(disease_title), orphanet_moi = toString(unique(moi_title)))
# Join
refseq01 <- refseq0 %>% 
  left_join(omim, by=join_by(Gene==Entrez.Gene.ID)) %>%
  left_join(gen_hpo_kidney, by=join_by(Gene==entrez_gene_id)) %>%
  left_join(gen_hpo_extra, by=join_by(Gene==entrez_gene_id)) %>%
  left_join(clingen, by=join_by(HGNC_ID==hgnc_id)) %>%
  left_join(gencc_def, by=join_by(HGNC_ID==gene_curie)) %>%
  left_join(gencc_strong, by=join_by(HGNC_ID==gene_curie)) %>%
  left_join(orphanet, by=join_by(HGNC_ID==gene_curie)) %>%
  dplyr::select(SYMBOL, Uploaded_variation, Location, Feature, HGVSg, HGVSc,    HGVSp, Consequence, ZYG, EXON,  INTRON, gnomADe_AF, gnomADg_AF, MAX_AF, MAX_AF_POPS,    ClinVar,    ClinVar_CLNSIG, OMIM_Phenotypes, renal_hpo_term, extrarenal_hpo_term,   REVEL,  SIFT,   PolyPhen,   CADD_PHRED, PrimateAI,  SpliceAI_pred,  EVE_CLASS,  EVE_SCORE,  MutationTaster_pred,    MutationTaster_score,   orphanet_disease,   orphanet_moi,   Mastermind_MMID3,   Existing_variation, PUBMED, mother, father, proband, Allele, counter, AD_ref_mother, AD_ref_father, AD_ref_proband, AD_alt_mother, AD_alt_father, AD_alt_proband, GQ_mother,    GQ_father,  GQ_proband, PS_mother,  PS_father,  PS_proband, CANONICAL,  MANE_SELECT,    MANE_PLUS_CLINICAL, IMPACT, Gene, HGNC_ID, Amino_acids, Codons, AF, AFR_AF, AMR_AF, EAS_AF, EUR_AF, SAS_AF, gnomADe_AFR_AF, gnomADe_AMR_AF, gnomADe_ASJ_AF, gnomADe_EAS_AF, gnomADe_FIN_AF, gnomADe_NFE_AF, gnomADe_OTH_AF, gnomADe_SAS_AF, gnomADg_AFR_AF, gnomADg_AMI_AF, gnomADg_AMR_AF, gnomADg_ASJ_AF, gnomADg_EAS_AF, gnomADg_FIN_AF, gnomADg_MID_AF, gnomADg_NFE_AF, gnomADg_OTH_AF, gnomADg_SAS_AF, VARIANT_CLASS, gnomADid)
# X chromosome
refseq_X <- refseq01 %>% dplyr::filter(str_detect(Uploaded_variation,"X"))
# Autosomal chromosomes
refseq <- refseq01 %>%
  dplyr::filter(!str_detect(Uploaded_variation,"X")) %>% 
  dplyr::filter(!str_detect(Uploaded_variation,"Y")) %>% 
  dplyr::filter(!str_detect(Uploaded_variation,"M"))

5.3 De Novo Variants

We select de novo variants in the proband.

geno_novo_all <- wh_geno_auto %>% 
  dplyr::filter(! ( ( ( str_detect(mother, substr(proband,1,1)) | str_detect(mother, "\\.") ) & 
                 (str_detect(father, substr(proband,3,3)) | str_detect(father, "\\.") ) ) | 
               ( ( str_detect(father, substr(proband,1,1)) | str_detect(father, "\\.") ) & 
                   (str_detect(mother, substr(proband,3,3)) | str_detect(mother, "\\.") ) ) ) ) %>%
  dplyr::filter(!(str_detect(mother,as.character(counter)) | str_detect(father,as.character(counter))) |
                  (str_detect(mother, substr(proband,1,1)) & str_detect(mother, substr(proband,3,3))) |
                  (str_detect(father, substr(proband,1,1)) & str_detect(father, substr(proband,3,3)))
                  )

We discard de novo mutations back to the reference allele (e.g. 1/1, 1/1 –> 0/1) since they are unlikely to be pathogenic.

geno_novo_not_ref <- wh_geno_auto %>% 
  dplyr::filter(! ( ( ( str_detect(mother, substr(proband,1,1)) | str_detect(mother, "\\.") | substr(proband,1,1)==0 ) & 
                 (str_detect(father, substr(proband,3,3)) | str_detect(father, "\\.") | substr(proband,3,3)==0 ) ) | 
               ( ( str_detect(father, substr(proband,1,1)) | str_detect(father, "\\.") | substr(proband,1,1)==0 ) & 
                   (str_detect(mother, substr(proband,3,3)) | str_detect(mother, "\\.") | substr(proband,3,3)==0 ) ) ) ) %>%
  dplyr::filter(!(str_detect(mother,as.character(counter)) | str_detect(father,as.character(counter))) |
                  (str_detect(mother, substr(proband,1,1)) & str_detect(mother, substr(proband,3,3))) |
                  (str_detect(father, substr(proband,1,1)) & str_detect(father, substr(proband,3,3)))
                  )

We increase the cutoff for genotype quality to ensure that the the de novo variants are not a result of incorrect variant calling. Set GQ>=30, i.e, the probability that the call is incorrect is =<0.1%.

geno_novo_not_ref_high_qual <- geno_novo_not_ref %>% dplyr::filter(GQ_mother>=30) %>% dplyr::filter(GQ_father>=30) %>% dplyr::filter(GQ_proband>=30)

We select de novo variants found in the filtered VEP annotation. For each variant, we chose a single gene transcript that contains the variant, prioritizing canonical and curated transcripts.

prob_novo <- refseq %>% inner_join(geno_novo_not_ref)
prob_novo <- prob_novo %>% group_by(Uploaded_variation) %>% 
  arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>% 
  dplyr::filter(Feature == Feature[1])
prob_novo_high_qual <- refseq %>% inner_join(geno_novo_not_ref_high_qual)
prob_novo_high_qual <- prob_novo_high_qual %>% group_by(Uploaded_variation) %>% 
  arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>% 
  dplyr::filter(Feature == Feature[1])

5.4 Homozygous Variants

We select the homozygous variants in the proband.

geno_hom <- wh_geno_auto %>% 
  dplyr::filter(substr(proband,1,1)==substr(proband,3,3) & substr(proband,1,1) != 0 & substr(proband,1,1) != ".")

We discard the homozygous variants for which one of the parents is also homozygous (e.g. 1/1, 0/1 -> 1/1) since these variants are unlikely to be pathogenic.

geno_hom2 <- geno_hom %>%
  dplyr::filter(! ( (substr(mother,1,1)==substr(mother,3,3) & substr(mother,1,1)==substr(proband,1,1)) |
                      (substr(father,1,1)==substr(father,3,3) & substr(father,1,1)==substr(proband,1,1)) )
                  )

We select the homozygous variants found in the filtered VEP annotation.

prob_hom <- refseq %>% inner_join(geno_hom2)
prob_hom <- prob_hom %>% group_by(Uploaded_variation) %>% 
  arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>% 
  dplyr::filter(Feature == Feature[1])

For each of these variants, we query the gnomAD database for the number of individuals that are homozygous for that variant.

library(ghql)
library(jsonlite)
token <- Sys.getenv("GNOMAD_TOKEN")
con <- GraphqlClient$new(
  url = "https://gnomad.broadinstitute.org/api",
  headers = list(Authorization = paste0("Bearer ", token))
)
qry <- Query$new()
qry$query('homozygous',
'query Variant($var: String!){
  variant(variantId: $var, dataset: gnomad_r4) {
    exome {
      homozygote_count
    }
    genome {
      homozygote_count
    }
  }
}')
gnomADid <- prob_hom$gnomADid
s <- tibble(gnomADid=NA, gnomADe_hom=NA, gnomADg_hom=NA)
for (i in gnomADid) {
  variables <- list(var = i)
  tries <- 1
  repeat {
    x <- try(con$exec(qry$queries$homozygous, variables), silent = TRUE)
    if (!is(x, 'try-error')) {
      x1 <- jsonlite::fromJSON(x)$data$variant$exome$homozygote_count
      x2 <- jsonlite::fromJSON(x)$data$variant$genome$homozygote_count
      message(paste(tries,"success", collapse=" "))
      break
    }
    Sys.sleep(2*tries)
    message(tries)
    tries <- tries + 1
  }
  if (is.null(x1)) x1 <- NA
  if (is.null(x2)) x2 <- NA
  s <- s %>% add_row(gnomADid=variables$var,gnomADe_hom=x1, gnomADg_hom=x2)
}
prob_hom <- left_join(prob_hom, s) %>% 
  dplyr::relocate(c(gnomADe_hom,gnomADg_hom),.after=MAX_AF_POPS)

5.5 Compound Heterozygous Variants

We select the unphased compound heterozygous variants.

prob_2plus_het <- refseq %>% dplyr::filter(substr(proband,1,1)!=substr(proband,3,3)) 
prob_2plus_het <- prob_2plus_het %>% group_by(Feature) %>% 
  dplyr::filter(length(unique(PS_proband))>=2 | length(is.na(PS_proband)[is.na(PS_proband)==TRUE])>=2 ) %>% 
  ungroup()
prob_2plus_het2 <- prob_2plus_het %>% group_by(Gene) %>% 
  arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>% 
  dplyr::filter(Feature == Feature[1])

We select the phased compound heterozygous variants.

prob_comp_het <- refseq %>% group_by(Feature, PS_proband) %>% 
  dplyr::filter(any(substr(proband,1,1)>=1 & substr(proband,2,2)=="|") & any(substr(proband,3,3)>=1 & substr(proband,2,2)=="|"))
prob_comp_het2 <- prob_comp_het %>% group_by(Gene) %>% 
  arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>% 
  dplyr::filter(Feature == Feature[1])

For two variants within a gene that cannot be phased with trio data, we can estimate phase using EM-based haplotype estimation from the gnomAD database.

# Obtain the GRCh37 variant ID to query gnomAD
grch38id <- prob_2plus_het2$gnomADid
qry$query('liftover',
'query liftOver($varID: String!){
  liftover(liftover_variant_id: $varID, reference_genome:GRCh38) {
    source {
      variant_id
    }
  }
}')
grch37id <- c()
for (i in grch38id) {
  variables <- list(varID = i)
  tries <- 1
  repeat {
    x <- try(con$exec(qry$queries$liftover, variables), silent = TRUE)
    if (!is(x, 'try-error')) {
      x <- jsonlite::fromJSON(x)$data$liftover$source$variant_id 
      message(paste(tries,"success", collapse=" "))
      break
    }
    Sys.sleep(2*tries)
    message(tries)
    tries <- tries + 1
  }
  if (is.null(x)) x <- NA
  grch37id <- append(grch37id, x)
}
liftover <- tibble(gnomADid=grch38id, grch37id)
prob_2plus_het2 <- left_join(prob_2plus_het2, liftover)
# Obtain probability that two variants within a gene are compound heterozygous and phase variants using cutoff of <0.2 for cis and >0.55 for trans
qry$query('coocurrence',
'query Coocurrence($var1: String!, $var2: String!){
  variant_cooccurrence(dataset: gnomad_r2_1, variants: [$var1, $var2]) {
    p_compound_heterozygous
  }
}'
)
prob_2plus_het_gnom <- prob_2plus_het2 %>% 
  dplyr::filter(!is.na(grch37id)) %>%
  group_by(SYMBOL) %>%
  dplyr::filter(n()!=1)
s <- tibble(grch37id=NA, gnomAD_A=NA, gnomAD_B=NA, gnomAD_PS=NA)
r <- tibble(grch37id=NA, gnomAD_cis=NA, gnomAD_trans=NA)
for (a in unique(prob_2plus_het_gnom$SYMBOL)) {
  var <- prob_2plus_het_gnom[prob_2plus_het_gnom$SYMBOL==a,]$grch37id
  sub <- tibble(grch37id=var, gnomAD_A=NA, gnomAD_B=NA, gnomAD_PS=NA)
  sub$gnomAD_A[1] <- 1
  sub$gnomAD_PS[1] <- 1
  for (i in 1:(length(var)-1)) {
    c <- c()
    t <- c()
    for (j in (i+1):length(var)) {
      variables <- list(var1 = var[i], var2 = var[j])
      tries <- 1
      repeat {
        y <- try(con$exec(qry$queries$coocurrence, variables), silent = TRUE)
        if (!is(y, 'try-error')) {
          y <- jsonlite::fromJSON(y)$data$variant_cooccurrence$p_compound_heterozygous
          message(paste(tries,"success", collapse=" "))
        break
        }
      Sys.sleep(2*tries)
      message(tries)
      tries <- tries + 1
      }
      if (is.null(y)) y <- "NA"
      if (y<0.2) c <- append(c, paste0(variables$var2," (",round(y,digits=2),")"))
      if (y>0.55 & y!="NA") t <- append(t, paste0(variables$var2," (",round(y,digits=2),")"))
      if (!is.na(sub[[i,"gnomAD_A"]]) & is.na(sub[[i,"gnomAD_B"]])) {
        if (y<0.2) {
          sub$gnomAD_A[j] <- i
          sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
        }
        if (y>0.55 & y!="NA") {
          sub$gnomAD_B[j] <- i
          sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
        }
      }
      if (!is.na(sub[[i,"gnomAD_B"]]) & is.na(sub[[i,"gnomAD_A"]])) {
        if (y<0.2) {
          sub$gnomAD_B[j] <- i
          sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
        }
        if (y>0.55 & y!="NA") {
          sub$gnomAD_A[j] <- i
          sub$gnomAD_PS[j] <- sub$gnomAD_PS[i]
        }
      }
      if (is.na(sub[[i,"gnomAD_A"]]) & is.na(sub[[i,"gnomAD_B"]])) {
        sub$gnomAD_A[i] <- i
        sub$gnomAD_PS[i] <- i
        if (y<0.2) { 
          sub$gnomAD_A[j] <- i
          sub$gnomAD_PS[j] <- i
        }
        if (y>0.55 & y!="NA") {
          sub$gnomAD_B[j] <- i
          sub$gnomAD_PS[j] <- i
        }
      }  
    }
    r <- r %>% add_row(grch37id=var[i], gnomAD_cis=str_flatten_comma(c), gnomAD_trans=str_flatten_comma(t))
  }
  s <- rbind(s,sub)
}
prob_2plus_het2 <- prob_2plus_het2 %>% left_join(r) %>% left_join(s) %>% 
  dplyr::relocate(c(gnomAD_A,gnomAD_B,gnomAD_PS), .after=PS_proband)

5.6 X Chromosome

We remove X chromosome calls that are “heterozygous” for male individuals, i.e., proband and father. However, some of these calls may be correct if they lie in the pseudoautosomal region.

wh_geno_X_filt <- wh_geno_X %>% 
  dplyr::filter(( substr(proband,1,1)==substr(proband,3,3) & substr(father,1,1)==substr(father,3,3)))
prob_X <- inner_join(refseq_X, wh_geno_X_filt) %>% group_by(Gene) %>% 
  arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>% 
  dplyr::filter(Feature == Feature[1])

5.7 Disease Genes

We select one entry for each variant in the refseq table, prioritizing canonical and curated transcripts.

refseq2 <- refseq01 %>% group_by(Uploaded_variation) %>% 
  arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>% 
  dplyr::filter(Feature == Feature[1])

As an example, we select variants that fall in kidney disease genes, i.e., green genes from the UK PanelApp renal broad superpanel and all genes from the Australia PanelApp kidneyome superpanel.

# UK Panel App
panel <- read_tsv("data/renal_superpanel_broad.tsv")
panel_green <- panel %>% dplyr::filter(GEL_Status==3)
refseq_green <- refseq2 %>% dplyr::filter(HGNC_ID %in% panel_green$HGNC)
#Australia Panel App
panel_au <- read_tsv("data/Kidneyome_SuperPanel.tsv") %>% 
  dplyr::filter(!duplicated(HGNC)) %>%
  dplyr::mutate(Status=case_when(
    GEL_Status==4|GEL_Status==3 ~ "Green", 
    GEL_Status==2 ~ "Amber", 
    GEL_Status==1|GEL_Status==0 ~ "Red")) %>%
  dplyr::select(HGNC,Status)
refseq_au <- refseq2 %>% inner_join(panel_au, join_by(HGNC_ID==HGNC))

We combine annotations into a single table.

refseq3 <- refseq01 %>% group_by(Gene) %>% 
  arrange(desc(CANONICAL), as.character(Feature),.by_group = T) %>% ungroup()
refseq4 <- refseq3 %>% 
  left_join(add_column(geno_novo_not_ref,de_novo="de_novo")) %>%
  left_join(add_column(geno_novo_not_ref_high_qual,de_novo_gq="de_novo_gq>=30")) %>%
  left_join(add_column(geno_hom2, homozygote="homozygote")) %>%
  left_join(add_column(prob_2plus_het, unphased_compound_heterozygote="unphased_compound_heterozygote")) %>%
  left_join(add_column(prob_comp_het, phased_compound_heterozygote="phased_compound_heterozygote")) %>%
  mutate(zygosity=coalesce(de_novo_gq,de_novo,homozygote,phased_compound_heterozygote,unphased_compound_heterozygote)) %>%
  dplyr::select(-c(de_novo_gq,de_novo,homozygote,phased_compound_heterozygote,unphased_compound_heterozygote)) %>%
  dplyr::relocate(zygosity,.after=counter)
refseq_sum <- refseq4 %>% group_by(Feature) %>% 
  summarize(z=paste(c(Uploaded_variation,Location,Allele,Consequence,Amino_acids,Codons), collapse = " ")) %>% 
  distinct(z,.keep_all = T)
refseq5 <- refseq4 %>% dplyr::filter(Feature %in% refseq_sum$Feature)