Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. SPDX-License-Identifier: MIT-0

## Install required R packages

Make sure to use the R kernel.

In [None]:
# pacman installs any missing package before loading it. requireNamespace()
# checks whether pacman is installed without attaching it.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")

# dplyr and stringr are attached with library() further down, so they are
# installed here as well; otherwise a fresh kernel fails at those calls.
pacman::p_load("R.utils", "visNetwork", "data.table", "dplyr", "stringr")

## Sync variant calling files to local directory

In [None]:
# Mirror the annotated results from S3 into ./results-vcf; the captured
# command output is returned so the notebook shows it.
sync_command <- "aws s3 sync s3://roda3/results_breast_cancer/Annotation/ ./results-vcf"
system(sync_command, intern = TRUE)

## Extract mutations from variant calling files

In [None]:
library(data.table)
library(dplyr)
library(stringr)

In [None]:
# Collect every annotated, gzipped VCF found under any sample's VEP folder.
vcf_pattern <- "./results-vcf/*/VEP/*ann.vcf.gz"
filenames <- Sys.glob(vcf_pattern)
filenames

In [None]:
#' Extract BRCA2 SNP identifiers from one annotated VCF file
#'
#' Reads the file with `fread()` (no header, `;` as separator) and works only
#' on the first resulting column, `V1`. Rows whose `V1` mentions "BRCA2" are
#' kept; a gene symbol and an `rs<digits>` identifier are cut out of the text
#' with regular expressions; rows without an identifier, or whose extracted
#' gene is not exactly "BRCA2", are dropped.
#'
#' @param filename Path to an annotated VCF file. The sample label is the part
#'   of the file name matching `SAMPLE_` followed by two or more digits.
#' @return A table with the columns `gene` (always "BRCA2"), `SNP` (an
#'   `rs<digits>` identifier) and `name` (the sample label, `NA` when the file
#'   name contains none). It has zero rows when nothing matches.
extract_variants_snp <- function(filename) {
  sample_id <- str_extract(basename(filename), "SAMPLE_\\d\\d+")

  # NOTE(review): sep = ";" presumably keeps the annotation text of interest
  # in V1 -- confirm against the actual files.
  dt <- fread(filename, sep = ";", header = FALSE)

  dt %>%
    dplyr::filter(grepl("BRCA2", V1)) %>%
    dplyr::mutate(
      # Drop everything up to and including the last "MODIFIER|". The pipe is
      # escaped on purpose: unescaped it is regex alternation with an empty
      # pattern, not a literal "|".
      gene = gsub(".*MODIFIER\\|", "", V1),
      # Drop everything from the first "ENSG" onward, then any pipes left over.
      gene = gsub("ENSG.*", "", gene),
      gene = gsub("\\|", "", gene),
      # Keep the text from the last "rs" onward, then take the rs<digits> id.
      SNP = gsub(".*rs", "rs", V1),
      SNP = stringr::str_extract(SNP, "rs[0-9]+")
    ) %>%
    dplyr::select(gene, SNP) %>%
    # str_extract() yields a real NA (not the string "NA") when nothing matches.
    dplyr::filter(!is.na(SNP), gene == "BRCA2") %>%
    dplyr::mutate(name = sample_id)
}

Extract and show the mutations from a single file.

In [None]:
# Try the extractor on one sample before running it over every file.
single_vcf <- "./results-vcf/SAMPLE_01/VEP/Mutect2_filtered_SAMPLE_01_VEP.ann.vcf.gz"
extract_variants_snp(single_vcf)


Extract the mutations from all files and combine them into a single data.table.

In [None]:
# Run the extractor on every VCF path; the result is a list with one table
# per file.
dt_all <- lapply(filenames, extract_variants_snp)

In [None]:
# Stack the per-file tables into one table.
dt_links <- rbindlist(dt_all)

In [None]:
#dt_links[,from:=name]
#dt_links[,to:=SNP]
#dt_links[,c("name","SNP","gene"):=NULL]
#dt_links

In [None]:
# Show the combined table (one row per extracted gene / SNP / sample name).
dt_links

## Use graph visualization to show gene mutation to sample relationships

In [None]:
# library() stops with an error if visNetwork is missing; require() would only
# return FALSE and let the failure surface later in the plotting code.
library(visNetwork, quietly = TRUE)


Define the nodes of the graph.

In [None]:
# One node per distinct sample name found in the links table.
samples <- unique(dt_links[["name"]])
nodes_samples <- data.table(id = samples, label = samples)
nodes_samples[, `:=`(
  group = "sample",
  value = 1,
  shape = "dot",
  title = "title",
  color = "#a6cee3"
)]

# One node per distinct SNP identifier; same attributes, different group/colour.
mutations <- unique(dt_links[["SNP"]])
nodes_mutations <- data.table(id = mutations, label = mutations)
nodes_mutations[, `:=`(
  group = "mutation",
  value = 1,
  shape = "dot",
  title = "title",
  color = "#fb9a99"
)]

# Sample nodes first, mutation nodes after.
nodes <- rbindlist(list(nodes_samples, nodes_mutations))

Define the edges (links) of the graph.

In [None]:
# copy() gives edges its own table. Plain assignment would only alias
# dt_links, so the := below would add the columns to dt_links as well.
edges <- copy(dt_links)

# Each edge points from a sample to a SNP found in it, drawn with an arrow head
# at the SNP end.
edges[, `:=`(from = name, to = SNP, arrows = "to")]

In [None]:
# Legend entries are given explicitly (useGroups = FALSE), reusing the two
# node colours.
legend_nodes <- data.frame(
  label = c("patient sample", "mutation (SNP)"),
  shape = c("dot", "dot"),
  color = c("#a6cee3", "#fb9a99")
)

# Build the interactive graph and attach the legend on the right-hand side.
network <- visNetwork(nodes, edges, width = 1000, height = 900) %>%
  visLegend(position = "right", useGroups = FALSE, addNodes = legend_nodes)
network

## Export graph visualization to html

In [None]:
# Output folder for the exported files; showWarnings = FALSE keeps re-runs
# quiet when the folder already exists.
dir_analytics <- "./results-analytics"
dir.create(dir_analytics, showWarnings = FALSE)


In [None]:
# Write the interactive graph to an HTML file on a white background.
html_path <- paste0(dir_analytics, "/network-mutations.html")
visSave(network, file = html_path, background = "white")

## Export graph structure to file

In [None]:
# Write both tables into dir_analytics. file.path() inserts the separator, so
# edges.csv no longer lands next to the folder as "results-analyticsedges.csv".
fwrite(nodes, file.path(dir_analytics, "nodes.csv"))
fwrite(edges, file.path(dir_analytics, "edges.csv"))
