{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4ea103da",
   "metadata": {},
   "source": [
    "Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. SPDX-License-Identifier: MIT-0"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ea103db",
   "metadata": {},
   "source": [
    "## Install required R packages"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "777e2cde",
   "metadata": {},
   "source": [
    "Make sure this notebook is running on an R kernel."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3798984",
   "metadata": {},
   "outputs": [],
   "source": [
    "if (!require(\"pacman\")) install.packages(\"pacman\")\n",
    "\n",
    "# p_load installs any missing package and then attaches it.\n",
    "# dplyr and stringr are included because later cells attach them with library();\n",
    "# R.utils is what data.table::fread relies on to read .gz files.\n",
    "pacman::p_load(\"R.utils\", \"visNetwork\", \"data.table\", \"dplyr\", \"stringr\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff3a0377",
   "metadata": {},
   "source": [
    "## Sync variant calling files to local directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e852cc48",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mirror the annotated variant-calling results into ./results-vcf using the aws command.\n",
    "sync_log <- system('aws s3 sync s3://roda3/results_breast_cancer/Annotation/ ./results-vcf', intern=TRUE)\n",
    "\n",
    "# With intern=TRUE a non-zero exit code only produces a warning and a \"status\"\n",
    "# attribute on the result, so check it explicitly and stop on failure.\n",
    "sync_status <- attr(sync_log, \"status\")\n",
    "if (!is.null(sync_status) && sync_status != 0) stop(\"aws s3 sync failed with exit status \", sync_status)\n",
    "sync_log"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7685f3b1",
   "metadata": {},
   "source": [
    "## Extract mutations from variant calling files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fde2e0ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "library(data.table)\n",
    "library(dplyr)\n",
    "library(stringr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "327dd140",
   "metadata": {},
   "outputs": [],
   "source": [
    "filenames <- Sys.glob(\"./results-vcf/*/VEP/*ann.vcf.gz\")\n",
    "\n",
    "# Fail early with a clear message instead of producing an empty graph further down.\n",
    "if (length(filenames) == 0) stop(\"No annotated VCF files found under ./results-vcf; run the sync cell first.\")\n",
    "filenames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9d3a240",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the rs identifiers recorded for one gene in an annotated VCF file.\n",
    "#\n",
    "# Arguments:\n",
    "#   filename     path to the annotated (gzipped) VCF file; the sample label is\n",
    "#                taken from its base name (SAMPLE_ followed by two or more digits).\n",
    "#   target_gene  gene symbol to keep, matched literally (default \"BRCA2\").\n",
    "#\n",
    "# Returns a table with one row per retained record and the columns\n",
    "#   gene  always equal to target_gene,\n",
    "#   SNP   the rs identifier,\n",
    "#   name  the sample label (NA when the file name contains no SAMPLE_ label).\n",
    "extract_variants_snp <- function(filename, target_gene = \"BRCA2\") {\n",
    "  # Character classes are used instead of backslash shorthands so that the\n",
    "  # patterns need no escaping inside the notebook file.\n",
    "  sample_name <- str_extract(basename(filename), \"SAMPLE_[0-9][0-9]+\")\n",
    "\n",
    "  # Lines are split on ';'; only the first resulting column (V1) is inspected below.\n",
    "  records <- fread(filename, sep = \";\", header = FALSE)\n",
    "\n",
    "  records %>%\n",
    "    dplyr::filter(grepl(target_gene, V1, fixed = TRUE)) %>%\n",
    "    dplyr::mutate(\n",
    "      # Keep the text between the last \"MODIFIER|\" and the following \"ENSG...\" id,\n",
    "      # then drop any remaining pipe separators.\n",
    "      gene = gsub(\".*MODIFIER[|]\", \"\", V1),\n",
    "      gene = gsub(\"ENSG.*\", \"\", gene),\n",
    "      gene = gsub(\"|\", \"\", gene, fixed = TRUE),\n",
    "      # Jump to the last \"rs\" in the record and keep it only if digits follow.\n",
    "      SNP = gsub(\".*rs\", \"rs\", V1),\n",
    "      SNP = stringr::str_extract(SNP, \"rs[0-9]+\")\n",
    "    ) %>%\n",
    "    dplyr::select(gene, SNP) %>%\n",
    "    dplyr::filter(!is.na(SNP), gene == target_gene) %>%\n",
    "    dplyr::mutate(name = sample_name)\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c1c73101",
   "metadata": {},
   "source": [
    "Extract and show the mutations of a single file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c638ca3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "extract_variants_snp(\"./results-vcf/SAMPLE_01/VEP/Mutect2_filtered_SAMPLE_01_VEP.ann.vcf.gz\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1423b074",
   "metadata": {},
   "source": [
    "Extract the mutations from all files and combine them into one data.table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c3b02df",
   "metadata": {},
   "outputs": [],
   "source": [
    "dt_all = lapply(filenames,extract_variants_snp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28a6b44f",
   "metadata": {},
   "outputs": [],
   "source": [
    "dt_links = rbindlist(dt_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb2f75a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "dt_links"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6f7ee8d4",
   "metadata": {},
   "source": [
    "## Use graph visualization to show gene mutation to sample relationships"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80eb226f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# library() stops with an error if the package is missing, whereas require() would only return FALSE.\n",
    "library(visNetwork, quietly = TRUE)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "75e41cef",
   "metadata": {},
   "source": [
    "Define the nodes of the graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d475e51",
   "metadata": {},
   "outputs": [],
"source": [
    "# Node colours, shared by the graph nodes and the legend entries.\n",
    "COLOR_SAMPLE <- \"#a6cee3\"\n",
    "COLOR_MUTATION <- \"#fb9a99\"\n",
    "\n",
    "# Build the node table for one group: one row per id, labelled with the id itself.\n",
    "make_nodes <- function(ids, node_group, node_color) {\n",
    "  node_dt <- data.table(id = ids, label = ids)\n",
    "  node_dt[, `:=`(\n",
    "    group = node_group,\n",
    "    value = 1,\n",
    "    shape = \"dot\",\n",
    "    # NOTE(review): every node gets the literal text \"title\" here; looks like a placeholder, confirm.\n",
    "    title = \"title\",\n",
    "    color = node_color\n",
    "  )]\n",
    "  node_dt\n",
    "}\n",
    "\n",
    "samples <- unique(dt_links[[\"name\"]])\n",
    "mutations <- unique(dt_links[[\"SNP\"]])\n",
    "\n",
    "nodes_samples <- make_nodes(samples, \"sample\", COLOR_SAMPLE)\n",
    "nodes_mutations <- make_nodes(mutations, \"mutation\", COLOR_MUTATION)\n",
    "\n",
    "nodes <- rbindlist(list(nodes_samples, nodes_mutations))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "287cc62b",
   "metadata": {},
   "source": [
    "Define the edges (links) of the graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a902e4f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# copy() is required: a plain assignment would only alias dt_links, and := would\n",
    "# then add the from/to/arrows columns to dt_links itself.\n",
    "edges <- copy(dt_links)\n",
    "\n",
    "# Each edge points from a sample to a SNP found in it.\n",
    "edges[, `:=`(from = name, to = SNP, arrows = \"to\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d48fe49d",
   "metadata": {},
   "outputs": [],
   "source": [
    "network <- visNetwork(nodes, edges, width = 1000, height = 900) %>%\n",
    "  visLegend(\n",
    "    position = \"right\",\n",
    "    useGroups = FALSE,\n",
    "    # Hand-written legend entries that reuse the node colours defined above.\n",
    "    addNodes = data.frame(\n",
    "      label = c(\"patient sample\", \"mutation (SNP)\"),\n",
    "      shape = c(\"dot\", \"dot\"),\n",
    "      color = c(COLOR_SAMPLE, COLOR_MUTATION)\n",
    "    )\n",
    "  )\n",
    "network"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "071ac7bd",
   "metadata": {},
   "source": [
    "## Export graph visualization to html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "911d7c91",
   "metadata": {},
   "outputs": [],
   "source": [
    "dir_analytics <- \"./results-analytics\"\n",
    "\n",
    "# showWarnings = FALSE keeps re-runs quiet when the directory already exists.\n",
    "dir.create(dir_analytics, showWarnings = FALSE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ccdff33c",
   "metadata": {},
   "outputs": [],
   "source": [
    "network %>% visSave(file = file.path(dir_analytics, \"network-mutations.html\"), background = \"white\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5918368",
   "metadata": {},
   "source": [
    "## Export graph structure to file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e52e4e82",
   "metadata": {},
   "outputs": [],
   "source": [
    "# file.path() inserts the separator, so both files land inside dir_analytics.\n",
    "fwrite(nodes, file.path(dir_analytics, \"nodes.csv\"))\n",
    "fwrite(edges, file.path(dir_analytics, \"edges.csv\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "4.1.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}