{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/\n",
    "\n",
    "import requests\n",
    "import json\n",
    "import boto3\n",
    "import re\n",
    "import gzip\n",
    "import pandas as pd\n",
    "import dask\n",
    "from dask.distributed import Client\n",
    "\n",
    "# Base URLs: GDC API endpoints and the nci-crdc index endpoint.\n",
    "data_endpt = 'https://api.gdc.cancer.gov/data'\n",
    "cases_endpt = 'https://api.gdc.cancer.gov/cases'\n",
    "files_endpt = 'https://api.gdc.cancer.gov/files'\n",
    "indexd_endpt = 'https://nci-crdc.datacommons.io/index/index/'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Query Settings\n",
    "\n",
    "# primary_site = \"Breast\"\n",
    "project_id = \"TCGA-BRCA\"\n",
    "\n",
    "data_type = \"Gene Expression Quantification\"  # RNA-Seq\n",
    "workflow_type = \"HTSeq - Counts\"\n",
    "\n",
    "# Maximum number of hits requested from the API in a single call.\n",
    "size = 2000\n",
    "\n",
    "# The 'fields' parameter is passed as a comma-separated string of single names.\n",
    "field_names = [\n",
    "    \"file_name\",\n",
    "    \"cases.primary_site\",\n",
    "    \"cases.case_id\",\n",
    "    \"cases.project.project_id\",\n",
    "    \"cases.days_to_lost_to_followup\",\n",
    "    \"cases.submitter_id\",\n",
    "    \"cases.samples.submitter_id\",\n",
    "    \"cases.samples.sample_id\",\n",
    "]\n",
    "fields = ','.join(field_names)\n",
    "\n",
    "# Equivalent query:\n",
    "#   cases.project.project_id in [\"TCGA-BRCA\"]\n",
    "#   and files.data_type in [\"Gene Expression Quantification\"]\n",
    "#   and files.analysis.workflow_type in [\"HTSeq - Counts\"]\n",
    "filter_terms = [\n",
    "    (\"cases.project.project_id\", project_id),\n",
    "    (\"files.data_type\", data_type),\n",
    "    (\"files.analysis.workflow_type\", workflow_type),\n",
    "]\n",
    "# Every term is an \"in\" clause with a single allowed value; all must match.\n",
    "filters = {\n",
    "    \"op\": \"and\",\n",
    "    \"content\": [\n",
    "        {\"op\": \"in\", \"content\": {\"field\": field_name, \"value\": [value]}}\n",
    "        for field_name, value in filter_terms\n",
    "    ],\n",
    "}\n",
    "\n",
    "# With a GET request, the filters parameter needs to be converted\n",
    "# from a dictionary to JSON-formatted string\n",
    "params = {\n",
    "    \"filters\": json.dumps(filters),\n",
    "    \"fields\": fields,\n",
    "    \"format\": \"JSON\",\n",
    "    \"size\": size,\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1222\n"
     ]
    }
   ],
   "source": [
    "## Get Files\n",
    "\n",
    "query_response = requests.get(files_endpt, params=params, timeout=120)\n",
    "# Surface HTTP failures here rather than as a confusing KeyError below.\n",
    "query_response.raise_for_status()\n",
    "\n",
    "response_data = query_response.json()[\"data\"]\n",
    "json_response = response_data[\"hits\"]\n",
    "\n",
    "# At most `size` hits come back; say so if the query matched more than that.\n",
    "total_hits = response_data.get(\"pagination\", {}).get(\"total\", len(json_response))\n",
    "if total_hits > len(json_response):\n",
    "    print(\"WARNING: received {} of {} matching files; increase `size`\".format(\n",
    "        len(json_response), total_hits))\n",
    "\n",
    "print(len(json_response))\n",
    "\n",
    "files_json = json_response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Scale out Dask Cluster\n",
    "dask_worker_service = 'Dask-Worker'\n",
    "numWorkers = 10\n",
    "\n",
    "ecs = boto3.client('ecs')\n",
    "resp = ecs.list_clusters()\n",
    "clusters = resp['clusterArns']\n",
    "if not clusters:\n",
    "    raise RuntimeError(\"No ECS clusters found; cannot scale the Dask-Worker service\")\n",
    "if len(clusters) > 1:\n",
    "    # Best effort: keep going with the first cluster, but make the choice visible.\n",
    "    print(\"Please manually select your cluster\")\n",
    "    print(\"Defaulting to: {}\".format(clusters[0]))\n",
    "cluster = clusters[0]\n",
    "\n",
    "# Request the new worker count, then block until the service reports stable.\n",
    "ecs.update_service(cluster=cluster, service=dask_worker_service, desiredCount=numWorkers)\n",
    "ecs.get_waiter('services_stable').wait(cluster=cluster, services=[dask_worker_service])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/distributed/client.py:1130: VersionMismatchWarning: Mismatched versions found\n",
      "\n",
      "+---------+----------------+---------------+---------------+\n",
      "| Package | client | scheduler | workers |\n",
      "+---------+----------------+---------------+---------------+\n",
      "| python | 3.6.10.final.0 | 3.7.4.final.0 | 3.7.4.final.0 |\n",
      "+---------+----------------+---------------+---------------+\n",
      "Notes: \n",
      "- python: Variation is sometimes ok, sometimes not. It depends on your workloads\n",
      " warnings.warn(version_module.VersionMismatchWarning(msg[0][\"warning\"]))\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "
\n",
"Client\n", "
| \n",
"\n",
"Cluster\n", "
| \n",
"
\n", " | TCGA-E9-A1RI-11A | \n", "TCGA-C8-A8HQ-01A | \n", "TCGA-BH-A0BJ-11A | \n", "TCGA-OL-A66O-01A | \n", "TCGA-A7-A13E-11A | \n", "TCGA-PL-A8LX-01A | \n", "TCGA-A8-A09V-01A | \n", "TCGA-AR-A5QM-01A | \n", "TCGA-BH-A0AY-11A | \n", "TCGA-A7-A2KD-01A | \n", "... | \n", "TCGA-E9-A226-01A | \n", "TCGA-AR-A0U0-01A | \n", "TCGA-AO-A129-01A | \n", "TCGA-C8-A26Y-01A | \n", "TCGA-AR-A2LQ-01A | \n", "TCGA-D8-A1XD-01A | \n", "TCGA-E2-A570-01A | \n", "TCGA-E2-A574-01A | \n", "TCGA-D8-A73W-01A | \n", "TCGA-B6-A2IU-01A | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
ENSG00000000003.13 | \n", "7639 | \n", "1505 | \n", "5050 | \n", "2796 | \n", "3400 | \n", "982 | \n", "4952 | \n", "1268 | \n", "4451 | \n", "4883 | \n", "... | \n", "3253 | \n", "1599 | \n", "3205 | \n", "2240 | \n", "3107 | \n", "2846 | \n", "4394 | \n", "4610 | \n", "1538 | \n", "570 | \n", "
ENSG00000000005.5 | \n", "3541 | \n", "3 | \n", "154 | \n", "6 | \n", "992 | \n", "0 | \n", "2 | \n", "44 | \n", "660 | \n", "3 | \n", "... | \n", "1 | \n", "0 | \n", "31 | \n", "0 | \n", "50 | \n", "0 | \n", "1 | \n", "4 | \n", "4 | \n", "24 | \n", "
ENSG00000000419.11 | \n", "1999 | \n", "1803 | \n", "1577 | \n", "1890 | \n", "1242 | \n", "1721 | \n", "1919 | \n", "1375 | \n", "1717 | \n", "8033 | \n", "... | \n", "1788 | \n", "2079 | \n", "4173 | \n", "4838 | \n", "1109 | \n", "2802 | \n", "2196 | \n", "3899 | \n", "1853 | \n", "603 | \n", "
ENSG00000000457.12 | \n", "1287 | \n", "1763 | \n", "2007 | \n", "1174 | \n", "931 | \n", "759 | \n", "4909 | \n", "1490 | \n", "1343 | \n", "2769 | \n", "... | \n", "3167 | \n", "1034 | \n", "1237 | \n", "1345 | \n", "1619 | \n", "1310 | \n", "2831 | \n", "643 | \n", "2096 | \n", "2349 | \n", "
ENSG00000000460.15 | \n", "290 | \n", "952 | \n", "319 | \n", "625 | \n", "259 | \n", "182 | \n", "1647 | \n", "478 | \n", "314 | \n", "1359 | \n", "... | \n", "1990 | \n", "658 | \n", "1101 | \n", "635 | \n", "394 | \n", "448 | \n", "677 | \n", "668 | \n", "313 | \n", "615 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
__no_feature | \n", "5349982 | \n", "4816279 | \n", "3100857 | \n", "3337177 | \n", "2634237 | \n", "2462158 | \n", "10968241 | \n", "2830905 | \n", "3348946 | \n", "5958795 | \n", "... | \n", "4836012 | \n", "2733435 | \n", "3332673 | \n", "5664496 | \n", "2753546 | \n", "3919188 | \n", "2826989 | \n", "2007457 | \n", "4368255 | \n", "2230344 | \n", "
__ambiguous | \n", "2732915 | \n", "2862191 | \n", "2153519 | \n", "3728715 | \n", "1877124 | \n", "2213731 | \n", "3061277 | \n", "2389458 | \n", "2337298 | \n", "3105025 | \n", "... | \n", "2483817 | \n", "2712071 | \n", "2919876 | \n", "3011731 | \n", "1890224 | \n", "1856044 | \n", "2007640 | \n", "2516161 | \n", "2235132 | \n", "1295034 | \n", "
__too_low_aQual | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
__not_aligned | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
__alignment_not_unique | \n", "31446784 | \n", "14754141 | \n", "14392549 | \n", "20705051 | \n", "14325766 | \n", "12664213 | \n", "19475816 | \n", "13953752 | \n", "15784782 | \n", "20035954 | \n", "... | \n", "13753513 | \n", "15441474 | \n", "21031100 | \n", "22256249 | \n", "12859023 | \n", "11556864 | \n", "12852218 | \n", "36710840 | \n", "14982769 | \n", "6625883 | \n", "
60488 rows × 1222 columns
\n", "