{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/\n",
    "\n",
    "import requests\n",
    "import json\n",
    "import boto3\n",
    "import re\n",
    "import gzip\n",
    "import pandas as pd\n",
    "import dask\n",
    "from dask.distributed import Client\n",
    "\n",
    "# GDC REST API endpoints (see the Search and Retrieval guide linked above).\n",
    "data_endpt = 'https://api.gdc.cancer.gov/data'\n",
    "cases_endpt = 'https://api.gdc.cancer.gov/cases'\n",
    "files_endpt = 'https://api.gdc.cancer.gov/files'\n",
    "# indexd record lookup: a file UUID is appended to this URL to resolve its storage URLs.\n",
    "indexd_endpt = 'https://nci-crdc.datacommons.io/index/index/'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Query Settings\n",
    "\n",
    "# primary_site = \"Breast\"\n",
    "project_id = \"TCGA-BRCA\"\n",
    "\n",
    "data_type = \"Gene Expression Quantification\" # RNA-Seq\n",
    "workflow_type = \"HTSeq - Counts\"\n",
    "# Maximum number of file records requested in the single query below.\n",
    "size = 2000\n",
    "\n",
    "# The 'fields' parameter is passed as a comma-separated string of single names.\n",
    "field_names = [\n",
    "    \"file_name\",\n",
    "    \"cases.primary_site\",\n",
    "    \"cases.case_id\",\n",
    "    \"cases.project.project_id\",\n",
    "    \"cases.days_to_lost_to_followup\",\n",
    "    \"cases.submitter_id\",\n",
    "    \"cases.samples.submitter_id\",\n",
    "    \"cases.samples.sample_id\",\n",
    "]\n",
    "\n",
    "# Keep the list under its own name so `fields` is only ever the joined string.\n",
    "fields = ','.join(field_names)\n",
    "\n",
    "#cases.project.project_id in [\"TCGA-BRCA\"] and files.data_type in [\"Gene Expression Quantification\"]\n",
    "# Each (field, allowed values) pair becomes one \"in\" clause; all clauses are AND-ed together.\n",
    "filter_clauses = [\n",
    "    (\"cases.project.project_id\", [project_id]),\n",
    "    (\"files.data_type\", [data_type]),\n",
    "    (\"files.analysis.workflow_type\", [workflow_type]),\n",
    "]\n",
    "filters = {\n",
    "    \"op\": \"and\",\n",
    "    \"content\": [\n",
    "        {\"op\": \"in\", \"content\": {\"field\": field, \"value\": value}}\n",
    "        for field, value in filter_clauses\n",
    "    ],\n",
    "}\n",
    "\n",
    "# With a GET request, the filters parameter needs to be converted\n",
    "# from a dictionary to a JSON-formatted string.\n",
    "\n",
    "params = {\n",
    "    \"filters\": json.dumps(filters),\n",
    "    \"fields\": fields,\n",
    "    \"format\": \"JSON\",\n",
    "    \"size\": size\n",
    "    }\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1222\n"
     ]
    }
   ],
   "source": [
    "## Get Files\n",
    "\n",
    "# A timeout keeps the notebook from hanging indefinitely if the API stalls.\n",
    "query_response = requests.get(files_endpt, params=params, timeout=120)\n",
    "# Surface HTTP errors (4xx/5xx) here rather than as a KeyError on the parsed body.\n",
    "query_response.raise_for_status()\n",
    "\n",
    "response_data = query_response.json()[\"data\"]\n",
    "json_response = response_data[\"hits\"]\n",
    "\n",
    "print(len(json_response))\n",
    "\n",
    "# `size` caps how many hits come back; flag a silently truncated result set.\n",
    "total_hits = response_data.get(\"pagination\", {}).get(\"total\", len(json_response))\n",
    "if total_hits > len(json_response):\n",
    "    print(\"WARNING: only \" + str(len(json_response)) + \" of \" + str(total_hits)\n",
    "          + \" matching files were returned; increase `size`\")\n",
    "\n",
    "files_json = json_response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Scale out Dask Cluster\n",
    "ecs = boto3.client('ecs')\n",
    "resp = ecs.list_clusters()\n",
    "clusters = resp['clusterArns']\n",
    "# Fail with a clear message instead of an IndexError on clusters[0] below.\n",
    "if not clusters:\n",
    "    raise RuntimeError(\"No ECS clusters found; cannot scale the Dask-Worker service\")\n",
    "# With several clusters the first one is still used; edit `cluster` by hand if that is wrong.\n",
    "if len(clusters) > 1:\n",
    "    print(\"Please manually select your cluster\")\n",
    "cluster = clusters[0]\n",
    "\n",
    "# Request the workers, then block until ECS reports the service as stable.\n",
    "numWorkers=10\n",
    "ecs.update_service(cluster=cluster, service='Dask-Worker', desiredCount=numWorkers)\n",
    "ecs.get_waiter('services_stable').wait(cluster=cluster, services=['Dask-Worker'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/distributed/client.py:1130: VersionMismatchWarning: Mismatched versions found\n",
      "\n",
      "+---------+----------------+---------------+---------------+\n",
      "| Package | client | scheduler | workers |\n",
      "+---------+----------------+---------------+---------------+\n",
      "| python | 3.6.10.final.0 | 3.7.4.final.0 | 3.7.4.final.0 |\n",
      "+---------+----------------+---------------+---------------+\n",
      "Notes: \n",
      "- python: Variation is sometimes ok, sometimes not. 
It depends on your workloads\n", " warnings.warn(version_module.VersionMismatchWarning(msg[0][\"warning\"]))\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
\n", "

Client

\n", "\n", "
\n", "

Cluster

\n", "
    \n", "
  • Workers: 10
  • \n", "
  • Cores: 10
  • \n", "
  • Memory: 70.00 GB
  • \n", "
\n", "
" ],
      "text/plain": [
       ""
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "client = Client('Dask-Scheduler.local-dask:8786')\n",
    "client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dask.delayed\n",
    "def get_data(uuid, sample_submitter_id):\n",
    "    \"\"\"Load one file's two-column table and return it as a single-column DataFrame.\n",
    "\n",
    "    The file UUID is resolved through the indexd endpoint to its ``s3://`` URL,\n",
    "    and the gzip-compressed, tab-separated file at that URL is read with pandas.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    uuid : str\n",
    "        File identifier appended to ``indexd_endpt``.\n",
    "    sample_submitter_id : str\n",
    "        Name given to the single value column of the result.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Indexed by the file's first column, with one column named\n",
    "        ``sample_submitter_id``. Values are strings because of ``dtype=str``.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    requests.HTTPError\n",
    "        If the indexd lookup returns an error status.\n",
    "    ValueError\n",
    "        If the indexd record lists no ``s3://`` URL.\n",
    "    \"\"\"\n",
    "    # indexd_endpt may already end in '/'; normalise so we never request '.../index//<uuid>'.\n",
    "    record_url = indexd_endpt.rstrip(\"/\") + \"/\" + uuid\n",
    "    query_response = requests.get(record_url, timeout=60)\n",
    "    query_response.raise_for_status()\n",
    "\n",
    "    s3_urls = [x for x in query_response.json()[\"urls\"] if x.startswith(\"s3://\")]\n",
    "    if not s3_urls:\n",
    "        # Previously this fell through to an IndexError on url[0].\n",
    "        raise ValueError(\"No s3:// URL found for UUID \" + uuid)\n",
    "    if len(s3_urls) > 1:\n",
    "        # The first URL is still used. NOTE(review): presumably this print lands in the\n",
    "        # Dask worker's log rather than the notebook - confirm.\n",
    "        print(\"Something weird with UUID \" + uuid + \" returned \" + str(s3_urls))\n",
    "    url = s3_urls[0]\n",
    "\n",
    "    content = pd.read_csv(url, compression='gzip', header=None, dtype=str, sep=\"\\t\")\n",
    "    # Index by the first column before renaming, so the index keeps its original name (0).\n",
    "    content.index = content[0]\n",
    "    content.columns = ['id', sample_submitter_id]\n",
    "    return content[[sample_submitter_id]]\n",
    "\n",
    "# One lazy task per file, labelled with the first sample of the file's first case.\n",
    "delayed_results = [\n",
    "    get_data(file_entry[\"id\"], file_entry[\"cases\"][0][\"samples\"][0][\"submitter_id\"])\n",
    "    for file_entry in files_json\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 30.3 s, sys: 4.16 s, total: 34.5 s\n",
      "Wall time: 57.5 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TCGA-E9-A1RI-11ATCGA-C8-A8HQ-01ATCGA-BH-A0BJ-11ATCGA-OL-A66O-01ATCGA-A7-A13E-11ATCGA-PL-A8LX-01ATCGA-A8-A09V-01ATCGA-AR-A5QM-01ATCGA-BH-A0AY-11ATCGA-A7-A2KD-01A...TCGA-E9-A226-01ATCGA-AR-A0U0-01ATCGA-AO-A129-01ATCGA-C8-A26Y-01ATCGA-AR-A2LQ-01ATCGA-D8-A1XD-01ATCGA-E2-A570-01ATCGA-E2-A574-01ATCGA-D8-A73W-01ATCGA-B6-A2IU-01A
0
ENSG00000000003.13763915055050279634009824952126844514883...325315993205224031072846439446101538570
ENSG00000000005.535413154699202446603...1031050014424
ENSG00000000419.111999180315771890124217211919137517178033...178820794173483811092802219638991853603
ENSG00000000457.1212871763200711749317594909149013432769...316710341237134516191310283164320962349
ENSG00000000460.1529095231962525918216474783141359...19906581101635394448677668313615
..................................................................
__no_feature53499824816279310085733371772634237246215810968241283090533489465958795...4836012273343533326735664496275354639191882826989200745743682552230344
__ambiguous2732915286219121535193728715187712422137313061277238945823372983105025...2483817271207129198763011731189022418560442007640251616122351321295034
__too_low_aQual0000000000...0000000000
__not_aligned0000000000...0000000000
__alignment_not_unique31446784147541411439254920705051143257661266421319475816139537521578478220035954...1375351315441474210311002225624912859023115568641285221836710840149827696625883
\n", "

60488 rows × 1222 columns

\n", "
" ], "text/plain": [ " TCGA-E9-A1RI-11A TCGA-C8-A8HQ-01A TCGA-BH-A0BJ-11A \\\n", "0 \n", "ENSG00000000003.13 7639 1505 5050 \n", "ENSG00000000005.5 3541 3 154 \n", "ENSG00000000419.11 1999 1803 1577 \n", "ENSG00000000457.12 1287 1763 2007 \n", "ENSG00000000460.15 290 952 319 \n", "... ... ... ... \n", "__no_feature 5349982 4816279 3100857 \n", "__ambiguous 2732915 2862191 2153519 \n", "__too_low_aQual 0 0 0 \n", "__not_aligned 0 0 0 \n", "__alignment_not_unique 31446784 14754141 14392549 \n", "\n", " TCGA-OL-A66O-01A TCGA-A7-A13E-11A TCGA-PL-A8LX-01A \\\n", "0 \n", "ENSG00000000003.13 2796 3400 982 \n", "ENSG00000000005.5 6 992 0 \n", "ENSG00000000419.11 1890 1242 1721 \n", "ENSG00000000457.12 1174 931 759 \n", "ENSG00000000460.15 625 259 182 \n", "... ... ... ... \n", "__no_feature 3337177 2634237 2462158 \n", "__ambiguous 3728715 1877124 2213731 \n", "__too_low_aQual 0 0 0 \n", "__not_aligned 0 0 0 \n", "__alignment_not_unique 20705051 14325766 12664213 \n", "\n", " TCGA-A8-A09V-01A TCGA-AR-A5QM-01A TCGA-BH-A0AY-11A \\\n", "0 \n", "ENSG00000000003.13 4952 1268 4451 \n", "ENSG00000000005.5 2 44 660 \n", "ENSG00000000419.11 1919 1375 1717 \n", "ENSG00000000457.12 4909 1490 1343 \n", "ENSG00000000460.15 1647 478 314 \n", "... ... ... ... \n", "__no_feature 10968241 2830905 3348946 \n", "__ambiguous 3061277 2389458 2337298 \n", "__too_low_aQual 0 0 0 \n", "__not_aligned 0 0 0 \n", "__alignment_not_unique 19475816 13953752 15784782 \n", "\n", " TCGA-A7-A2KD-01A ... TCGA-E9-A226-01A \\\n", "0 ... \n", "ENSG00000000003.13 4883 ... 3253 \n", "ENSG00000000005.5 3 ... 1 \n", "ENSG00000000419.11 8033 ... 1788 \n", "ENSG00000000457.12 2769 ... 3167 \n", "ENSG00000000460.15 1359 ... 1990 \n", "... ... ... ... \n", "__no_feature 5958795 ... 4836012 \n", "__ambiguous 3105025 ... 2483817 \n", "__too_low_aQual 0 ... 0 \n", "__not_aligned 0 ... 0 \n", "__alignment_not_unique 20035954 ... 
13753513 \n", "\n", " TCGA-AR-A0U0-01A TCGA-AO-A129-01A TCGA-C8-A26Y-01A \\\n", "0 \n", "ENSG00000000003.13 1599 3205 2240 \n", "ENSG00000000005.5 0 31 0 \n", "ENSG00000000419.11 2079 4173 4838 \n", "ENSG00000000457.12 1034 1237 1345 \n", "ENSG00000000460.15 658 1101 635 \n", "... ... ... ... \n", "__no_feature 2733435 3332673 5664496 \n", "__ambiguous 2712071 2919876 3011731 \n", "__too_low_aQual 0 0 0 \n", "__not_aligned 0 0 0 \n", "__alignment_not_unique 15441474 21031100 22256249 \n", "\n", " TCGA-AR-A2LQ-01A TCGA-D8-A1XD-01A TCGA-E2-A570-01A \\\n", "0 \n", "ENSG00000000003.13 3107 2846 4394 \n", "ENSG00000000005.5 50 0 1 \n", "ENSG00000000419.11 1109 2802 2196 \n", "ENSG00000000457.12 1619 1310 2831 \n", "ENSG00000000460.15 394 448 677 \n", "... ... ... ... \n", "__no_feature 2753546 3919188 2826989 \n", "__ambiguous 1890224 1856044 2007640 \n", "__too_low_aQual 0 0 0 \n", "__not_aligned 0 0 0 \n", "__alignment_not_unique 12859023 11556864 12852218 \n", "\n", " TCGA-E2-A574-01A TCGA-D8-A73W-01A TCGA-B6-A2IU-01A \n", "0 \n", "ENSG00000000003.13 4610 1538 570 \n", "ENSG00000000005.5 4 4 24 \n", "ENSG00000000419.11 3899 1853 603 \n", "ENSG00000000457.12 643 2096 2349 \n", "ENSG00000000460.15 668 313 615 \n", "... ... ... ... 
\n", "__no_feature 2007457 4368255 2230344 \n", "__ambiguous 2516161 2235132 1295034 \n", "__too_low_aQual 0 0 0 \n", "__not_aligned 0 0 0 \n", "__alignment_not_unique 36710840 14982769 6625883 \n", "\n", "[60488 rows x 1222 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Run every delayed download on the cluster, then join the per-sample\n",
    "# columns side by side on the union of their row labels.\n",
    "per_sample_frames = dask.compute(*delayed_results)\n",
    "df = pd.concat(per_sample_frames, axis=1, join=\"outer\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale the worker service back to zero tasks and wait until ECS reports it stable.\n",
    "numWorkers = 0\n",
    "worker_service = 'Dask-Worker'\n",
    "ecs.update_service(cluster=cluster, service=worker_service, desiredCount=numWorkers)\n",
    "stable_waiter = ecs.get_waiter('services_stable')\n",
    "stable_waiter.wait(cluster=cluster, services=[worker_service])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}