In [1]:
# https://docs.gdc.cancer.gov/API/Users_Guide/Search_and_Retrieval/

import requests
import json
import re
import boto3
import gzip
import pandas as pd 

data_endpt = 'https://api.gdc.cancer.gov/data'
cases_endpt = 'https://api.gdc.cancer.gov/cases'
files_endpt = 'https://api.gdc.cancer.gov/files'

s3_tcga_bucket = 'tcga-2-open'
s3 = boto3.resource('s3')


The code failed because of a fatal error:
	Error sending http request and maximum retry encountered..

Some things to try:
a) Make sure Spark has enough available resources for Jupyter to create a Spark context.
b) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.
c) Restart the kernel.


In [2]:
## Query Settings

# primary_site = "Breast"
project_id = "TCGA-BRCA"

data_type = "Gene Expression Quantification" # RNA-Seq
workflow_type = "HTSeq - Counts"
size = 2000

# The 'fields' parameter is passed as a comma-separated string of single names.
fields = [
 "file_name"
 , "cases.primary_site"
 , "cases.case_id"
 , "cases.project.project_id"
 , "cases.days_to_lost_to_followup"
 , "cases.submitter_id"
 , "cases.samples.submitter_id"
 , "cases.samples.sample_id"

]

fields = ','.join(fields)

#cases.project.project_id in ["TCGA-BRCA"] and files.data_type in ["Gene Expression Quantification"]
filters = {
 "op":"and",
 "content":[
 {"op": "in",
 "content":{
 "field": "cases.project.project_id",
 "value": [project_id]
 }
 },
 {"op": "in",
 "content":{
 "field": "files.data_type",
 "value": [data_type]
 }
 },
 {"op": "in",
 "content":{
 "field": "files.analysis.workflow_type",
 "value": [workflow_type]
 }
 }
 ]
}

# With a GET request, the filters parameter needs to be converted
# from a dictionary to JSON-formatted string

params = {
 "filters": json.dumps(filters),
 "fields": fields,
 "format": "JSON",
 "size": size
 }



The code failed because of a fatal error:
	Error sending http request and maximum retry encountered..

Some things to try:
a) Make sure Spark has enough available resources for Jupyter to create a Spark context.
b) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.
c) Restart the kernel.


In [130]:
## Get Files

query_response = requests.get(files_endpt, params = params)

json_response = json.loads(query_response.content.decode("utf-8"))["data"]["hits"]

print (len(json_response))
print(json_response)

files_json = json_response

1222
[{'id': '3985f626-17e7-40e2-9429-72df429d2325', 'cases': [{'project': {'project_id': 'TCGA-BRCA'}, 'primary_site': 'Breast', 'submitter_id': 'TCGA-E9-A1NF', 'samples': [{'sample_id': '0c30cd0e-a2d2-42b0-9ca9-8297de64e4a8', 'submitter_id': 'TCGA-E9-A1NF-01A'}], 'case_id': 'a8b1f6e7-2bcf-460d-b1c6-1792a9801119'}], 'file_name': 'e9cbd268-573f-41fc-9f17-d25e1d1c9759.htseq.counts.gz'}, {'id': 'c54676b5-d200-4c18-846d-ca510b0d4865', 'cases': [{'project': {'project_id': 'TCGA-BRCA'}, 'primary_site': 'Breast', 'submitter_id': 'TCGA-D8-A27M', 'samples': [{'sample_id': 'd4371446-8326-4e35-9e86-d8a48393efae', 'submitter_id': 'TCGA-D8-A27M-01A'}], 'case_id': 'ae65baeb-6b78-492a-8c63-bb7e93e83dc2'}], 'file_name': '92891ee4-51a0-4e4d-b6db-70fbb2a756bc.htseq.counts.gz'}, {'id': '70fa91a5-a459-4483-a204-2d997863a0fd', 'cases': [{'project': {'project_id': 'TCGA-BRCA'}, 'primary_site': 'Breast', 'submitter_id': 'TCGA-BH-A0GZ', 'samples': [{'sample_id': '1142b1cf-ebf3-485b-aae2-dcf4d8793212', 'submi

In [191]:
## Process Sample Files

df=pd.DataFrame()
#df = pd.DataFrame([[ENSG, 2], [ENSG, 4]], columns=list('EnsGeneID-SampleID'))


# This step populates the download list with the file_ids from the previous query
for file_entry in files_json:
 file_path = file_entry["id"]+"/"+file_entry["file_name"]
 sample_submitter_id = file_entry["cases"][0]["samples"][0]["submitter_id"]
# print(sample_submitter_id)
 obj = s3.Object(s3_tcga_bucket, file_path)
 with gzip.GzipFile(fileobj=obj.get()["Body"]) as gzipfile:
 content = pd.read_csv(gzipfile, header=None, dtype=str, sep="\t")
 content.index = content[0]
 content.columns = ['id', sample_submitter_id]
 df = pd.concat([df, content[[sample_submitter_id]]], axis=1, join="outer")
 
print(df)


 TCGA-E9-A1NF-01A TCGA-D8-A27M-01A TCGA-BH-A0GZ-01A \
0 
ENSG00000000003.13 952 2511 2095 
ENSG00000000005.5 2 40 19 
ENSG00000000419.11 1041 1455 1680 
ENSG00000000457.12 906 1978 960 
ENSG00000000460.15 289 725 397 
... ... ... ... 
__no_feature 2100993 5639562 2883795 
__ambiguous 1797118 3059577 2424177 
__too_low_aQual 0 0 0 
__not_aligned 0 0 0 
__alignment_not_unique 9951921 21527391 17750412 

 TCGA-BH-A18V-01A TCGA-A7-A13G-01B TCGA-C8-A275-01A \
0 
ENSG00000000003.13 1097 1009 2868 
ENSG00000000005.5 24 65 10 
ENSG00000000419.11 3564 547 3009 
ENSG00000000457.12 1629 3914 4780 
ENSG00000000460.15 2180 799 1571 
... ... ... ... 
__no_feature 3925724 78728580 3608563 
__ambiguous 2430874 2571234 3632655 
__too_low_aQual 0 0 0 
__not_aligned 0 0 0 
__alignment_not_unique 13946922 52034848 27174935 

 TCGA-AN-A0XS-01A TCGA-E9-A1NA-11A TCGA-OL-A5RW-01A \
0 
ENSG00000000003.13 987 3553 1217 
ENSG00000000005.5 249 1076 0 
ENSG00000000419.11 1830 1360 1543 
ENSG00000000457.12 1569 116

In [None]:
df.to_csv('automl-train.csv', index=True, header=True) # Make sure features are comma-separated

csv_buffer = StringIO()
df.to_csv(csv_buffer)
s3.Object("amazonhokie", 'df.csv').put(Body=csv_buffer.getvalue())

In [2]:
df.to_csv("s3://amazonhokie/"+"test.csv")

The code failed because of a fatal error:
	Error sending http request and maximum retry encountered..

Some things to try:
a) Make sure Spark has enough available resources for Jupyter to create a Spark context.
b) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.
c) Restart the kernel.


In [None]:
## Read Files using EMR / spark
file_uuids_list =[]
for file_entry in files_json:
 file_path = file_entry["id"]+"/"+file_entry["file_name"]
 file_uuids_list.append(file_path)
 
val input = spark.read.option("header", true).csv(file_uuids_list)


In [194]:
import pyarrow as pa
import pyarrow.parquet as pq


table = pa.Table.from_pandas(df)
pq.write_table(table, project_id+workflow_type +'.parquet')

ModuleNotFoundError: No module named 'pyarrow'

In [184]:
## Query Settings

size = 2000
data_category = "Clinical"
data_format = "bcr xml"

# The 'fields' parameter is passed as a comma-separated string of single names.
fields = [
 "file_name"
 , "cases.demographics.vital_status"
 , "cases.primary_site"
 , "cases.case_id"
 , "cases.project.project_id"
 , "cases.days_to_lost_to_followup"
 , "cases.submitter_id"
 , "cases.samples.submitter_id"
 , "cases.samples.sample_id"

]

fields = ','.join(fields)

#cases.project.project_id in ["TCGA-BRCA"] and files.data_type in ["Gene Expression Quantification"]
filters = {
 "op": "and",
 "content":
 [{"op": "in",
 "content":{
 "field": "cases.project.project_id",
 "value": [project_id]
 }
 },{"op": "in",
 "content":{
 "field": "files.data_category",
 "value": [data_category]
 }
 } ,{"op": "in",
 "content":{
 "field": "files.data_format",
 "value": [data_format]
 }
 } 
 ]
}

# With a GET request, the filters parameter needs to be converted
# from a dictionary to JSON-formatted string

params = {
 "filters": json.dumps(filters),
 "fields": fields,
 "format": "JSON",
 "size": size
 }


## Get Files

clin_response = requests.get(files_endpt, params = params)

json_clin = json.loads(clin_response.content.decode("utf-8"))["data"]["hits"]
print (len(json_clin))
#print(json_clin)


1097
[{'file_name': 'nationwidechildrens.org_clinical.TCGA-4H-AAAK.xml', 'id': '50c36de6-81a2-499e-ada9-da5479329473', 'cases': [{'primary_site': 'Breast', 'case_id': '6623fc5e-00be-4476-967a-cbd55f676ea6', 'submitter_id': 'TCGA-4H-AAAK', 'project': {'project_id': 'TCGA-BRCA'}}]}, {'file_name': 'nationwidechildrens.org_clinical.TCGA-C8-A12P.xml', 'id': '014f5ae1-5862-4165-9a3b-bba7bb08a527', 'cases': [{'primary_site': 'Breast', 'case_id': 'abdc76db-f85e-4337-a57e-6d098789da03', 'submitter_id': 'TCGA-C8-A12P', 'project': {'project_id': 'TCGA-BRCA'}}]}, {'file_name': 'nationwidechildrens.org_clinical.TCGA-AN-A0FF.xml', 'id': '0382de14-79b0-4fdc-bf14-d82494cbcdee', 'cases': [{'primary_site': 'Breast', 'case_id': 'd5cab7f5-e4f1-40ee-a0c8-4c8004c1c9a0', 'submitter_id': 'TCGA-AN-A0FF', 'project': {'project_id': 'TCGA-BRCA'}}]}, {'file_name': 'nationwidechildrens.org_clinical.TCGA-BH-A1EW.xml', 'id': '4cc94cff-842d-4473-9de6-27a404dbe215', 'cases': [{'primary_site': 'Breast', 'case_id': '9d16

In [185]:
## Process Clinical Files

df_clin=pd.DataFrame(columns = list(["pt_id", "vital_status", "days_to_death"]))
import xml.etree.ElementTree as ET

ns= { 'admin': 'http://tcga.nci/bcr/xml/clinical/brca/2.7',
 'clin_shared':"http://tcga.nci/bcr/xml/clinical/shared/2.7",
 'shared':"http://tcga.nci/bcr/xml/shared/2.7"
 }
 #xmlns:brca_shared="http://tcga.nci/bcr/xml/clinical/brca/shared/2.7" xmlns:shared_stage="http://tcga.nci/bcr/xml/clinical/shared/stage/2.7" xmlns:brca_nte="http://tcga.nci/bcr/xml/clinical/brca/shared/new_tumor_event/2.7/1.0" xmlns:nte="http://tcga.nci/bcr/xml/clinical/shared/new_tumor_event/2.7" xmlns:follow_up_v4.0="http://tcga.nci/bcr/xml/clinical/brca/followup/2.7/4.0" xmlns:rx="http://tcga.nci/bcr/xml/clinical/pharmaceutical/2.7" xmlns:rad="http://tcga.nci/bcr/xml/clinical/radiation/2.7">

# This step populates the download list with the file_ids from the previous query
for file_entry in json_clin:
 file_path = file_entry["id"]+"/"+file_entry["file_name"]
 submitter_id = file_entry["cases"][0]["submitter_id"]

 #print(file_path)
 obj = s3.Object(s3_tcga_bucket, file_path)
 tree = ET.parse(obj.get()["Body"])
 root = tree.getroot()
 
 for pt in root.findall('admin:patient', ns):
 vital = pt.find('clin_shared:vital_status', ns).text
 pt_id = pt.find('shared:bcr_patient_barcode', ns).text
 days_to_death = pt.find('clin_shared:days_to_death', ns).text
 #print(pt_id, vital, days_to_death)
 df_clin = df_clin.append({'pt_id': pt_id, 'vital_status': vital, 'days_to_death': days_to_death}, ignore_index=True)

df_clin.index = df_clin["pt_id"] 
print(df_clin)


50c36de6-81a2-499e-ada9-da5479329473/nationwidechildrens.org_clinical.TCGA-4H-AAAK.xml
TCGA-4H-AAAK Alive None
014f5ae1-5862-4165-9a3b-bba7bb08a527/nationwidechildrens.org_clinical.TCGA-C8-A12P.xml
TCGA-C8-A12P Alive None
0382de14-79b0-4fdc-bf14-d82494cbcdee/nationwidechildrens.org_clinical.TCGA-AN-A0FF.xml
TCGA-AN-A0FF Alive None
4cc94cff-842d-4473-9de6-27a404dbe215/nationwidechildrens.org_clinical.TCGA-BH-A1EW.xml
TCGA-BH-A1EW Dead 1694
c2bf0597-0315-417b-b477-c44a9a7eeaea/nationwidechildrens.org_clinical.TCGA-A8-A06Z.xml
TCGA-A8-A06Z Alive None
a1d0889c-a7ef-4c5e-bb7f-7dbc7e5e8e7b/nationwidechildrens.org_clinical.TCGA-E9-A1NI.xml
TCGA-E9-A1NI Alive None
1bd2a479-da63-4cee-a722-31570b97c254/nationwidechildrens.org_clinical.TCGA-BH-A1EY.xml
TCGA-BH-A1EY Dead 538
7fb10bf6-295a-4b1c-9862-b027907f51cb/nationwidechildrens.org_clinical.TCGA-A7-A13G.xml
TCGA-A7-A13G Alive None
4cb4f2d3-d48a-4843-8eb5-ee704ebfb13a/nationwidechildrens.org_clinical.TCGA-A2-A3XW.xml
TCGA-A2-A3XW Alive None
ee9e

In [192]:
## Map Patient to Sample

print(df)
print(df_clin)
response_var = "vital_status"

for i in df.index:
 print(df.index)
 pt_id = df.index[i][0:11]
 df.at[i, response_var] = df_clin[pt_id,response_var]
 
 

SyntaxError: unexpected EOF while parsing (, line 13)

In [None]:
## Save training set to S3
bucket = 'amazonhokie'
prefix = 'sagemaker/DEMO-xgboost-dm'
 
# Define IAM role
from sagemaker import get_execution_role
sess = sagemaker.Session()

sm = boto3.client('sagemaker')
role = get_execution_role()

df.to_csv('automl-train.csv', index=False, header=True) # Make sure features are comma-separated
sess.upload_data(path='automl-train.csv', key_prefix=prefix + '/input')


## Prep Autopilot
input_data_config = [{
 'DataSource': {
 'S3DataSource': {
 'S3DataType': 'S3Prefix',
 'S3Uri': 's3://{}/{}/input'.format(bucket,prefix)
 }
 },
 'TargetAttributeName': response_var
 }
 ]

output_data_config = {
 'S3OutputPath': 's3://{}/{}/output'.format(bucket,prefix)
 }


## Run Autopilot
auto_ml_job_name = 'automl-dm-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,
 InputDataConfig=input_data_config,
 OutputDataConfig=output_data_config,
 RoleArn=role)