# BACKGROUND # # This script creates lists of train/label artifacts suitable for training the BBBC021 dataset #. https://bbbc.broadinstitute.org/BBBC021 # # To demonstrate prediction of mechanisn of action (MOA), a separate training set is created for each compound of known MOA, such # that the remaining labeled compounds can be used to train a classifier, which in turn can attempt to successfully classify the # left out compound. # # Once the classifier is trained, there are two 'best-matching' methods: # # NSC - not same compound - only considers matches to other compounds # NSCB - not same compound or batch - excludes both same compound and batch before best-match # # In addition to NSC vs NSCB for matching, there are two different scoring methods: # # Per Treatment - averages embeddings across all wells for a given treatment #. Per Well - averages embeddings only within wells # # USAGE # # This script is run after all bbbc021 plate imagery is loaded into bioimage search. # # It requires as input an 'embeddingName' which is used to identify the particular dataset of interest for training, in terms of its image processing # and training parameters. Different trainIds within the context of an embedding correspond to different training runs with different partitions of # training data, but they do not vary in structural parameters for image processing or training. # # We first use utilities to get BBBC-021 metadata that identifies the compound and moa for every plate, well, and image. # # Our output will consist of a separate image artifact "exclusion" list per compound with a known MOA, corresponding to the compound to be # left out during the assembly of the training files. # # Although the dataset has 113 compounds, only a subset of these have known MOAs and therefore can be used. # # Our general plan is to: # # 1 - Load BBBC-021 metadata #. 2 - Create a map of lists, the keys for which corresponds to each compound with known moa. The list will be a list of objects, each of which will #. contain sufficient info to specify the training artifacts, namely, { plateId, imageId } - we also need embeddingName but that is global context. # 3 - Get the list of plateIds compatible with the specified embedding #. 4 - Iterate through the list of compatible plateIds #. 5 - For each plateId, iterate through its member images # 6 - For each member image, iterate over each moa-compound # 7 - For each moa-compound, create a list of corresponding imageIds. # 8 - Once all lists are populated, write the exclusion files locally. # ############################################################################################### import sys import argparse import boto3 from pathlib import Path import bbbc021common as bb s3c = boto3.client('s3') sys.path.insert(0, "../../../cli/bioims/src") import bioims parser = argparse.ArgumentParser() parser.add_argument('--bbbc021-bucket', type=str, required=True, help='bbbc021 bucket') parser.add_argument('--bioims-resource-bucket', type=str, required=True, help='resource bucket') parser.add_argument('--embeddingName', type=str, required=True, help='embedding name') args = parser.parse_args() BBBC021_BUCKET = args.bbbc021_bucket BIOIMS_INPUT_BUCKET = args.bioims_resource_bucket EMBEDDING = args.embeddingName image_df, moa_df = bb.Bbbc021PlateInfoByDF.getDataFrames(BBBC021_BUCKET) compound_moa_map = bb.Bbbc021PlateInfoByDF.getCompoundMoaMapFromDf(moa_df) # We need to go from imageId->ImageSourceId->compound->moa # 'Image_FileName_DAPI[:-4]' serves as the ImageSourceId sourceCompoundMap={} for i in range(len(image_df.index)): r = image_df.iloc[i] imageSourceId = r['Image_FileName_DAPI'][:-4] compound = r['Image_Metadata_Compound'] sourceCompoundMap[imageSourceId]=compound bbbc021ImageCount = len(image_df.index) print("BBBC-021 image count={}".format(bbbc021ImageCount)) imagesRemovedByCompound={} moaDict={} i=0 for k, v in compound_moa_map.items(): print("i={} key={} value={}".format(i,k,v)) moaDict[v]=True removedList = [] imagesRemovedByCompound[k]=removedList i+=1 imageClient = bioims.client('image-management') trainingConfigurationClient = bioims.client('training-configuration') embeddingInfo = trainingConfigurationClient.getEmbeddingInfo(EMBEDDING) print(embeddingInfo) width = embeddingInfo['inputWidth'] height = embeddingInfo['inputHeight'] depth = embeddingInfo['inputDepth'] channels = embeddingInfo['inputChannels'] plateList = imageClient.listCompatiblePlates(width, height, depth, channels) for i, pi in enumerate(plateList): plateId = pi['plateId'] print("Plate {} {}".format(i, plateId)) imageList = imageClient.getImagesByPlateId(plateId) for imageItem in imageList: image = imageItem['Item'] imageId = image['imageId'] imageSourceId = image['imageSourceId'] imageCompound = sourceCompoundMap[imageSourceId] if 'trainCategory' in image and 'trainLabel' in image: trainCategory = image['trainCategory'] trainLabel = image['trainLabel'] if trainCategory=='moa' and moaDict[trainLabel]: for compound, moa in compound_moa_map.items(): if compound==imageCompound: imagesRemovedByCompound[compound].append(imageId) for compound, imageList in imagesRemovedByCompound.items(): l = len(imageList) print("{} has {} entries".format(compound, l)) cnws ="".join(compound.split()) c2 = cnws.replace('/','-') trainFile = c2 + "-filter.txt" print("Writing {}".format(trainFile)) f = open(trainFile, "w") for imageId in imageList: f.write(imageId+'\n') f.close() trainPath = "train-filter/" + EMBEDDING + "/" + trainFile with open(trainFile, 'rb') as fdata: s3c.upload_fileobj(fdata, BIOIMS_INPUT_BUCKET, trainPath) fnPath=Path(trainFile) fnPath.unlink()