# SageMaker PyTorch training entry point: classify chest CT DICOM images
# (classes: cap / normal / covid) with a MONAI DenseNet-121.

import argparse
import json
import logging
import os
import sys

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import Dataset, DataLoader

import torchvision
import pandas as pd
import numpy as np
from PIL import Image

from monai.config import print_config
from monai.transforms import (
    Compose,
    LoadImage,
    Resize,
    ScaleIntensity,
    ToTensor,
    RandRotate,
    RandFlip,
    RandZoom,
)
from monai.networks.nets import densenet121

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


class DICOMDataset(Dataset):
    """Pairs DICOM image paths with integer labels; transforms are applied on access."""

    def __init__(self, image_files, labels, transforms):
        self.image_files = image_files
        self.labels = labels
        self.transforms = transforms

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, index):
        return self.transforms(self.image_files[index]), self.labels[index]


def _get_train_data_loader(batch_size, trainX, trainY, is_distributed, **kwargs):
    logger.info("Get train data loader")
    # NOTE: the numeric augmentation/resize parameters below are assumed typical
    # values; the exact numbers could not be recovered from the corrupted source.
    train_transforms = Compose([
        LoadImage(image_only=True),
        ScaleIntensity(),
        RandRotate(range_x=15, prob=0.5, keep_size=True),
        RandFlip(spatial_axis=0, prob=0.5),
        Resize(spatial_size=(224, 224)),
        ToTensor(),
    ])
    dataset = DICOMDataset(trainX, trainY, train_transforms)
    train_sampler = (
        torch.utils.data.distributed.DistributedSampler(dataset) if is_distributed else None
    )
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=train_sampler is None,
        sampler=train_sampler,
        **kwargs
    )


def train(args):
    is_distributed = len(args.hosts) > 1 and args.backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))
    use_cuda = args.num_gpus > 0
    logger.debug("Number of gpus available - {}".format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        os.environ['RANK'] = str(host_rank)
        dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size)
        logger.info(
            "Initialized the distributed environment: '{}' backend on {} nodes. ".format(
                args.backend, dist.get_world_size())
            + "Current host rank is {}. \nNumber of gpus: {}".format(dist.get_rank(), args.num_gpus))

    # Set the seed for generating random numbers.
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    # Read the labeling manifest and build parallel lists of image paths and labels.
    image_label_list = []
    image_file_list = []
    metadata = args.data_dir + '/manifest.json'
    with open(metadata) as f:
        manifest = json.load(f)

    my_dictionary = {'cap': 0, 'normal': 1, 'covid': 2}
    class_names = list(my_dictionary.keys())
    num_class = len(class_names)

    # NOTE: the manifest schema is inferred from the recovered field names
    # ('filename', 'content', 'label').
    for entry in manifest:
        filename = entry['filename']
        image_file = args.data_dir + '/' + filename
        image_file_list.append(image_file)
        label = entry['content']['label']
        label_numeric = my_dictionary[label]
        image_label_list.extend([label_numeric])

    print('Training count =', len(image_file_list), len(image_label_list))

    train_loader = _get_train_data_loader(
        args.batch_size, image_file_list, image_label_list, is_distributed, **kwargs)

    model = densenet121(spatial_dims=2, in_channels=1, out_channels=num_class).to(device)
    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), 1e-5)
    epoch_num = args.epochs
    val_interval = 1
    best_metric = -1
    best_metric_epoch = -1
    epoch_loss_values = list()
    metric_values = list()

    for epoch in range(epoch_num):
        logger.info('-' * 10)
        logger.info(f"epoch {epoch + 1}/{epoch_num}")
        model.train()
        epoch_loss = 0
        step = 0
        for batch_data in train_loader:
            step += 1
            inputs = batch_data[0].to(device)
            print('inputs shape is -----', inputs.shape)
            # LoadImage returns channel-last image data; move the channel axis in
            # front of the spatial dimensions for the 2D DenseNet.
            inputs = inputs.permute(0, 3, 1, 2)
            print('inputs shape after is -----', inputs.shape)
            labels = batch_data[1].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            epoch_len = len(train_loader.dataset) // train_loader.batch_size
            logger.info(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}")
        epoch_loss /= step
        epoch_loss_values.append(epoch_loss)
        logger.info(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")

    save_model(model, args.model_dir)


def save_model(model, model_dir):
    logger.info("Saving the model.")
    path = os.path.join(model_dir, 'model.pth')
    torch.save(model.cpu().state_dict(), path)


def model_fn(model_dir):
    """SageMaker inference hook: rebuild the network and load the trained weights."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = densenet121(spatial_dims=2, in_channels=1, out_channels=3)
    with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f))
    return model.to(device)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Training hyperparameters.
    # NOTE: the numeric defaults mirror the help strings; the original defaults
    # could not be recovered exactly and should be treated as assumptions.
    parser.add_argument('--batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for training (default: 1000)')
    parser.add_argument('--test-batch-size', type=int, default=100, metavar='N',
                        help='input batch size for testing (default: 100)')
    parser.add_argument('--epochs', type=int, default=5, metavar='N',
                        help='number of epochs to train (default: 5)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=100, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--backend', type=str, default=None,
                        help='backend for distributed training (tcp, gloo on cpu and gloo, nccl on gpu)')

    # Container environment provided by SageMaker.
    parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))
    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--num-gpus', type=int, default=os.environ['SM_NUM_GPUS'])

    train(parser.parse_args())
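# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the recovered source). Because
# the __main__ block reads the SM_* environment variables, this module is
# normally launched as a SageMaker PyTorch entry point; the entry_point,
# framework_version, instance_type, and hyperparameter values below are
# assumptions, not recovered settings:
#
#     from sagemaker.pytorch import PyTorch
#
#     estimator = PyTorch(
#         entry_point="monai_dicom.py",
#         source_dir="source",               # assumed path to this script
#         role=role,                         # an existing SageMaker execution role
#         framework_version="1.5.0",         # assumed framework version
#         py_version="py3",
#         instance_count=1,
#         instance_type="ml.p3.2xlarge",     # assumed instance type
#         hyperparameters={"epochs": 5, "batch-size": 4},
#     )
#     estimator.fit({"train": "s3://<bucket>/<prefix>/"})
#
# The "train" channel above is what populates SM_CHANNEL_TRAIN inside the
# training container, and the directory it points to must contain the DICOM
# files plus the manifest.json read by train().
# ---------------------------------------------------------------------------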