3 \b!@sjddlZddlZddlZddlZddlZddlZddljZddl j Z ddl j j Z ddljZddlZddlZddlmZmZddlZddlZddlZddlmZddlmZddlmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%ddl&m'Z'ej(e)Z*e*j+ej,e*j-ej.ej/GdddeZ0d d Z1d d Z2d dZ3ddZ4e)dkrfej5Z6e6j7de8dddde6j7de8dddde6j7de8dddde6j7de9dddde6j7d e9d!d"d#de6j7d$e8d%d&d'de6j7d(e8ddd)de6j7d*e:dd+d,e6j7d-e;ejdS)8N)Dataset DataLoader)Image) print_config)Compose LoadImageResizeScaleIntensityToTensor RandRotateRandFlipRandZoom) densenet121c@s$eZdZddZddZddZdS) DICOMDatasetcCs||_||_||_dS)N) image_fileslabels transforms)selfrrrrR/home/ec2-user/SageMaker/MONAI-MedicalImage-SageMaker/Classification/code/train.py__init__!szDICOMDataset.__init__cCs t|jS)N)lenr)rrrr__len__&szDICOMDataset.__len__cCs|j|j||j|fS)N)rrr)rindexrrr __getitem__)szDICOMDataset.__getitem__N)__name__ __module__ __qualname__rrrrrrrrsrcKstjdttddttddddtdddtdd tg}t |||}|r`t j j j j|nd}t j j j|f||dk|d |S)NzGet train data loaderT) image_onlyg?)Zrange_xprobZ keep_sizer)Z spatial_axisr ) spatial_size) batch_sizeshuffleZsampler)r!r&)loggerinforrr r r rr rtorchutilsdata distributedZDistributedSamplerr)r$ZtrainXZtrainYis_distributedkwargsZtrain_transformsdatasetZ train_samplerrrr_get_train_data_loader-s     r0c'Csxt|jdko|jdk }tjdj||jdk}tjdj|j|rRdddni}tj|rbdnd }|rt|j}t |t j d <|jj |j }t |t j d <tj|j||d tjd j|jtjdjtj|jtj|j|rtjj|jg}g}|jd} t| } tj| } WdQRXdddd} t| j} t| }g}g}g}g}xP| D]H}|d}|jd|}|j||dd}| |}|j|ggq`Wtdt|t|j||df|}t ddddj!|}tj"j#}tj$j%|j&d}|j'}d}d"}d#}t}t}xHt(|D]:}tj)d$tj)d|dd||j*d}d} x|D]}!| d7} |!dj!|}"td|"j+|"j,dddd}"td|"j+|!ddj!|}#|j-||"}$||$|#}%|%j.|j/||%j07}tj)| dt|j1|jd|%j0d t|j1|j}&qhW|| }|j|tj)d|dd!|d q(Wt2||j3dS)%Nr"zDistributed training - {}rzNumber of gpus available - {} T) num_workersZ pin_memorycudacpuZ WORLD_SIZEZRANK)backendrank world_sizezCInitialized the distributed environment: '{}' backend on {} nodes. z+Current host rank is {}. Number of gpus: {}z/manifest.json)capnormalZcovidfilename/contentlabelzTraining count =F) spatial_dims in_channels out_channelsgh㈵>-zepoch zinputs shape is -----zinputs shape after is -----z, train_loss: z.4fz average loss: r&r&z ----------)4rhostsr5r'debugformatZnum_gpusr)devicestrosenvironr current_hostdistZinit_process_groupZget_world_sizeZget_rankZ manual_seedseedr3data_diropenjsonloadlistkeysappendextendprintr0r$rtonnZCrossEntropyLossoptimZAdam parametersepochsranger(trainshapeZpermuteZ zero_gradbackwardstepitemr/ save_model model_dir)'argsr-Zuse_cudar.rGr7Z host_rankZimage_label_listimage_file_listmetadatafmanifestZ my_dictionary class_namesZ num_classfilenamer;r>Z label_numericZ train_loadermodel loss_function optimizerZ epoch_numZ val_intervalZ best_metricZbest_metric_epochZepoch_loss_valuesZ metric_valuesepochZ epoch_lossr`Z batch_datainputsroutputslossZ epoch_lenrrrr]Ds                 , "r]cCs0tjdtjj|d}tj|jj|dS)NzSaving the model.z model.pth) r'r(rIpathjoinr)saver4Z state_dict)rlrcrsrrrrbs rbc Cs`tjtjjrdnd}tdddd}ttjj|dd}|j tj |WdQRX|j |S) Nr3r4r8r"r?)r@rArBz model.pthrb) r)rGr3Z is_availablerrOrIrsrtZload_state_dictrQrW)rcrGrlrgrrrmodel_fnsrw__main__z --batch-sizedNz-input batch size for training (default: 1000))typedefaultmetavarhelpz--test-batch-sizez+input batch size for testing (default: 100)z--epochsz&number of epochs to train (default: 5)z--lrg{Gz?ZLRzlearning rate (default: 0.01)z --momentumg?MzSGD momentum (default: 0.5)z--seedr"Szrandom seed (default: 1)z--log-intervalz7how many batches to wait before logging training statusz --backendzIbackend for distributed training (tcp, gloo on cpu and gloo, nccl on gpu))r{r|r~z--hostsZSM_HOSTS)r{r|z--current-hostZSM_CURRENT_HOSTz --model-dirZ SM_MODEL_DIRz --data-dirZSM_CHANNEL_TRAINz --num-gpusZ SM_NUM_GPUS)?argparserPloggingrIsysr)Ztorch.distributedr,rLZtorch.nnrXZtorch.nn.functionalZ functionalFZ torch.optimrYZtorch.utils.dataZtorch.utils.data.distributedrrZ torchvisionpandaspdnumpynpZPILrZ monai.configrZmonai.transformsrrrr r r r r Zmonai.networks.netsr getLoggerrr'setLevelDEBUG addHandler StreamHandlerstdoutrr0r]rbrwArgumentParserparser add_argumentintfloatrHrRloadsrJ parse_argsrrrrsf      (   o