# This is a sample Python program that runs a simple scikit-learn processing job, using the FrameworkProcessor
# to perform word_tokenize with nltk. This implementation will work on your *local computer*.
#
# Prerequisites:
#   1. Install required Python packages:
#      pip install boto3 sagemaker pandas scikit-learn
#      pip install 'sagemaker[local]'
#   2. Docker Desktop installed and running on your computer:
#      `docker ps`
#   3. You should have AWS credentials configured on your local machine
#      in order to be able to pull the Docker image from ECR.
########################################################################################################################
from sagemaker.local import LocalSession
from sagemaker.processing import FrameworkProcessor, ProcessingInput, ProcessingOutput
from sagemaker.sklearn.estimator import SKLearn

# A LocalSession runs SageMaker jobs in Docker containers on this machine instead of on managed instances.
sagemaker_session = LocalSession()
sagemaker_session.config = {'local': {'local_code': True}}

# For local mode a dummy role will be sufficient.
role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

# FrameworkProcessor runs a script inside the prebuilt SKLearn container and can install
# extra pip dependencies (here, for nltk) from a requirements.txt before the script starts.
processor = FrameworkProcessor(
    estimator_cls=SKLearn,
    framework_version='0.20.0',
    instance_count=1,
    instance_type='local',
    role=role
)

print('Starting processing job.')
print('Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')

# The local './input_data/' directory is mounted into the container at the ProcessingInput destination;
# whatever the script writes to the ProcessingOutput source is collected as the job's output.
processor.run(
    code='processing_script.py',
    dependencies=['./dependencies/requirements.txt'],
    inputs=[
        ProcessingInput(
            source='./input_data/',
            destination='/opt/ml/processing/input_data/')
    ],
    outputs=[
        ProcessingOutput(
            output_name='tokenized_words_data',
            source='/opt/ml/processing/processed_data/')
    ],
    arguments=['job-type', 'word-tokenize']
)

# Describe the most recent job to find out where its output was stored.
preprocessing_job_description = processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']
print(output_config)

for output in output_config['Outputs']:
    if output['OutputName'] == 'tokenized_words_data':
        tokenized_words_data_file = output['S3Output']['S3Uri']
        print('Output file is located at: {}'.format(tokenized_words_data_file))
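
########################################################################################################################
# For reference, a minimal sketch of what 'processing_script.py' could contain. This is an illustrative
# assumption, not the actual script shipped with this sample: the file handling, the 'punkt' download, and
# the treatment of the 'job-type' argument are guesses based on the launcher above. The container paths do
# match the ProcessingInput/ProcessingOutput settings passed to processor.run(). The sketch is commented
# out so that this launcher file remains directly runnable.
#
#   import os
#   import sys
#
#   import nltk
#   from nltk import word_tokenize
#
#   INPUT_DATA_PATH = '/opt/ml/processing/input_data/'          # matches the ProcessingInput destination
#   PROCESSED_DATA_PATH = '/opt/ml/processing/processed_data/'  # matches the ProcessingOutput source
#
#   if __name__ == '__main__':
#       print('Received arguments: {}'.format(sys.argv[1:]))  # e.g. ['job-type', 'word-tokenize']
#       nltk.download('punkt')  # tokenizer models required by word_tokenize
#
#       os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
#       for file_name in os.listdir(INPUT_DATA_PATH):
#           with open(os.path.join(INPUT_DATA_PATH, file_name)) as f:
#               tokens = word_tokenize(f.read())
#           output_file = os.path.join(PROCESSED_DATA_PATH, file_name + '.tokens')
#           with open(output_file, 'w') as f:
#               f.write('\n'.join(tokens))
#           print('Wrote {} tokens to {}'.format(len(tokens), output_file))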