# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
from io import StringIO

import json
import pickle

import boto3
import jsonlines
import nltk

# Download the tokenizer and lemmatizer data used by downstream NLP code.
nltk.download('punkt')
nltk.download('wordnet')


def split_s3_path(s3_path):
    """
    Splits a complete S3 path into a bucket name and a key name. This is
    useful in cases where an API requires two separate arguments (bucket
    and key).

    Arguments:
        s3_path {string} -- An S3 URI path

    Returns:
        bucket {string} -- The bucket name
        key {string} -- The key name
    """
    bucket = s3_path.split("/")[2]
    key = '/'.join(s3_path.split("/")[3:])
    return bucket, key


def write_dataframe_to_s3(dataframe, bucket_name, file_name, index=True, header=True):
    """
    Pushes a pandas dataframe to S3 using the StringIO library.

    Arguments:
        dataframe {pd.DataFrame} -- The pandas dataframe that you want to push to S3
        bucket_name {string} -- The name of the bucket in the object file path
        file_name {string} -- The name of the key/file that will show up in S3

    Keyword Arguments:
        index {bool} -- Whether the index of the dataframe is included (default: {True})
        header {bool} -- Whether the header of the dataframe is included (default: {True})
    """
    csv_buffer = StringIO()
    dataframe.to_csv(csv_buffer, header=header, index=index)
    s3_resource = boto3.resource("s3")
    s3_resource.Object(bucket_name, file_name).put(Body=csv_buffer.getvalue())


def write_pickle_to_s3(obj, bucket_name, file_name):
    """
    Writes a pickled object to S3 using the pickle library and the boto3 API.

    Arguments:
        obj {object} -- The Python object you want to pickle and push to S3
        bucket_name {string} -- The name of the bucket
        file_name {string} -- The key/file name
    """
    pickle_byte_obj = pickle.dumps(obj)
    s3_resource = boto3.resource("s3")
    s3_resource.Object(bucket_name, file_name).put(Body=pickle_byte_obj)


def read_pickle_from_s3(bucket_name, file_name):
    """
    Reads a pickled object from S3 using the pickle library and the boto3 API.

    Arguments:
        bucket_name {string} -- The name of the bucket
        file_name {string} -- The key/file name

    Returns:
        {object} -- The unpickled object
    """
    s3_resource = boto3.resource("s3")
    pickle_obj_bytes = s3_resource.Object(bucket_name, file_name).get()["Body"].read()
    obj = pickle.loads(pickle_obj_bytes)
    return obj


def read_jsonline(fname):
    """
    Iterates over a local jsonlines file and yields one record at a time.

    Arguments:
        fname {string} -- The path to the jsonlines file

    Yields:
        {dict} -- The next parsed record in the file
    """
    with jsonlines.open(fname) as reader:
        for line in reader:
            yield line


def read_jsonlines_from_s3(bucket_name, file_name):
    """
    Reads a jsonlines file directly from S3.

    Arguments:
        bucket_name {string} -- The name of the bucket
        file_name {string} -- The key/file name

    Returns:
        {list} -- A list of json objects
    """
    json_lines_object = boto3.client('s3').get_object(Bucket=bucket_name, Key=file_name)
    raw_lines = json_lines_object['Body'].read().decode('utf-8').split('\n')
    # Parse each line with json.loads; ast.literal_eval would fail on JSON
    # literals such as true, false, and null.
    records = []
    for element in raw_lines:
        if not element.strip():
            continue  # skip blank lines, including a trailing newline
        records.append(json.loads(element))
    return records


def read_json_from_s3(bucket_name, file_name):
    """
    Reads a json file from S3 based on the bucket and key passed in as input.

    Arguments:
        bucket_name {string} -- The name of the bucket
        file_name {string} -- The key/file name

    Returns:
        {dict} -- The dictionary containing the parsed json object
    """
    s3_client = boto3.client('s3')
    s3_clientobj = s3_client.get_object(Bucket=bucket_name, Key=file_name)
    s3_clientdata = s3_clientobj['Body'].read().decode('utf-8')
    result = json.loads(s3_clientdata)
    return result
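

# --- Usage sketch -----------------------------------------------------------
# A minimal sketch of how the helpers above compose, assuming AWS credentials
# with access to the (hypothetical, placeholder) bucket "my-example-bucket".
# pandas is assumed to be installed, as the docstrings above already reference
# pd.DataFrame; nothing here is created by this module itself.
if __name__ == "__main__":
    import pandas as pd

    # Split an S3 URI into the (bucket, key) pair that boto3 calls expect.
    bucket, key = split_s3_path("s3://my-example-bucket/data/example.csv")

    # Round-trip a small dataframe through S3 as CSV.
    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    write_dataframe_to_s3(df, bucket, key, index=False)

    # Round-trip an arbitrary Python object through S3 as a pickle.
    write_pickle_to_s3({"vocab_size": 10000}, bucket, "artifacts/config.pkl")
    config = read_pickle_from_s3(bucket, "artifacts/config.pkl")
    print(config)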