/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */
#pragma once
#include <aws/machinelearning/MachineLearning_EXPORTS.h>
#include <aws/core/utils/memory/stl/AWSString.h>
#include <utility>

namespace Aws
{
namespace Utils
{
namespace Json
{
  class JsonValue;
  class JsonView;
} // namespace Json
} // namespace Utils
namespace MachineLearning
{
namespace Model
{

  /**
   *

Describes the data specification of a DataSource.

See Also:

AWS API Reference:
https://docs.aws.amazon.com/machinelearning/latest/APIReference/API_S3DataSpec.html

*/ class S3DataSpec { public: AWS_MACHINELEARNING_API S3DataSpec(); AWS_MACHINELEARNING_API S3DataSpec(Aws::Utils::Json::JsonView jsonValue); AWS_MACHINELEARNING_API S3DataSpec& operator=(Aws::Utils::Json::JsonView jsonValue); AWS_MACHINELEARNING_API Aws::Utils::Json::JsonValue Jsonize() const; /* Generated model type: default ctor, construction/assignment from a JsonView, and Jsonize() for serialization back to JSON. Bodies are presumably in the generated .cpp — not visible in this view. */ /** *

The location of the data file(s) used by a DataSource. The URI * specifies a data file or an Amazon Simple Storage Service (Amazon S3) directory * or bucket containing data files.

*/
// Read accessor: returns a reference to the stored S3 location URI.
inline const Aws::String& GetDataLocationS3() const
{
  return m_dataLocationS3;
}
/**
 *

The location of the data file(s) used by a DataSource. The URI * specifies a data file or an Amazon Simple Storage Service (Amazon S3) directory * or bucket containing data files.

*/
// True once any SetDataLocationS3 overload has been called.
inline bool DataLocationS3HasBeenSet() const
{
  return m_dataLocationS3HasBeenSet;
}
/**
 *

The location of the data file(s) used by a DataSource. The URI * specifies a data file or an Amazon Simple Storage Service (Amazon S3) directory * or bucket containing data files.

*/
// Copy-setter: records that the field was provided, then stores a copy of the value.
inline void SetDataLocationS3(const Aws::String& value)
{
  m_dataLocationS3HasBeenSet = true;
  m_dataLocationS3 = value;
}
/**
 *

The location of the data file(s) used by a DataSource. The URI * specifies a data file or an Amazon Simple Storage Service (Amazon S3) directory * or bucket containing data files.

*/
// Move-setter: records that the field was provided and takes ownership of the string.
inline void SetDataLocationS3(Aws::String&& value)
{
  m_dataLocationS3HasBeenSet = true;
  m_dataLocationS3 = std::move(value);
}
/**
 *

The location of the data file(s) used by a DataSource. The URI * specifies a data file or an Amazon Simple Storage Service (Amazon S3) directory * or bucket containing data files.

*/
// C-string setter: records that the field was provided and copies the NUL-terminated input.
inline void SetDataLocationS3(const char* value)
{
  m_dataLocationS3HasBeenSet = true;
  m_dataLocationS3.assign(value);
}
/**
 *

The location of the data file(s) used by a DataSource. The URI * specifies a data file or an Amazon Simple Storage Service (Amazon S3) directory * or bucket containing data files.

*/
// Fluent copy-setter; returns *this so calls can be chained builder-style.
inline S3DataSpec& WithDataLocationS3(const Aws::String& value)
{
  SetDataLocationS3(value);
  return *this;
}
/**
 *

The location of the data file(s) used by a DataSource. The URI * specifies a data file or an Amazon Simple Storage Service (Amazon S3) directory * or bucket containing data files.

*/
// Fluent move-setter; forwards to the move overload and returns *this for chaining.
inline S3DataSpec& WithDataLocationS3(Aws::String&& value)
{
  SetDataLocationS3(std::move(value));
  return *this;
}
/**
 *

The location of the data file(s) used by a DataSource. The URI * specifies a data file or an Amazon Simple Storage Service (Amazon S3) directory * or bucket containing data files.

*/
// Fluent C-string setter; forwards to the const char* overload and returns *this.
inline S3DataSpec& WithDataLocationS3(const char* value)
{
  SetDataLocationS3(value);
  return *this;
}
/**
 *

A JSON string that represents the splitting and rearrangement processing to * be applied to a DataSource. If the DataRearrangement * parameter is not provided, all of the input data is used to create the * Datasource.

There are multiple parameters that control what * data is used to create a datasource:

  • * percentBegin

    Use percentBegin to indicate * the beginning of the range of the data used to create the Datasource. If you do * not include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * percentEnd

    Use percentEnd to indicate the * end of the range of the data used to create the Datasource. If you do not * include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * complement

    The complement parameter * instructs Amazon ML to use the data that is not included in the range of * percentBegin to percentEnd to create a datasource. The * complement parameter is useful if you need to create complementary * datasources for training and evaluation. To create a complementary datasource, * use the same values for percentBegin and percentEnd, * along with the complement parameter.

    For example, the * following two datasources do not share any data, and can be used to train and * evaluate a model. The first datasource has 25 percent of the data, and the * second one has 75 percent of the data.

    Datasource for evaluation: * {"splitting":{"percentBegin":0, "percentEnd":25}}

    *

    Datasource for training: {"splitting":{"percentBegin":0, * "percentEnd":25, "complement":"true"}}

  • * strategy

    To change how Amazon ML splits the data for a * datasource, use the strategy parameter.

    The default value * for the strategy parameter is sequential, meaning that * Amazon ML takes all of the data records between the percentBegin * and percentEnd parameters for the datasource, in the order that the * records appear in the input data.

    The following two * DataRearrangement lines are examples of sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential"}}

    Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential", "complement":"true"}}

    To randomly split * the input data into the proportions indicated by the percentBegin and percentEnd * parameters, set the strategy parameter to random and * provide a string that is used as the seed value for the random data splitting * (for example, you can use the S3 path to your data as the random seed string). * If you choose the random split strategy, Amazon ML assigns each row of data a * pseudo-random number between 0 and 100, and then selects the rows that have an * assigned number between percentBegin and percentEnd. * Pseudo-random numbers are assigned using both the input seed string value and * the byte offset as a seed, so changing the data results in a different split. * Any existing ordering is preserved. The random splitting strategy ensures that * variables in the training and evaluation data are distributed similarly. It is * useful in the cases where the input data may have an implicit sort order, which * would otherwise result in training and evaluation datasources containing * non-similar data records.

    The following two * DataRearrangement lines are examples of non-sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", * "randomSeed"="s3://my_s3_path/bucket/file.csv"}}

    Datasource for * training: {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"random", "randomSeed"="s3://my_s3_path/bucket/file.csv", * "complement":"true"}}

*/
// Read accessor: returns a reference to the stored DataRearrangement JSON string.
inline const Aws::String& GetDataRearrangement() const
{
  return m_dataRearrangement;
}
/**
 *

A JSON string that represents the splitting and rearrangement processing to * be applied to a DataSource. If the DataRearrangement * parameter is not provided, all of the input data is used to create the * Datasource.

There are multiple parameters that control what * data is used to create a datasource:

  • * percentBegin

    Use percentBegin to indicate * the beginning of the range of the data used to create the Datasource. If you do * not include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * percentEnd

    Use percentEnd to indicate the * end of the range of the data used to create the Datasource. If you do not * include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * complement

    The complement parameter * instructs Amazon ML to use the data that is not included in the range of * percentBegin to percentEnd to create a datasource. The * complement parameter is useful if you need to create complementary * datasources for training and evaluation. To create a complementary datasource, * use the same values for percentBegin and percentEnd, * along with the complement parameter.

    For example, the * following two datasources do not share any data, and can be used to train and * evaluate a model. The first datasource has 25 percent of the data, and the * second one has 75 percent of the data.

    Datasource for evaluation: * {"splitting":{"percentBegin":0, "percentEnd":25}}

    *

    Datasource for training: {"splitting":{"percentBegin":0, * "percentEnd":25, "complement":"true"}}

  • * strategy

    To change how Amazon ML splits the data for a * datasource, use the strategy parameter.

    The default value * for the strategy parameter is sequential, meaning that * Amazon ML takes all of the data records between the percentBegin * and percentEnd parameters for the datasource, in the order that the * records appear in the input data.

    The following two * DataRearrangement lines are examples of sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential"}}

    Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential", "complement":"true"}}

    To randomly split * the input data into the proportions indicated by the percentBegin and percentEnd * parameters, set the strategy parameter to random and * provide a string that is used as the seed value for the random data splitting * (for example, you can use the S3 path to your data as the random seed string). * If you choose the random split strategy, Amazon ML assigns each row of data a * pseudo-random number between 0 and 100, and then selects the rows that have an * assigned number between percentBegin and percentEnd. * Pseudo-random numbers are assigned using both the input seed string value and * the byte offset as a seed, so changing the data results in a different split. * Any existing ordering is preserved. The random splitting strategy ensures that * variables in the training and evaluation data are distributed similarly. It is * useful in the cases where the input data may have an implicit sort order, which * would otherwise result in training and evaluation datasources containing * non-similar data records.

    The following two * DataRearrangement lines are examples of non-sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", * "randomSeed"="s3://my_s3_path/bucket/file.csv"}}

    Datasource for * training: {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"random", "randomSeed"="s3://my_s3_path/bucket/file.csv", * "complement":"true"}}

*/
// True once any SetDataRearrangement overload has been called.
inline bool DataRearrangementHasBeenSet() const
{
  return m_dataRearrangementHasBeenSet;
}
/**
 *

A JSON string that represents the splitting and rearrangement processing to * be applied to a DataSource. If the DataRearrangement * parameter is not provided, all of the input data is used to create the * Datasource.

There are multiple parameters that control what * data is used to create a datasource:

  • * percentBegin

    Use percentBegin to indicate * the beginning of the range of the data used to create the Datasource. If you do * not include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * percentEnd

    Use percentEnd to indicate the * end of the range of the data used to create the Datasource. If you do not * include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * complement

    The complement parameter * instructs Amazon ML to use the data that is not included in the range of * percentBegin to percentEnd to create a datasource. The * complement parameter is useful if you need to create complementary * datasources for training and evaluation. To create a complementary datasource, * use the same values for percentBegin and percentEnd, * along with the complement parameter.

    For example, the * following two datasources do not share any data, and can be used to train and * evaluate a model. The first datasource has 25 percent of the data, and the * second one has 75 percent of the data.

    Datasource for evaluation: * {"splitting":{"percentBegin":0, "percentEnd":25}}

    *

    Datasource for training: {"splitting":{"percentBegin":0, * "percentEnd":25, "complement":"true"}}

  • * strategy

    To change how Amazon ML splits the data for a * datasource, use the strategy parameter.

    The default value * for the strategy parameter is sequential, meaning that * Amazon ML takes all of the data records between the percentBegin * and percentEnd parameters for the datasource, in the order that the * records appear in the input data.

    The following two * DataRearrangement lines are examples of sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential"}}

    Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential", "complement":"true"}}

    To randomly split * the input data into the proportions indicated by the percentBegin and percentEnd * parameters, set the strategy parameter to random and * provide a string that is used as the seed value for the random data splitting * (for example, you can use the S3 path to your data as the random seed string). * If you choose the random split strategy, Amazon ML assigns each row of data a * pseudo-random number between 0 and 100, and then selects the rows that have an * assigned number between percentBegin and percentEnd. * Pseudo-random numbers are assigned using both the input seed string value and * the byte offset as a seed, so changing the data results in a different split. * Any existing ordering is preserved. The random splitting strategy ensures that * variables in the training and evaluation data are distributed similarly. It is * useful in the cases where the input data may have an implicit sort order, which * would otherwise result in training and evaluation datasources containing * non-similar data records.

    The following two * DataRearrangement lines are examples of non-sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", * "randomSeed"="s3://my_s3_path/bucket/file.csv"}}

    Datasource for * training: {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"random", "randomSeed"="s3://my_s3_path/bucket/file.csv", * "complement":"true"}}

*/
// Copy-setter: records that the field was provided, then stores a copy of the value.
inline void SetDataRearrangement(const Aws::String& value)
{
  m_dataRearrangementHasBeenSet = true;
  m_dataRearrangement = value;
}
/**
 *

A JSON string that represents the splitting and rearrangement processing to * be applied to a DataSource. If the DataRearrangement * parameter is not provided, all of the input data is used to create the * Datasource.

There are multiple parameters that control what * data is used to create a datasource:

  • * percentBegin

    Use percentBegin to indicate * the beginning of the range of the data used to create the Datasource. If you do * not include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * percentEnd

    Use percentEnd to indicate the * end of the range of the data used to create the Datasource. If you do not * include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * complement

    The complement parameter * instructs Amazon ML to use the data that is not included in the range of * percentBegin to percentEnd to create a datasource. The * complement parameter is useful if you need to create complementary * datasources for training and evaluation. To create a complementary datasource, * use the same values for percentBegin and percentEnd, * along with the complement parameter.

    For example, the * following two datasources do not share any data, and can be used to train and * evaluate a model. The first datasource has 25 percent of the data, and the * second one has 75 percent of the data.

    Datasource for evaluation: * {"splitting":{"percentBegin":0, "percentEnd":25}}

    *

    Datasource for training: {"splitting":{"percentBegin":0, * "percentEnd":25, "complement":"true"}}

  • * strategy

    To change how Amazon ML splits the data for a * datasource, use the strategy parameter.

    The default value * for the strategy parameter is sequential, meaning that * Amazon ML takes all of the data records between the percentBegin * and percentEnd parameters for the datasource, in the order that the * records appear in the input data.

    The following two * DataRearrangement lines are examples of sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential"}}

    Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential", "complement":"true"}}

    To randomly split * the input data into the proportions indicated by the percentBegin and percentEnd * parameters, set the strategy parameter to random and * provide a string that is used as the seed value for the random data splitting * (for example, you can use the S3 path to your data as the random seed string). * If you choose the random split strategy, Amazon ML assigns each row of data a * pseudo-random number between 0 and 100, and then selects the rows that have an * assigned number between percentBegin and percentEnd. * Pseudo-random numbers are assigned using both the input seed string value and * the byte offset as a seed, so changing the data results in a different split. * Any existing ordering is preserved. The random splitting strategy ensures that * variables in the training and evaluation data are distributed similarly. It is * useful in the cases where the input data may have an implicit sort order, which * would otherwise result in training and evaluation datasources containing * non-similar data records.

    The following two * DataRearrangement lines are examples of non-sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", * "randomSeed"="s3://my_s3_path/bucket/file.csv"}}

    Datasource for * training: {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"random", "randomSeed"="s3://my_s3_path/bucket/file.csv", * "complement":"true"}}

*/
// Move-setter: records that the field was provided and takes ownership of the string.
inline void SetDataRearrangement(Aws::String&& value)
{
  m_dataRearrangementHasBeenSet = true;
  m_dataRearrangement = std::move(value);
}
/**
 *

A JSON string that represents the splitting and rearrangement processing to * be applied to a DataSource. If the DataRearrangement * parameter is not provided, all of the input data is used to create the * Datasource.

There are multiple parameters that control what * data is used to create a datasource:

  • * percentBegin

    Use percentBegin to indicate * the beginning of the range of the data used to create the Datasource. If you do * not include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * percentEnd

    Use percentEnd to indicate the * end of the range of the data used to create the Datasource. If you do not * include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * complement

    The complement parameter * instructs Amazon ML to use the data that is not included in the range of * percentBegin to percentEnd to create a datasource. The * complement parameter is useful if you need to create complementary * datasources for training and evaluation. To create a complementary datasource, * use the same values for percentBegin and percentEnd, * along with the complement parameter.

    For example, the * following two datasources do not share any data, and can be used to train and * evaluate a model. The first datasource has 25 percent of the data, and the * second one has 75 percent of the data.

    Datasource for evaluation: * {"splitting":{"percentBegin":0, "percentEnd":25}}

    *

    Datasource for training: {"splitting":{"percentBegin":0, * "percentEnd":25, "complement":"true"}}

  • * strategy

    To change how Amazon ML splits the data for a * datasource, use the strategy parameter.

    The default value * for the strategy parameter is sequential, meaning that * Amazon ML takes all of the data records between the percentBegin * and percentEnd parameters for the datasource, in the order that the * records appear in the input data.

    The following two * DataRearrangement lines are examples of sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential"}}

    Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential", "complement":"true"}}

    To randomly split * the input data into the proportions indicated by the percentBegin and percentEnd * parameters, set the strategy parameter to random and * provide a string that is used as the seed value for the random data splitting * (for example, you can use the S3 path to your data as the random seed string). * If you choose the random split strategy, Amazon ML assigns each row of data a * pseudo-random number between 0 and 100, and then selects the rows that have an * assigned number between percentBegin and percentEnd. * Pseudo-random numbers are assigned using both the input seed string value and * the byte offset as a seed, so changing the data results in a different split. * Any existing ordering is preserved. The random splitting strategy ensures that * variables in the training and evaluation data are distributed similarly. It is * useful in the cases where the input data may have an implicit sort order, which * would otherwise result in training and evaluation datasources containing * non-similar data records.

    The following two * DataRearrangement lines are examples of non-sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", * "randomSeed"="s3://my_s3_path/bucket/file.csv"}}

    Datasource for * training: {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"random", "randomSeed"="s3://my_s3_path/bucket/file.csv", * "complement":"true"}}

*/
// C-string setter: records that the field was provided and copies the NUL-terminated input.
inline void SetDataRearrangement(const char* value)
{
  m_dataRearrangementHasBeenSet = true;
  m_dataRearrangement.assign(value);
}
/**
 *

A JSON string that represents the splitting and rearrangement processing to * be applied to a DataSource. If the DataRearrangement * parameter is not provided, all of the input data is used to create the * Datasource.

There are multiple parameters that control what * data is used to create a datasource:

  • * percentBegin

    Use percentBegin to indicate * the beginning of the range of the data used to create the Datasource. If you do * not include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * percentEnd

    Use percentEnd to indicate the * end of the range of the data used to create the Datasource. If you do not * include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * complement

    The complement parameter * instructs Amazon ML to use the data that is not included in the range of * percentBegin to percentEnd to create a datasource. The * complement parameter is useful if you need to create complementary * datasources for training and evaluation. To create a complementary datasource, * use the same values for percentBegin and percentEnd, * along with the complement parameter.

    For example, the * following two datasources do not share any data, and can be used to train and * evaluate a model. The first datasource has 25 percent of the data, and the * second one has 75 percent of the data.

    Datasource for evaluation: * {"splitting":{"percentBegin":0, "percentEnd":25}}

    *

    Datasource for training: {"splitting":{"percentBegin":0, * "percentEnd":25, "complement":"true"}}

  • * strategy

    To change how Amazon ML splits the data for a * datasource, use the strategy parameter.

    The default value * for the strategy parameter is sequential, meaning that * Amazon ML takes all of the data records between the percentBegin * and percentEnd parameters for the datasource, in the order that the * records appear in the input data.

    The following two * DataRearrangement lines are examples of sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential"}}

    Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential", "complement":"true"}}

    To randomly split * the input data into the proportions indicated by the percentBegin and percentEnd * parameters, set the strategy parameter to random and * provide a string that is used as the seed value for the random data splitting * (for example, you can use the S3 path to your data as the random seed string). * If you choose the random split strategy, Amazon ML assigns each row of data a * pseudo-random number between 0 and 100, and then selects the rows that have an * assigned number between percentBegin and percentEnd. * Pseudo-random numbers are assigned using both the input seed string value and * the byte offset as a seed, so changing the data results in a different split. * Any existing ordering is preserved. The random splitting strategy ensures that * variables in the training and evaluation data are distributed similarly. It is * useful in the cases where the input data may have an implicit sort order, which * would otherwise result in training and evaluation datasources containing * non-similar data records.

    The following two * DataRearrangement lines are examples of non-sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", * "randomSeed"="s3://my_s3_path/bucket/file.csv"}}

    Datasource for * training: {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"random", "randomSeed"="s3://my_s3_path/bucket/file.csv", * "complement":"true"}}

*/
// Fluent copy-setter; returns *this so calls can be chained builder-style.
inline S3DataSpec& WithDataRearrangement(const Aws::String& value)
{
  SetDataRearrangement(value);
  return *this;
}
/**
 *

A JSON string that represents the splitting and rearrangement processing to * be applied to a DataSource. If the DataRearrangement * parameter is not provided, all of the input data is used to create the * Datasource.

There are multiple parameters that control what * data is used to create a datasource:

  • * percentBegin

    Use percentBegin to indicate * the beginning of the range of the data used to create the Datasource. If you do * not include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * percentEnd

    Use percentEnd to indicate the * end of the range of the data used to create the Datasource. If you do not * include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * complement

    The complement parameter * instructs Amazon ML to use the data that is not included in the range of * percentBegin to percentEnd to create a datasource. The * complement parameter is useful if you need to create complementary * datasources for training and evaluation. To create a complementary datasource, * use the same values for percentBegin and percentEnd, * along with the complement parameter.

    For example, the * following two datasources do not share any data, and can be used to train and * evaluate a model. The first datasource has 25 percent of the data, and the * second one has 75 percent of the data.

    Datasource for evaluation: * {"splitting":{"percentBegin":0, "percentEnd":25}}

    *

    Datasource for training: {"splitting":{"percentBegin":0, * "percentEnd":25, "complement":"true"}}

  • * strategy

    To change how Amazon ML splits the data for a * datasource, use the strategy parameter.

    The default value * for the strategy parameter is sequential, meaning that * Amazon ML takes all of the data records between the percentBegin * and percentEnd parameters for the datasource, in the order that the * records appear in the input data.

    The following two * DataRearrangement lines are examples of sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential"}}

    Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential", "complement":"true"}}

    To randomly split * the input data into the proportions indicated by the percentBegin and percentEnd * parameters, set the strategy parameter to random and * provide a string that is used as the seed value for the random data splitting * (for example, you can use the S3 path to your data as the random seed string). * If you choose the random split strategy, Amazon ML assigns each row of data a * pseudo-random number between 0 and 100, and then selects the rows that have an * assigned number between percentBegin and percentEnd. * Pseudo-random numbers are assigned using both the input seed string value and * the byte offset as a seed, so changing the data results in a different split. * Any existing ordering is preserved. The random splitting strategy ensures that * variables in the training and evaluation data are distributed similarly. It is * useful in the cases where the input data may have an implicit sort order, which * would otherwise result in training and evaluation datasources containing * non-similar data records.

    The following two * DataRearrangement lines are examples of non-sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", * "randomSeed"="s3://my_s3_path/bucket/file.csv"}}

    Datasource for * training: {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"random", "randomSeed"="s3://my_s3_path/bucket/file.csv", * "complement":"true"}}

*/
// Fluent move-setter; forwards to the move overload and returns *this for chaining.
inline S3DataSpec& WithDataRearrangement(Aws::String&& value)
{
  SetDataRearrangement(std::move(value));
  return *this;
}
/**
 *

A JSON string that represents the splitting and rearrangement processing to * be applied to a DataSource. If the DataRearrangement * parameter is not provided, all of the input data is used to create the * Datasource.

There are multiple parameters that control what * data is used to create a datasource:

  • * percentBegin

    Use percentBegin to indicate * the beginning of the range of the data used to create the Datasource. If you do * not include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * percentEnd

    Use percentEnd to indicate the * end of the range of the data used to create the Datasource. If you do not * include percentBegin and percentEnd, Amazon ML * includes all of the data when creating the datasource.

  • * complement

    The complement parameter * instructs Amazon ML to use the data that is not included in the range of * percentBegin to percentEnd to create a datasource. The * complement parameter is useful if you need to create complementary * datasources for training and evaluation. To create a complementary datasource, * use the same values for percentBegin and percentEnd, * along with the complement parameter.

    For example, the * following two datasources do not share any data, and can be used to train and * evaluate a model. The first datasource has 25 percent of the data, and the * second one has 75 percent of the data.

    Datasource for evaluation: * {"splitting":{"percentBegin":0, "percentEnd":25}}

    *

    Datasource for training: {"splitting":{"percentBegin":0, * "percentEnd":25, "complement":"true"}}

  • * strategy

    To change how Amazon ML splits the data for a * datasource, use the strategy parameter.

    The default value * for the strategy parameter is sequential, meaning that * Amazon ML takes all of the data records between the percentBegin * and percentEnd parameters for the datasource, in the order that the * records appear in the input data.

    The following two * DataRearrangement lines are examples of sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential"}}

    Datasource for training: * {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"sequential", "complement":"true"}}

    To randomly split * the input data into the proportions indicated by the percentBegin and percentEnd * parameters, set the strategy parameter to random and * provide a string that is used as the seed value for the random data splitting * (for example, you can use the S3 path to your data as the random seed string). * If you choose the random split strategy, Amazon ML assigns each row of data a * pseudo-random number between 0 and 100, and then selects the rows that have an * assigned number between percentBegin and percentEnd. * Pseudo-random numbers are assigned using both the input seed string value and * the byte offset as a seed, so changing the data results in a different split. * Any existing ordering is preserved. The random splitting strategy ensures that * variables in the training and evaluation data are distributed similarly. It is * useful in the cases where the input data may have an implicit sort order, which * would otherwise result in training and evaluation datasources containing * non-similar data records.

    The following two * DataRearrangement lines are examples of non-sequentially ordered * training and evaluation datasources:

    Datasource for evaluation: * {"splitting":{"percentBegin":70, "percentEnd":100, "strategy":"random", * "randomSeed":"s3://my_s3_path/bucket/file.csv"}}

    Datasource for * training: {"splitting":{"percentBegin":70, "percentEnd":100, * "strategy":"random", "randomSeed":"s3://my_s3_path/bucket/file.csv", * "complement":"true"}}

*/
    // Fluent setter overload for a NUL-terminated C string; returns *this
    // so calls can be chained.
    inline S3DataSpec& WithDataRearrangement(const char* value) {
      SetDataRearrangement(value);
      return *this;
    }
    /** *

A JSON string that represents the schema for an Amazon S3 * DataSource. The DataSchema defines the structure of * the observation data in the data file(s) referenced in the * DataSource.

You must provide either the * DataSchema or the DataSchemaLocationS3.

Define * your DataSchema as a series of key-value pairs. * attributes and excludedVariableNames have an array of * key-value pairs for their value. Use the following format to define your * DataSchema.

{ "version": "1.0",

*

"recordAnnotationFieldName": "F1",

"recordWeightFieldName": "F2",

*

"targetFieldName": "F3",

"dataFormat": "CSV",

*

"dataFileContainsHeader": true,

"attributes": [

{ "fieldName": * "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { * "fieldName": "F3", "fieldType": "CATEGORICAL" }, { "fieldName": "F4", * "fieldType": "NUMERIC" }, { "fieldName": "F5", "fieldType": "CATEGORICAL" }, { * "fieldName": "F6", "fieldType": "TEXT" }, { "fieldName": "F7", "fieldType": * "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": * "WEIGHTED_STRING_SEQUENCE" } ],

"excludedVariableNames": [ "F6" ] }

*/
    // Read-only accessor for the stored schema string.
    inline const Aws::String& GetDataSchema() const {
      return m_dataSchema;
    }
    /** *

A JSON string that represents the schema for an Amazon S3 * DataSource. The DataSchema defines the structure of * the observation data in the data file(s) referenced in the * DataSource.

You must provide either the * DataSchema or the DataSchemaLocationS3.

Define * your DataSchema as a series of key-value pairs. * attributes and excludedVariableNames have an array of * key-value pairs for their value. Use the following format to define your * DataSchema.

{ "version": "1.0",

*

"recordAnnotationFieldName": "F1",

"recordWeightFieldName": "F2",

*

"targetFieldName": "F3",

"dataFormat": "CSV",

*

"dataFileContainsHeader": true,

"attributes": [

{ "fieldName": * "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { * "fieldName": "F3", "fieldType": "CATEGORICAL" }, { "fieldName": "F4", * "fieldType": "NUMERIC" }, { "fieldName": "F5", "fieldType": "CATEGORICAL" }, { * "fieldName": "F6", "fieldType": "TEXT" }, { "fieldName": "F7", "fieldType": * "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": * "WEIGHTED_STRING_SEQUENCE" } ],

"excludedVariableNames": [ "F6" ] }

*/
    // True if a schema value has been explicitly assigned on this object.
    inline bool DataSchemaHasBeenSet() const {
      return m_dataSchemaHasBeenSet;
    }
    /** *

A JSON string that represents the schema for an Amazon S3 * DataSource. The DataSchema defines the structure of * the observation data in the data file(s) referenced in the * DataSource.

You must provide either the * DataSchema or the DataSchemaLocationS3.

Define * your DataSchema as a series of key-value pairs. * attributes and excludedVariableNames have an array of * key-value pairs for their value. Use the following format to define your * DataSchema.

{ "version": "1.0",

*

"recordAnnotationFieldName": "F1",

"recordWeightFieldName": "F2",

*

"targetFieldName": "F3",

"dataFormat": "CSV",

*

"dataFileContainsHeader": true,

"attributes": [

{ "fieldName": * "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { * "fieldName": "F3", "fieldType": "CATEGORICAL" }, { "fieldName": "F4", * "fieldType": "NUMERIC" }, { "fieldName": "F5", "fieldType": "CATEGORICAL" }, { * "fieldName": "F6", "fieldType": "TEXT" }, { "fieldName": "F7", "fieldType": * "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": * "WEIGHTED_STRING_SEQUENCE" } ],

"excludedVariableNames": [ "F6" ] }

*/
    // Copy-assigns the schema string and records that it was set.
    inline void SetDataSchema(const Aws::String& value) {
      m_dataSchemaHasBeenSet = true;
      m_dataSchema = value;
    }
    /** *

A JSON string that represents the schema for an Amazon S3 * DataSource. The DataSchema defines the structure of * the observation data in the data file(s) referenced in the * DataSource.

You must provide either the * DataSchema or the DataSchemaLocationS3.

Define * your DataSchema as a series of key-value pairs. * attributes and excludedVariableNames have an array of * key-value pairs for their value. Use the following format to define your * DataSchema.

{ "version": "1.0",

*

"recordAnnotationFieldName": "F1",

"recordWeightFieldName": "F2",

*

"targetFieldName": "F3",

"dataFormat": "CSV",

*

"dataFileContainsHeader": true,

"attributes": [

{ "fieldName": * "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { * "fieldName": "F3", "fieldType": "CATEGORICAL" }, { "fieldName": "F4", * "fieldType": "NUMERIC" }, { "fieldName": "F5", "fieldType": "CATEGORICAL" }, { * "fieldName": "F6", "fieldType": "TEXT" }, { "fieldName": "F7", "fieldType": * "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": * "WEIGHTED_STRING_SEQUENCE" } ],

"excludedVariableNames": [ "F6" ] }

*/
    // Move-assigns the schema string and records that it was set.
    inline void SetDataSchema(Aws::String&& value) {
      m_dataSchemaHasBeenSet = true;
      m_dataSchema = std::move(value);
    }
    /** *

A JSON string that represents the schema for an Amazon S3 * DataSource. The DataSchema defines the structure of * the observation data in the data file(s) referenced in the * DataSource.

You must provide either the * DataSchema or the DataSchemaLocationS3.

Define * your DataSchema as a series of key-value pairs. * attributes and excludedVariableNames have an array of * key-value pairs for their value. Use the following format to define your * DataSchema.

{ "version": "1.0",

*

"recordAnnotationFieldName": "F1",

"recordWeightFieldName": "F2",

*

"targetFieldName": "F3",

"dataFormat": "CSV",

*

"dataFileContainsHeader": true,

"attributes": [

{ "fieldName": * "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { * "fieldName": "F3", "fieldType": "CATEGORICAL" }, { "fieldName": "F4", * "fieldType": "NUMERIC" }, { "fieldName": "F5", "fieldType": "CATEGORICAL" }, { * "fieldName": "F6", "fieldType": "TEXT" }, { "fieldName": "F7", "fieldType": * "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": * "WEIGHTED_STRING_SEQUENCE" } ],

"excludedVariableNames": [ "F6" ] }

*/
    // Assigns the schema from a NUL-terminated C string and records that it
    // was set.
    inline void SetDataSchema(const char* value) {
      m_dataSchemaHasBeenSet = true;
      m_dataSchema.assign(value);
    }
    /** *

A JSON string that represents the schema for an Amazon S3 * DataSource. The DataSchema defines the structure of * the observation data in the data file(s) referenced in the * DataSource.

You must provide either the * DataSchema or the DataSchemaLocationS3.

Define * your DataSchema as a series of key-value pairs. * attributes and excludedVariableNames have an array of * key-value pairs for their value. Use the following format to define your * DataSchema.

{ "version": "1.0",

*

"recordAnnotationFieldName": "F1",

"recordWeightFieldName": "F2",

*

"targetFieldName": "F3",

"dataFormat": "CSV",

*

"dataFileContainsHeader": true,

"attributes": [

{ "fieldName": * "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { * "fieldName": "F3", "fieldType": "CATEGORICAL" }, { "fieldName": "F4", * "fieldType": "NUMERIC" }, { "fieldName": "F5", "fieldType": "CATEGORICAL" }, { * "fieldName": "F6", "fieldType": "TEXT" }, { "fieldName": "F7", "fieldType": * "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": * "WEIGHTED_STRING_SEQUENCE" } ],

"excludedVariableNames": [ "F6" ] }

*/
    // Fluent setter: copies the schema string and returns *this so calls
    // can be chained.
    inline S3DataSpec& WithDataSchema(const Aws::String& value) {
      SetDataSchema(value);
      return *this;
    }
    /** *

A JSON string that represents the schema for an Amazon S3 * DataSource. The DataSchema defines the structure of * the observation data in the data file(s) referenced in the * DataSource.

You must provide either the * DataSchema or the DataSchemaLocationS3.

Define * your DataSchema as a series of key-value pairs. * attributes and excludedVariableNames have an array of * key-value pairs for their value. Use the following format to define your * DataSchema.

{ "version": "1.0",

*

"recordAnnotationFieldName": "F1",

"recordWeightFieldName": "F2",

*

"targetFieldName": "F3",

"dataFormat": "CSV",

*

"dataFileContainsHeader": true,

"attributes": [

{ "fieldName": * "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { * "fieldName": "F3", "fieldType": "CATEGORICAL" }, { "fieldName": "F4", * "fieldType": "NUMERIC" }, { "fieldName": "F5", "fieldType": "CATEGORICAL" }, { * "fieldName": "F6", "fieldType": "TEXT" }, { "fieldName": "F7", "fieldType": * "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": * "WEIGHTED_STRING_SEQUENCE" } ],

"excludedVariableNames": [ "F6" ] }

*/
    // Fluent setter: moves the schema string into this object and returns
    // *this so calls can be chained.
    inline S3DataSpec& WithDataSchema(Aws::String&& value) {
      SetDataSchema(std::move(value));
      return *this;
    }
    /** *

A JSON string that represents the schema for an Amazon S3 * DataSource. The DataSchema defines the structure of * the observation data in the data file(s) referenced in the * DataSource.

You must provide either the * DataSchema or the DataSchemaLocationS3.

Define * your DataSchema as a series of key-value pairs. * attributes and excludedVariableNames have an array of * key-value pairs for their value. Use the following format to define your * DataSchema.

{ "version": "1.0",

*

"recordAnnotationFieldName": "F1",

"recordWeightFieldName": "F2",

*

"targetFieldName": "F3",

"dataFormat": "CSV",

*

"dataFileContainsHeader": true,

"attributes": [

{ "fieldName": * "F1", "fieldType": "TEXT" }, { "fieldName": "F2", "fieldType": "NUMERIC" }, { * "fieldName": "F3", "fieldType": "CATEGORICAL" }, { "fieldName": "F4", * "fieldType": "NUMERIC" }, { "fieldName": "F5", "fieldType": "CATEGORICAL" }, { * "fieldName": "F6", "fieldType": "TEXT" }, { "fieldName": "F7", "fieldType": * "WEIGHTED_INT_SEQUENCE" }, { "fieldName": "F8", "fieldType": * "WEIGHTED_STRING_SEQUENCE" } ],

"excludedVariableNames": [ "F6" ] }

*/
    // Fluent setter overload for a NUL-terminated C string; returns *this
    // so calls can be chained.
    inline S3DataSpec& WithDataSchema(const char* value) {
      SetDataSchema(value);
      return *this;
    }
    /** *

Describes the schema location in Amazon S3. You must provide either the * DataSchema or the DataSchemaLocationS3.

*/
    // Read-only accessor for the stored schema-location URI string.
    inline const Aws::String& GetDataSchemaLocationS3() const {
      return m_dataSchemaLocationS3;
    }
    /** *

Describes the schema location in Amazon S3. You must provide either the * DataSchema or the DataSchemaLocationS3.

*/
    // True if a schema location has been explicitly assigned on this object.
    inline bool DataSchemaLocationS3HasBeenSet() const {
      return m_dataSchemaLocationS3HasBeenSet;
    }
    /** *

Describes the schema location in Amazon S3. You must provide either the * DataSchema or the DataSchemaLocationS3.

*/
    // Copy-assigns the schema location and records that it was set.
    inline void SetDataSchemaLocationS3(const Aws::String& value) {
      m_dataSchemaLocationS3HasBeenSet = true;
      m_dataSchemaLocationS3 = value;
    }
    /** *

Describes the schema location in Amazon S3. You must provide either the * DataSchema or the DataSchemaLocationS3.

*/
    // Move-assigns the schema location and records that it was set.
    inline void SetDataSchemaLocationS3(Aws::String&& value) {
      m_dataSchemaLocationS3HasBeenSet = true;
      m_dataSchemaLocationS3 = std::move(value);
    }
    /** *

Describes the schema location in Amazon S3. You must provide either the * DataSchema or the DataSchemaLocationS3.

*/
    // Assigns the schema location from a NUL-terminated C string and records
    // that it was set.
    inline void SetDataSchemaLocationS3(const char* value) {
      m_dataSchemaLocationS3HasBeenSet = true;
      m_dataSchemaLocationS3.assign(value);
    }
    /** *

Describes the schema location in Amazon S3. You must provide either the * DataSchema or the DataSchemaLocationS3.

*/
    // Fluent setter: copies the schema location and returns *this so calls
    // can be chained.
    inline S3DataSpec& WithDataSchemaLocationS3(const Aws::String& value) {
      SetDataSchemaLocationS3(value);
      return *this;
    }
    /** *

Describes the schema location in Amazon S3. You must provide either the * DataSchema or the DataSchemaLocationS3.

*/
    // Fluent setter: moves the schema location into this object and returns
    // *this so calls can be chained.
    inline S3DataSpec& WithDataSchemaLocationS3(Aws::String&& value) {
      SetDataSchemaLocationS3(std::move(value));
      return *this;
    }
    /** *

Describes the schema location in Amazon S3. You must provide either the * DataSchema or the DataSchemaLocationS3.

*/
    // Fluent setter overload for a NUL-terminated C string; returns *this
    // so calls can be chained.
    inline S3DataSpec& WithDataSchemaLocationS3(const char* value) {
      SetDataSchemaLocationS3(value);
      return *this;
    }

  private:

    Aws::String m_dataLocationS3;
    bool m_dataLocationS3HasBeenSet = false;

    Aws::String m_dataRearrangement;
    bool m_dataRearrangementHasBeenSet = false;

    Aws::String m_dataSchema;
    bool m_dataSchemaHasBeenSet = false;

    Aws::String m_dataSchemaLocationS3;
    bool m_dataSchemaLocationS3HasBeenSet = false;
  };

} // namespace Model
} // namespace MachineLearning
} // namespace Aws