/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/
#pragma once
#include <aws/kendra/Kendra_EXPORTS.h>
#include <aws/kendra/model/Urls.h>
#include <aws/kendra/model/ProxyConfiguration.h>
#include <aws/kendra/model/AuthenticationConfiguration.h>
#include <aws/core/utils/memory/stl/AWSVector.h>
#include <aws/core/utils/memory/stl/AWSString.h>
#include <utility>

namespace Aws
{
namespace Utils
{
namespace Json
{
  class JsonValue;
  class JsonView;
} // namespace Json
} // namespace Utils
namespace kendra
{
namespace Model
{

  /**
   * Provides the configuration information required for Amazon Kendra Web
   * Crawler.
   *
   * See Also: AWS API Reference
   */
  class AWS_KENDRA_API WebCrawlerConfiguration
  {
  public:
    // Standard SDK model-class surface: default construction plus the JSON
    // (de)serialization hooks used by the generated request/result classes.
    WebCrawlerConfiguration();
    WebCrawlerConfiguration(Aws::Utils::Json::JsonView jsonValue);
    WebCrawlerConfiguration& operator=(Aws::Utils::Json::JsonView jsonValue);
    Aws::Utils::Json::JsonValue Jsonize() const;
    /**
     * Specifies the seed or starting point URLs of the websites or the sitemap
     * URLs of the websites you want to crawl.
     *
     * You can include website subdomains. You can list up to 100 seed URLs and
     * up to three sitemap URLs.
     *
     * You can only crawl websites that use the secure communication protocol,
     * Hypertext Transfer Protocol Secure (HTTPS). If you receive an error when
     * crawling a website, it could be that the website is blocked from crawling.
     *
     * When selecting websites to index, you must adhere to the Amazon Acceptable
     * Use Policy and all other Amazon terms. Remember that you must only use
     * Amazon Kendra Web Crawler to index your own web pages, or web pages that
     * you have authorization to index.
     */
    inline const Urls& GetUrls() const { return m_urls; }
    inline bool UrlsHasBeenSet() const { return m_urlsHasBeenSet; }
    inline void SetUrls(const Urls& value) { m_urlsHasBeenSet = true; m_urls = value; }
    inline void SetUrls(Urls&& value) { m_urlsHasBeenSet = true; m_urls = std::move(value); }
    inline WebCrawlerConfiguration& WithUrls(const Urls& value) { SetUrls(value); return *this; }
    inline WebCrawlerConfiguration& WithUrls(Urls&& value) { SetUrls(std::move(value)); return *this; }
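    // Illustrative sketch only: populating the seed URLs for a crawl. The
    // SeedUrlConfiguration companion model and its With/Add accessors are
    // assumed to follow the same generated accessor pattern as this class;
    // check the installed aws/kendra/model headers for the exact signatures.
    //
    //   SeedUrlConfiguration seedConfig;
    //   seedConfig.AddSeedUrls("https://www.example.com");
    //   seedConfig.AddSeedUrls("https://docs.example.com");
    //
    //   WebCrawlerConfiguration crawlerConfig;
    //   crawlerConfig.WithUrls(Urls().WithSeedUrlConfiguration(seedConfig));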
    /**
     * The 'depth' or number of levels from the seed level to crawl. For example,
     * the seed URL page is depth 1 and any hyperlinks on this page that are also
     * crawled are depth 2.
     */
    inline int GetCrawlDepth() const { return m_crawlDepth; }
    inline bool CrawlDepthHasBeenSet() const { return m_crawlDepthHasBeenSet; }
    inline void SetCrawlDepth(int value) { m_crawlDepthHasBeenSet = true; m_crawlDepth = value; }
    inline WebCrawlerConfiguration& WithCrawlDepth(int value) { SetCrawlDepth(value); return *this; }
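    // Illustrative sketch only (continuing the crawlerConfig example above):
    // a depth of 2 crawls each seed page (depth 1) plus the pages it links to.
    //
    //   crawlerConfig.SetCrawlDepth(2);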
    /**
     * The maximum number of URLs on a web page to include when crawling a
     * website. This number is per web page.
     *
     * As a website's web pages are crawled, any URLs the web pages link to are
     * also crawled. URLs on a web page are crawled in order of appearance.
     *
     * The default maximum links per page is 100.
     */
    inline int GetMaxLinksPerPage() const { return m_maxLinksPerPage; }
    inline bool MaxLinksPerPageHasBeenSet() const { return m_maxLinksPerPageHasBeenSet; }
    inline void SetMaxLinksPerPage(int value) { m_maxLinksPerPageHasBeenSet = true; m_maxLinksPerPage = value; }
    inline WebCrawlerConfiguration& WithMaxLinksPerPage(int value) { SetMaxLinksPerPage(value); return *this; }
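    // Illustrative sketch only (continuing the crawlerConfig example above):
    // override the default of 100 links followed per crawled page.
    //
    //   crawlerConfig.SetMaxLinksPerPage(50);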
    /**
     * The maximum size (in MB) of a web page or attachment to crawl.
     *
     * Files larger than this size (in MB) are skipped/not crawled.
     *
     * The default maximum size of a web page or attachment is set to 50 MB.
     */
    inline double GetMaxContentSizePerPageInMegaBytes() const { return m_maxContentSizePerPageInMegaBytes; }
    inline bool MaxContentSizePerPageInMegaBytesHasBeenSet() const { return m_maxContentSizePerPageInMegaBytesHasBeenSet; }
    inline void SetMaxContentSizePerPageInMegaBytes(double value) { m_maxContentSizePerPageInMegaBytesHasBeenSet = true; m_maxContentSizePerPageInMegaBytes = value; }
    inline WebCrawlerConfiguration& WithMaxContentSizePerPageInMegaBytes(double value) { SetMaxContentSizePerPageInMegaBytes(value); return *this; }
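    // Illustrative sketch only (continuing the crawlerConfig example above):
    // lower the per-page/attachment size limit from the 50 MB default.
    //
    //   crawlerConfig.SetMaxContentSizePerPageInMegaBytes(25.0);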
    /**
     * The maximum number of URLs crawled per website host per minute.
     *
     * A minimum of one URL is required.
     *
     * The default maximum number of URLs crawled per website host per minute is
     * 300.
     */
    inline int GetMaxUrlsPerMinuteCrawlRate() const { return m_maxUrlsPerMinuteCrawlRate; }
    inline bool MaxUrlsPerMinuteCrawlRateHasBeenSet() const { return m_maxUrlsPerMinuteCrawlRateHasBeenSet; }
    inline void SetMaxUrlsPerMinuteCrawlRate(int value) { m_maxUrlsPerMinuteCrawlRateHasBeenSet = true; m_maxUrlsPerMinuteCrawlRate = value; }
    inline WebCrawlerConfiguration& WithMaxUrlsPerMinuteCrawlRate(int value) { SetMaxUrlsPerMinuteCrawlRate(value); return *this; }
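    // Illustrative sketch only (continuing the crawlerConfig example above):
    // throttle the crawl below the default of 300 URLs per host per minute.
    //
    //   crawlerConfig.SetMaxUrlsPerMinuteCrawlRate(150);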
    /**
     * A list of regular expression patterns to include certain URLs to crawl.
     * URLs that match the patterns are included in the index. URLs that don't
     * match the patterns are excluded from the index. If a URL matches both an
     * inclusion and exclusion pattern, the exclusion pattern takes precedence and
     * the URL file isn't included in the index.
     */
    inline const Aws::Vector<Aws::String>& GetUrlInclusionPatterns() const { return m_urlInclusionPatterns; }
    inline bool UrlInclusionPatternsHasBeenSet() const { return m_urlInclusionPatternsHasBeenSet; }
    inline void SetUrlInclusionPatterns(const Aws::Vector<Aws::String>& value) { m_urlInclusionPatternsHasBeenSet = true; m_urlInclusionPatterns = value; }
    inline void SetUrlInclusionPatterns(Aws::Vector<Aws::String>&& value) { m_urlInclusionPatternsHasBeenSet = true; m_urlInclusionPatterns = std::move(value); }
    inline WebCrawlerConfiguration& WithUrlInclusionPatterns(const Aws::Vector<Aws::String>& value) { SetUrlInclusionPatterns(value); return *this; }
    inline WebCrawlerConfiguration& WithUrlInclusionPatterns(Aws::Vector<Aws::String>&& value) { SetUrlInclusionPatterns(std::move(value)); return *this; }
    inline WebCrawlerConfiguration& AddUrlInclusionPatterns(const Aws::String& value) { m_urlInclusionPatternsHasBeenSet = true; m_urlInclusionPatterns.push_back(value); return *this; }
    inline WebCrawlerConfiguration& AddUrlInclusionPatterns(Aws::String&& value) { m_urlInclusionPatternsHasBeenSet = true; m_urlInclusionPatterns.push_back(std::move(value)); return *this; }
    inline WebCrawlerConfiguration& AddUrlInclusionPatterns(const char* value) { m_urlInclusionPatternsHasBeenSet = true; m_urlInclusionPatterns.push_back(value); return *this; }
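    // Illustrative sketch only (continuing the crawlerConfig example above):
    // restrict the crawl to the documentation area of a hypothetical site using
    // the Add accessor declared above.
    //
    //   crawlerConfig.AddUrlInclusionPatterns(".*/docs/.*");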
    /**
     * A list of regular expression patterns to exclude certain URLs to crawl.
     * URLs that match the patterns are excluded from the index. URLs that don't
     * match the patterns are included in the index. If a URL matches both an
     * inclusion and exclusion pattern, the exclusion pattern takes precedence and
     * the URL file isn't included in the index.
     */
    inline const Aws::Vector<Aws::String>& GetUrlExclusionPatterns() const { return m_urlExclusionPatterns; }
    inline bool UrlExclusionPatternsHasBeenSet() const { return m_urlExclusionPatternsHasBeenSet; }
    inline void SetUrlExclusionPatterns(const Aws::Vector<Aws::String>& value) { m_urlExclusionPatternsHasBeenSet = true; m_urlExclusionPatterns = value; }
    inline void SetUrlExclusionPatterns(Aws::Vector<Aws::String>&& value) { m_urlExclusionPatternsHasBeenSet = true; m_urlExclusionPatterns = std::move(value); }
    inline WebCrawlerConfiguration& WithUrlExclusionPatterns(const Aws::Vector<Aws::String>& value) { SetUrlExclusionPatterns(value); return *this; }
    inline WebCrawlerConfiguration& WithUrlExclusionPatterns(Aws::Vector<Aws::String>&& value) { SetUrlExclusionPatterns(std::move(value)); return *this; }
    inline WebCrawlerConfiguration& AddUrlExclusionPatterns(const Aws::String& value) { m_urlExclusionPatternsHasBeenSet = true; m_urlExclusionPatterns.push_back(value); return *this; }
    inline WebCrawlerConfiguration& AddUrlExclusionPatterns(Aws::String&& value) { m_urlExclusionPatternsHasBeenSet = true; m_urlExclusionPatterns.push_back(std::move(value)); return *this; }
    inline WebCrawlerConfiguration& AddUrlExclusionPatterns(const char* value) { m_urlExclusionPatternsHasBeenSet = true; m_urlExclusionPatterns.push_back(value); return *this; }
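    // Illustrative sketch only (continuing the crawlerConfig example above):
    // skip archived pages even if they also match an inclusion pattern, because
    // exclusion patterns take precedence.
    //
    //   crawlerConfig.AddUrlExclusionPatterns(".*/archive/.*");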
    /**
     * Configuration information required to connect to your internal websites
     * via a web proxy.
     *
     * You must provide the website host name and port number. For example, the
     * host name of https://a.example.com/page1.html is "a.example.com" and the
     * port is 443, the standard port for HTTPS.
     *
     * Web proxy credentials are optional and you can use them to connect to a
     * web proxy server that requires basic authentication. To store web proxy
     * credentials, you use a secret in Secrets Manager.
     */
    inline const ProxyConfiguration& GetProxyConfiguration() const { return m_proxyConfiguration; }
    inline bool ProxyConfigurationHasBeenSet() const { return m_proxyConfigurationHasBeenSet; }
    inline void SetProxyConfiguration(const ProxyConfiguration& value) { m_proxyConfigurationHasBeenSet = true; m_proxyConfiguration = value; }
    inline void SetProxyConfiguration(ProxyConfiguration&& value) { m_proxyConfigurationHasBeenSet = true; m_proxyConfiguration = std::move(value); }
    inline WebCrawlerConfiguration& WithProxyConfiguration(const ProxyConfiguration& value) { SetProxyConfiguration(value); return *this; }
    inline WebCrawlerConfiguration& WithProxyConfiguration(ProxyConfiguration&& value) { SetProxyConfiguration(std::move(value)); return *this; }
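    // Illustrative sketch only (continuing the crawlerConfig example above):
    // route the crawler through an internal web proxy. The ProxyConfiguration
    // accessors shown (host, port) are assumed to follow the same generated
    // With* pattern as this class; check aws/kendra/model/ProxyConfiguration.h
    // for the exact signatures.
    //
    //   crawlerConfig.WithProxyConfiguration(
    //       ProxyConfiguration().WithHost("proxy.example.com").WithPort(443));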
    /**
     * Configuration information required to connect to websites using
     * authentication.
     *
     * You can connect to websites using basic authentication of user name and
     * password. You use a secret in Secrets Manager to store your authentication
     * credentials.
     *
     * You must provide the website host name and port number. For example, the
     * host name of https://a.example.com/page1.html is "a.example.com" and the
     * port is 443, the standard port for HTTPS.
     */
    inline const AuthenticationConfiguration& GetAuthenticationConfiguration() const { return m_authenticationConfiguration; }
    inline bool AuthenticationConfigurationHasBeenSet() const { return m_authenticationConfigurationHasBeenSet; }
    inline void SetAuthenticationConfiguration(const AuthenticationConfiguration& value) { m_authenticationConfigurationHasBeenSet = true; m_authenticationConfiguration = value; }
    inline void SetAuthenticationConfiguration(AuthenticationConfiguration&& value) { m_authenticationConfigurationHasBeenSet = true; m_authenticationConfiguration = std::move(value); }
    inline WebCrawlerConfiguration& WithAuthenticationConfiguration(const AuthenticationConfiguration& value) { SetAuthenticationConfiguration(value); return *this; }
    inline WebCrawlerConfiguration& WithAuthenticationConfiguration(AuthenticationConfiguration&& value) { SetAuthenticationConfiguration(std::move(value)); return *this; }
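    // Illustrative sketch only (continuing the crawlerConfig example above):
    // basic authentication for one protected host. The
    // BasicAuthenticationConfiguration type (host, port, Secrets Manager secret
    // ARN) and its accessors are assumed from the Amazon Kendra API reference;
    // check aws/kendra/model/AuthenticationConfiguration.h for the exact names.
    //
    //   AuthenticationConfiguration authConfig;
    //   authConfig.AddBasicAuthentication(
    //       BasicAuthenticationConfiguration()
    //           .WithHost("a.example.com")
    //           .WithPort(443)
    //           .WithCredentials("arn:aws:secretsmanager:...:secret:my-crawler-secret"));
    //   crawlerConfig.WithAuthenticationConfiguration(authConfig);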
You can connect to websites using basic authentication of * user name and password. You use a secret in Secrets * Manager to store your authentication credentials.
You must provide * the website host name and port number. For example, the host name of * https://a.example.com/page1.html is "a.example.com" and the port is 443, the * standard port for HTTPS.
*/ inline WebCrawlerConfiguration& WithAuthenticationConfiguration(AuthenticationConfiguration&& value) { SetAuthenticationConfiguration(std::move(value)); return *this;} private: Urls m_urls; bool m_urlsHasBeenSet = false; int m_crawlDepth; bool m_crawlDepthHasBeenSet = false; int m_maxLinksPerPage; bool m_maxLinksPerPageHasBeenSet = false; double m_maxContentSizePerPageInMegaBytes; bool m_maxContentSizePerPageInMegaBytesHasBeenSet = false; int m_maxUrlsPerMinuteCrawlRate; bool m_maxUrlsPerMinuteCrawlRateHasBeenSet = false; Aws::Vector