# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of # the License is located at # # http://aws.amazon.com/apache2.0/ # # or in the "license" file accompanying this file. This file is # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. """The processing job config module of SageMaker JumpStart Industry. The following configuration classes assist in providing the necessary information to configure SageMaker JumpStart Industry's processors. """ from __future__ import print_function, absolute_import from abc import ABC, abstractmethod import re import logging from typing import Any, Dict, List, Set, Union from smjsindustry.finance.constants import ( JACCARD_SUMMARIZER, KMEDOIDS_SUMMARIZER, NLP_SCORER, LOAD_DATA, SUPPORTED_SEC_FORMS, KMEDOIDS_SUMMARIZER_INIT_VALUES, KMEDOIDS_SUMMARIZER_METRIC_VALUES, ) from smjsindustry.finance.nlp_score_type import NLPScoreType logger = logging.getLogger() class FinanceProcessorConfig(ABC): """The configuration class to instantiate SageMaker JumpStart Industry processors. Args: processor_type (str): A unique dataset key. """ def __init__(self, processor_type: str): """Initializes a configuration for SageMaker JumpStart Industry processor.""" self._processor_type = processor_type @abstractmethod def get_config(self) -> Dict[str, Any]: """Returns the config to be passed to a SageMaker JumpStart Industry processor instance.""" return None @property def processor_type(self) -> str: """Gets the string of the ``processor_type`` parameter.""" return self._processor_type class JaccardSummarizerConfig(FinanceProcessorConfig): """The configuration class for ``JaccardSummarizer``. The aim of the ``JaccardSummarizer`` is to extract the main thematic sentences of the document. The ``JaccardSummarizer`` is a traditional summarizer that scores the sentences in a document using similarities. The sentences with higher similarities to other sentences in the documents are ranked higher. The top scoring sentences are selected as the summary of the document. More specifically, the similarity is calculated in terms of `Jaccard Similarity `_. The Jaccard Similarity of two sentences *A* and *B* is the ratio of the size of intersection of tokens in *A* and *B* vs the size of union of tokens in *A* and *B*. The ``JaccardSummarizer`` is based on extraction-based summarization. The extractive method is more practical because the summaries it creates are more grammatically correct and semantically relevant to the document. Abstraction-based summarization is avoided because it may alter the legal meaning of texts from SEC filings and legal financial texts that have strict meanings; small changes in the structure of a sentence may alter the legal meaning of the text. Extractive summarization also works for very long documents that cannot be easily processed with abstractive summarization. Use this configuration class to use the ``JaccardSummarizer`` algorithm when you specify the required parameter by the :class:`~smjsindustry.finance.processor.Summarizer` instance. Args: summary_size (int): The maximum number of sentences in the summary (default: 0). summary_percentage (float): The number of sentences in the summary should not exceed a ``summary_percentage`` of the sentences in the original text (default: 0.0). max_tokens (int): The max number of tokens in the summary (default: 0). cutoff (float): The similarity cut off (default: 0.0). vocabulary (Set[str]): A set of sentiment words (default: None). """ def __init__( self, summary_size: int = 0, summary_percentage: float = 0.0, max_tokens: int = 0, cutoff: float = 0.0, vocabulary: Set[str] = None, ): """Initializes a ``JaccardSummarizerConfig`` instance. Raises: TypeError: - if ``summary_size`` (int) is not an integer - if ``summary_percentage`` (float) is not a float - if ``max_tokens`` (int) is not an integer - if ``cutoff`` (float) is not a float - if ``vocabulary`` (Set[str]) is not None and not a set Or any item in the set is not a string ValueError: - if ``summary_size`` (int) is not a non-negative integer - if ``summary_percentage`` (float) is not in the range of 0 to 1 - if ``max_tokens`` (int) is not a non-negative integer - if ``cutoff`` (float) is not in the range of 0 to 1 """ super().__init__(JACCARD_SUMMARIZER) size_arguments = [summary_size, summary_percentage, max_tokens, cutoff] size_argument_count = sum([1 if arg else 0 for arg in size_arguments]) if size_argument_count != 1: raise ValueError( "Only one summary size related argument can be specified, " "choose to specify one from summary_size, summary_percentage, max_tokens, cutoff." ) if not isinstance(summary_size, int): raise TypeError("JaccardSummarizerConfig requires summary_size to be an integer.") if summary_size < 0: raise ValueError( "JaccardSummarizerConfig requires summary_size to be a non-negative integer." ) if not isinstance(summary_percentage, float): raise TypeError("JaccardSummarizerConfig requires summary_percentage to be a float.") if summary_percentage < 0 or summary_percentage > 1: raise ValueError( "JaccardSummarizerConfig requires summary_percentage to be in the range of 0 to 1." ) if not isinstance(max_tokens, int): raise ValueError("JaccardSummarizerConfig requires max_tokens to be an integer.") if max_tokens < 0: raise ValueError( "JaccardSummarizerConfig requires max_tokens to be a non-negative integer." ) if not isinstance(cutoff, float): raise ValueError("JaccardSummarizerConfig requires cutoff to be a float.") if cutoff < 0 or cutoff > 1: raise ValueError( "JaccardSummarizerConfig requires cutoff to be in the range of 0 to 1." ) if vocabulary is not None: if not isinstance(vocabulary, set) or any( not isinstance(word, str) for word in vocabulary ): raise TypeError( "JaccardSummarizerConfig requires vocabulary to be a set of strings." ) self._summary_size = summary_size self._summary_percentage = summary_percentage self._max_tokens = max_tokens self._cutoff = cutoff self._vocabulary = vocabulary def get_config(self) -> Dict[str, Union[str, int, float, Set[str]]]: """Returns the config to be passed to a SageMaker JumpStart Industry Summarizer instance.""" return { "processor_type": self.processor_type, "summary_size": self.summary_size, "summary_percentage": self.summary_percentage, "max_tokens": self.max_tokens, "cutoff": self.cutoff, "vocabulary": self.vocabulary, } @property def summary_size(self) -> int: """Gets the value of the ``summary_size`` parameter.""" return self._summary_size @property def summary_percentage(self) -> float: """Gets the value of the ``summary_percentage`` parameter.""" return self._summary_percentage @property def max_tokens(self) -> int: """Gets the value of the ``max_tokens`` parameter.""" return self._max_tokens @property def cutoff(self) -> float: """Gets the value of the ``cutoff`` parameter.""" return self._cutoff @property def vocabulary(self) -> Set[str]: """Gets the value of the ``vocabulary`` parameter.""" return self._vocabulary class KMedoidsSummarizerConfig(FinanceProcessorConfig): """Configuration class for ``KMedoidsSummarizer``. The ``KMedoidsSummarizer`` is an extractive summarizer and uses the k-medoids based approach. First, it creates sentence embeddings using `Gensim’s Doc2Vec `_. Second, k-medoids clustering is performed on the sentence vectors. Note that we use k-medoids instead of k-means clustering. Whereas k-means minimizes the total squared error from a central position in each cluster (centroid), k-medoids minimizes the sum of dissimilarities between vectors in a cluster and one of the vectors designated as the representative of that cluster; the representative vectors are called medoids. The m sentences in the document corresponding to the cluster medoids are returned as the summary. The goal of this summarizer is different from the ``JaccardSummarizer``. The ``KMedoidsSummarizer`` picks up peripheral sentences, not just the main theme of the document, in case there are items of importance that are buried in sentences different from the main theme. The ``KMedoidsSummarizer`` is based on extraction-based summarization. The extractive method is more practical because the summaries it creates are more grammatically correct and semantically relevant to the document. Abstraction-based summarization is avoided because it may alter the legal meaning of texts from SEC filings and legal financial texts that have strict meanings; small changes in the structure of a sentence may alter the legal meaning of the text. Extractive summarization also works for very long documents that cannot be easily processed with abstractive summarization. Use this configuration class to use the ``KMedoidsSummarizer`` algorithm when you specify the required parameter by the :class:`~smjsindustry.finance.processor.Summarizer` instance. Args: summary_size (int): Required. The number of sentences to be extracted. vector_size (int): The embedding dimensions (default: 100). min_count (int): The minimal word occurrences to be included (default: 0). epochs (int): The number of epochs in a training (default: 60). metric (str): The distance metric to use. Possible values are ``'euclidean'``, ``'cosine'``, ``'dot-product'`` (default: ``'euclidean'``). init (str): The value specifies medoid initialization method. Possible values are ``'random'``, ``'heuristic'``, ``'k-medoids++'``, ``'build'`` (default: ``'heuristic'``). """ def __init__( self, summary_size: int, vector_size: int = 100, min_count: int = 0, epochs: int = 60, metric: str = "euclidean", init: str = "heuristic", ): """Initializes a ``KMedoidsSummarizerConfig`` instance. Raises: TypeError: - if ``summary_size`` (int) is not an integer - if ``vector_size`` (int) is not an integer - if ``min_count`` (int) is not an integer - if ``epochs`` (int) is not an integer - if ``metric`` (str) is not a string - if ``init`` (str) is not a string ValueError: - if ``summary_size`` (int) is not a non-negative integer - if ``vector_size`` (int) is not a positive integer - if ``min_count`` (int) is not a non-negative integer - if ``epochs`` (int) is not a positive integer - if ``metric`` (str) is not from KMEDOIDS_SUMMARIZER_METRIC_VALUES - if ``init`` (str) is not from KMEDOIDS_SUMMARIZER_INIT_VALUES """ super().__init__(KMEDOIDS_SUMMARIZER) if not isinstance(summary_size, int): raise TypeError("KMedoidsSummarizerConfig requires summary_size to be an integer.") if summary_size < 0: raise ValueError( "KMedoidsSummarizerConfig requires summary_size to be a non-negative integer." ) if not isinstance(vector_size, int): raise TypeError("KMedoidsSummarizerConfig requires vector_size to be an integer.") if vector_size <= 0: raise ValueError( "KMedoidsSummarizerConfig requires vector_size to be a positive integer." ) if not isinstance(min_count, int): raise TypeError("KMedoidsSummarizerConfig requires min_count to be an integer.") if min_count < 0: raise ValueError( "KMedoidsSummarizerConfig requires min_count to be a non-negative integer." ) if not isinstance(epochs, int): raise TypeError("KMedoidsSummarizerConfig requires epochs to be an integer.") if epochs <= 0: raise ValueError("KMedoidsSummarizerConfig requires epochs to be a positive integer.") if not isinstance(metric, str): raise TypeError("KMedoidsSummarizerConfig requires metric to be a string.") if metric not in KMEDOIDS_SUMMARIZER_METRIC_VALUES: raise ValueError(f"{metric} not valid.") if not isinstance(init, str): raise TypeError("KMedoidsSummarizerConfig requires init to be a string.") if init not in KMEDOIDS_SUMMARIZER_INIT_VALUES: raise ValueError(f"{init} not valid.") self._summary_size = summary_size self._vector_size = vector_size self._min_count = min_count self._epochs = epochs self._metric = metric self._init = init def get_config(self) -> Dict[str, Union[str, int]]: """Returns the config to be passed to a SageMaker JumpStart Industry Summarizer instance.""" return { "processor_type": self.processor_type, "summary_size": self.summary_size, "vector_size": self.vector_size, "min_count": self.min_count, "epochs": self.epochs, "metric": self.metric, "init": self.init, } @property def summary_size(self) -> int: """Gets the value of the ``summary_size`` parameter.""" return self._summary_size @property def vector_size(self) -> int: """Gets the value of the ``vector_size`` parameter.""" return self._vector_size @property def min_count(self) -> int: """Gets the value of the ``min_count`` parameter.""" return self._min_count @property def epochs(self) -> int: """Gets the value of the ``epochs`` parameter.""" return self._epochs @property def metric(self) -> str: """Gets the value of the ``metric`` parameter.""" return self._metric @property def init(self) -> str: """Gets the value of the ``init`` parameter.""" return self._init class NLPScorerConfig(FinanceProcessorConfig): """Config class for :class:`~smjsindustry.finance.processor.NLPScorer`. The NLP scores report the percentage of words in a document that match a list of words, which is called a lexicon. The matching is undertaken after stemming of the document and the lexicon. NLP scoring of sentiment is based on the Vader sentiment lexicon. NLP Scoring of readability is based on the Gunning-Fog index. Use this configuration class to specify the word lists and their corresponding names that will be used when performing NLP scoring on a document. Args: nlp_score_types (List[NLPScoreType]): The score types that will be used for NLP scoring. """ def __init__(self, nlp_score_types: List[NLPScoreType]): """Initializes a ````NLPScorerConfig```` instance.""" super().__init__(NLP_SCORER) self._config = {} self._config["processor_type"] = self.processor_type self._config["score_types"] = {} if not isinstance(nlp_score_types, list): nlp_score_types = [nlp_score_types] for score_type in nlp_score_types: if not isinstance(score_type, NLPScoreType): raise TypeError( "An NLPScorerConfig must be initialized with " "either a single NLPScoreType object, or " "a list of NLPScoreType objects." ) self._config["score_types"][score_type.score_name] = score_type.word_list def get_config(self) -> Dict[str, Union[str, Dict[str, List[str]]]]: """Returns the config to be passed to a SageMaker JumpStart Industry NLPScorer instance.""" return self._config class EDGARDataSetConfig(FinanceProcessorConfig): """Config class for loading SEC filings from SEC EDGAR. It specifies the details of SEC filings required by the DataLoader. Args: tickers_or_ciks (List[str]): A list of stock tickers or CIKs. For example, ``['amzn']`` form_types (List[str]): A list of SEC form types. The supported form types are ``10-K``, ``10-Q``, ``8-K``, ``497``, ``497K``, ``S-3ASR``, ``N-1A``, ``485BXT``, ``485BPOS``, ``485APOS``, ``S-3``, ``S-3/A``, ``DEF 14A``, ``SC 13D``, and ``SC 13D/A``. For example, ``['10-K']``. filing_date_start (str): The starting filing date in the format of ``'YYYY-MM-DD'``. For example, ``'2021-01-01'``. filing_date_end (str): The ending filing date in the format of ``'YYYY-MM-DD'``. For example, ``'2021-12-31'``. email_as_user_agent (str): The user email used as a ``user_agent`` for SEC EDGAR HTTP requests. For example, ``"gecko_demo_user@amazon.com"``. """ def __init__( self, tickers_or_ciks: List[str] = None, form_types: List[str] = None, filing_date_start: str = None, filing_date_end: str = None, email_as_user_agent: str = None, ): """Initializes a ``EDGARDataSetConfig`` instance. Raises: TypeError: - if ``tickers_or_ciks`` (List[str]) is not a list OR any item in the list is not a string - if ``form_types`` (List[str]) is not a list OR any item in the list is not a string - if ``filing_date_start`` (str) is not a string - if ``filing_date_end`` (str) is not a string - if ``email_as_user_agent`` (str) is not a string ValueError: - if any item in the ``form_types`` (List[str]) is not from SUPPORTED_SEC_FORMS - if ``filing_date_start`` (str) is not in the format of 'YYYY-MM-DD' - if ``filing_date_end`` (str) is not in the format of 'YYYY-MM-DD' - if ``email_as_user_agent`` (str) is not a valid email address """ super().__init__(LOAD_DATA) if ( not tickers_or_ciks or not isinstance(tickers_or_ciks, list) or any(not isinstance(ticker_or_cik, str) for ticker_or_cik in tickers_or_ciks) ): raise TypeError("EDGARDataSetConfig requires tickers_or_ciks to be a list of strings.") if ( not form_types or not isinstance(form_types, list) or any(not isinstance(form_type, str) for form_type in form_types) ): raise TypeError("EDGARDataSetConfig requires form_types to be a list of strings.") for form_type in form_types: if form_type.upper() not in SUPPORTED_SEC_FORMS: raise ValueError(f"{form_type} not supported.") if not isinstance(filing_date_start, str): raise TypeError("EDGARDataSetConfig requires filing_date_start to be a string.") if not bool(re.match(r"^\d{4}-\d{1,2}-\d{1,2}$", filing_date_start)): raise ValueError( "EDGARDataSetConfig requires filing_date_start in the format of 'YYYY-MM-DD'." ) if not isinstance(filing_date_end, str): raise TypeError("EDGARDataSetConfig requires filing_date_end to be a string.") if not bool(re.match(r"^\d{4}-\d{1,2}-\d{1,2}$", filing_date_end)): raise ValueError( "EDGARDataSetConfig requires filing_date_end in the format of 'YYYY-MM-DD'." ) if not isinstance(email_as_user_agent, str): raise TypeError("EDGARDataSetConfig requires email_as_user_agent to be a string.") if not re.match(r"^[A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*$", email_as_user_agent): raise ValueError( "EDGARDataSetConfig requires email_as_user_agent to be a valid email address." ) self._tickers_or_ciks = tickers_or_ciks self._form_types = form_types self._filing_date_start = filing_date_start self._filing_date_end = filing_date_end self._email_as_user_agent = email_as_user_agent logger.info( "Use of SageMaker JumpStart Industry Pack is subject to the SEC terms and conditions " "governing the EDGAR database. You should conduct your own " "review of the terms to make sure they are acceptable for your " "use case before proceeding." ) def get_config(self): """Returns config to be passed to a SageMaker JumpStart Industry DataLoader instance.""" return { "processor_type": self.processor_type, "tickers_or_ciks": self.tickers_or_ciks, "form_types": self.form_types, "filing_date_start": self.filing_date_start, "filing_date_end": self.filing_date_end, "email_as_user_agent": self.email_as_user_agent, } @property def tickers_or_ciks(self): """Gets the string of the tickers_or_ciks parameter.""" return self._tickers_or_ciks @property def form_types(self): """Gets the string of the ``form_types`` parameter.""" return self._form_types @property def filing_date_start(self): """Gets the string of the ``filing_date_start`` parameter.""" return self._filing_date_start @property def filing_date_end(self): """Gets the string of the ``filing_date_end`` parameter.""" return self._filing_date_end @property def email_as_user_agent(self): """Gets the string of the ``email_as_user_agent`` parameter.""" return self._email_as_user_agent