# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""The module that builds a TabText dataframe."""
from __future__ import absolute_import

import pandas as pd

from smjsindustry.finance.utils import get_freq_label

# Name of the temporary join column that holds the frequency-normalized date.
JUMPSTART_NORMALIZED_DATE = "jumpstart-normalized-date"


def _with_normalized_date(df: pd.DataFrame, date_column: str, freq: str) -> pd.DataFrame:
    """Returns a copy of ``df`` with the normalized-date join column attached.

    Args:
        df (pandas.DataFrame): The dataframe whose date column is normalized.
        date_column (str): The name of the date column in ``df``.
        freq (str): The frequency passed through to ``get_freq_label``.

    Returns:
        pandas.DataFrame: A new dataframe; ``df`` itself is left untouched.
    """
    # Iterate over the values positionally so the result does not depend on
    # the dataframe's index (which may be non-default, unordered or duplicated).
    labels = [get_freq_label(date_value, freq) for date_value in df[date_column]]
    # ``assign`` returns a new object, so the caller's dataframe is never mutated.
    # The column name contains hyphens, hence the dict unpacking.
    return df.assign(**{JUMPSTART_NORMALIZED_DATE: labels})


def build_tabText(
    tabular_df: pd.DataFrame,
    tabular_key: str,
    tabular_date_column: str,
    text_df: pd.DataFrame,
    text_key: str,
    text_date_column: str,
    how: str = "inner",
    freq: str = "Q",
) -> pd.DataFrame:
    """Builds a TabText dataframe by joining the columns in the tabular and text dataframes.

    It joins a tabular dataframe and a text dataframe to create a TabText dataframe.
    Each row of the two dataframes must be uniquely defined by a composite key
    consisting of a key and a date column. After the date columns are normalized
    according to the given frequency, the two dataframes are merged using the
    key column and the normalized date column.

    If either ``tabular_date_column`` or ``text_date_column`` is empty or ``None``,
    no date normalization takes place and the dataframes are merged on the key
    columns only.

    The input dataframes are not modified, and they may use any index.

    Args:
        tabular_df (pandas.DataFrame): The tabular dataframe to be joined,
            requiring a date column.
        tabular_key (str): The tabular dataframe's key column to be joined on.
        tabular_date_column (str): The tabular dataframe's date column to be joined on,
            in a format of ``"yyyy-mm-dd"``, ``"yyyy-mm"``, or ``"yyyy"``.
        text_df (pandas.DataFrame): The text dataframe to be joined,
            requiring a date column.
        text_key (str): The text dataframe's key column to be joined on.
        text_date_column (str): The text dataframe's date column to be joined on,
            in a format of ``"yyyy-mm-dd"``, ``"yyyy-mm"``, or ``"yyyy"``.
        how (str): The type of join to be performed; possible values:
            ``{'left', 'right', 'outer', 'inner'}`` (default: ``'inner'``).
        freq (str): Specify how the date field should be joined, by year, quarter,
            month, week or day. Possible values: ``{'Y', 'Q', 'M', 'W', 'D'}``
            (default: ``'Q'``).

    Returns:
        pandas.DataFrame: The joined dataframe object. The temporary normalized
        date column is not part of the result.
    """
    if not (tabular_date_column and text_date_column):
        # Without both date columns there is nothing to normalize: key-only join.
        return pd.merge(tabular_df, text_df, left_on=tabular_key, right_on=text_key, how=how)

    joined = pd.merge(
        _with_normalized_date(tabular_df, tabular_date_column, freq),
        _with_normalized_date(text_df, text_date_column, freq),
        left_on=[tabular_key, JUMPSTART_NORMALIZED_DATE],
        right_on=[text_key, JUMPSTART_NORMALIZED_DATE],
        how=how,
    )
    # The normalized date only exists to drive the join; hide it from callers.
    return joined.drop(columns=[JUMPSTART_NORMALIZED_DATE])