# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
"""Synthetic dataset generation for date normalization via seq2seq text modelling

This script provides utilities for tackling date field format normalization as a conditional
language modelling task. For example, training a model with input like:

    "Convert dates to YYYY-MM-DD: 31/12/2000"

...into a target output sequence like "2000-12-31".

In the plain text case, it's relatively straightforward to generate synthetic data for this task
as shown here. By modifying the distribution of randomly generated dates, the likelihood of
different observed formats in the source document and target formats in the prompt, we can tailor
model performance to match target use-case without having to write extensive text parsing rules.
"""
# Python Built-Ins:
from dataclasses import dataclass
from datetime import datetime, timedelta
from logging import getLogger
import time
from typing import List, Optional, Sequence

# External Dependencies:
from datasets import Dataset, DatasetInfo
import numpy as np


logger = getLogger("data.seq2seq.dates")


@dataclass
class DateFormatConfig:
    """Configuration describing a date format for date normalization tasks

    Parameters
    ----------
    format_str :
        A formal `time.strftime`-compatible specifier for the date format, for example `%Y-%m-%d`.
    format_name :
        A human-friendly identifier for the format, as might be used in task prompts. For example
        `YYYY-MM-DD` for a prompt like "Convert dates to YYYY-MM-DD".
    observed_weight :
        Weight/frequency with which this date format will be observed in content, for synthetic
        data generation. Does not need to be normalized to 1.0 across all your configured formats,
        because the dataset generator will ensure this for you.
    target_weight :
        Weight/frequency with which this date format will be used as the target for prompting, for
        synthetic data generation. Does not need to be normalized to 1.0 across all your configured
        formats, because the dataset generator will ensure this for you.
    """

    format_str: str
    format_name: str
    observed_weight: float
    target_weight: float


# Format configuration for synthetic date normalization training data generation.
#
# Only the first five formats have a non-zero `target_weight`, so only those are ever requested as
# the output format in a prompt; every other entry is used purely as an observed (input) format.
DATE_FORMAT_CONFIGS = [
    DateFormatConfig("%Y-%m-%d", "YYYY-MM-DD", observed_weight=0.1, target_weight=0.7),
    DateFormatConfig("%m/%d/%y", "MM/DD/YY", observed_weight=0.35, target_weight=0.05),
    DateFormatConfig("%m/%d/%Y", "MM/DD/YYYY", observed_weight=0.35, target_weight=0.2),
    DateFormatConfig("%d/%m/%y", "DD/MM/YY", observed_weight=0.05, target_weight=0.02),
    DateFormatConfig("%d/%m/%Y", "DD/MM/YYYY", observed_weight=0.04, target_weight=0.03),
    # Including day names and month names:
    DateFormatConfig("%A %b %d %y", "DDDD MMM DD YY", observed_weight=0.03, target_weight=0.0),
    DateFormatConfig("%A, %b %d %y", "DDDD, MMM DD YY", observed_weight=0.02, target_weight=0.0),
    DateFormatConfig("%a %b %d, %y", "DDD MMM DD, YY", observed_weight=0.02, target_weight=0.0),
    DateFormatConfig("%a. %b %d %y", "DDD. MMM DD YY", observed_weight=0.02, target_weight=0.0),
    # The ordinal suffixes below are literal text appended after the zero-padded `%d`; they are
    # not matched to the actual day number (so e.g. "03st" can be generated).
    DateFormatConfig("%A %b %dst %y", "DDDD MMM DDst YY", observed_weight=0.01, target_weight=0.0),
    DateFormatConfig("%A %b %dnd %y", "DDDD MMM DDnd YY", observed_weight=0.01, target_weight=0.0),
    DateFormatConfig("%A %b %drd %y", "DDDD MMM DDrd YY", observed_weight=0.01, target_weight=0.0),
    DateFormatConfig("%A %b %dth %y", "DDDD MMM DDth YY", observed_weight=0.01, target_weight=0.0),
    DateFormatConfig("%a %d %b %y", "DDD DD MMM YY", observed_weight=0.02, target_weight=0.0),
    DateFormatConfig("%a. %d %b %y", "DDD. DD MMM YY", observed_weight=0.02, target_weight=0.0),
    # Including times:
    DateFormatConfig(
        "%Y-%m-%d %H:%M:%S", "YYYY-MM-DD HH:mm:ss", observed_weight=0.02, target_weight=0.0
    ),
    DateFormatConfig("%d/%m/%y %H:%M", "DD/MM/YY HH:mm", observed_weight=0.02, target_weight=0.0),
    DateFormatConfig("%H:%M %d/%m/%y", "HH:mm DD/MM/YY", observed_weight=0.02, target_weight=0.0),
    DateFormatConfig(
        "%I:%M%p %d/%m/%Y", "hh:mmp DD/MM/YYYY", observed_weight=0.02, target_weight=0.0
    ),
    DateFormatConfig("%H:%M %d/%m/%Y", "HH:mm DD/MM/YYYY", observed_weight=0.02, target_weight=0.0),
    DateFormatConfig(
        "%d/%m/%Y %I:%M%p", "DD/MM/YYYY hh:mmp", observed_weight=0.02, target_weight=0.0
    ),
    DateFormatConfig("%d/%m/%Y %H:%M", "DD/MM/YYYY HH:mm", observed_weight=0.02, target_weight=0.0),
    # NOTE(review): this entry repeats the time-less "MM/DD/YY" format already listed at the top
    # (adding 0.02 to its observed weight) - possibly a date+time variant was intended; confirm.
    DateFormatConfig("%m/%d/%y", "MM/DD/YY", observed_weight=0.02, target_weight=0.0),
    DateFormatConfig(
        "%d/%m/%y %I:%M%p", "DD/MM/YY hh:mmp", observed_weight=0.02, target_weight=0.0
    ),
    # NOTE(review): "DD/MM/YY HH:mm" is also listed earlier in this section - confirm whether the
    # repetition (which doubles its observed weight) is intentional.
    DateFormatConfig("%d/%m/%y %H:%M", "DD/MM/YY HH:mm", observed_weight=0.02, target_weight=0.0),
]


def _as_naive_datetime(value: time.struct_time) -> datetime:
    """Build a naive `datetime` from the calendar fields of a `struct_time` (or 9-tuple)"""
    year, month, day, hour, minute, second = value[:6]
    # struct_time permits leap seconds (tm_sec of 60/61) which `datetime` rejects, so clamp them:
    return datetime(year, month, day, hour, minute, min(second, 59))


def _normalized_weights(weights: Sequence[float], label: str) -> List[float]:
    """Validate sampling weights and rescale them to sum to 1.0

    Parameters
    ----------
    weights :
        Raw (relative) weights, one per date format configuration.
    label :
        Name of the weight field, used in log and error messages.

    Returns
    -------
    weights :
        The weights divided by their total (or unchanged if they already sum to exactly 1.0).

    Raises
    ------
    ValueError
        If any weight is negative, or if a non-empty set of weights sums to zero.
    """
    weights = list(weights)
    if any(w < 0 for w in weights):
        raise ValueError(f"{label} must be non-negative, got {weights}")
    total = sum(weights)
    # An empty list is passed through untouched so that the sampler reports the problem (or
    # accepts it, when no samples are requested) exactly as it would for any other caller.
    if weights and total <= 0:
        raise ValueError(
            f"{label} must include at least one positive weight (summed to {total})"
        )
    if total != 1.0:
        logger.info("Normalizing %s (summed to %s)", label, total)
        weights = [w / total for w in weights]
    return weights


def random_times_between(
    start: time.struct_time,
    end: time.struct_time,
    n: int = 1,
    rng: Optional[np.random.Generator] = None,
) -> List[time.struct_time]:
    """Generate uniformly random datetimes between `start` and `end`

    The window is treated as naive calendar time: only the year/month/day/hour/minute/second
    fields of `start` and `end` are used. Sampling therefore does not go through
    `time.mktime()`/`time.localtime()`, which the standard library documents as potentially
    unable to handle dates before the epoch or far in the future, and whose results depend on the
    time zone of the machine running the code.

    Parameters
    ----------
    start :
        Start of the date/time window (Generate with e.g. `time.strptime()`).
    end :
        End of the date/time window (Generate with e.g. `time.strptime()`).
    n :
        Number of samples to generate.
    rng :
        Optional numpy random generator. Provide this to speed things up and enable
        reproducibility.

    Returns
    -------
    datetimes :
        List of `n` generated date/times in the given window. You can convert these to string
        representations via e.g. `time.strftime()`.

    Raises
    ------
    ValueError
        If the fields of `start` or `end` do not describe a valid calendar date/time.
    """
    # Create a RNG if one was not provided:
    if rng is None:
        rng = np.random.default_rng()

    # Work in naive datetimes so randomized offsets can be added with plain arithmetic:
    window_start = _as_naive_datetime(start)
    window_seconds = (_as_naive_datetime(end) - window_start).total_seconds()

    # Generate random offsets as a 0-1 proportion through the window:
    props = rng.uniform(size=n)

    # timetuple() converts each sampled datetime back to a full time struct:
    return [
        (window_start + timedelta(seconds=float(p) * window_seconds)).timetuple() for p in props
    ]


def generate_seq2seq_date_norm_dataset(
    n: int,
    configs: Sequence[DateFormatConfig] = DATE_FORMAT_CONFIGS,
    from_date: time.struct_time = time.strptime("1950-01-01", "%Y-%m-%d"),
    to_date: time.struct_time = time.strptime("2050-01-01", "%Y-%m-%d"),
    rng: Optional[np.random.Generator] = None,
) -> Dataset:
    """Generate a synthetic seq2seq task dataset for date normalization in text

    Parameters
    ----------
    n :
        Number of examples to generate
    configs :
        Sequence of date format configuration objects describing the date formats to use and their
        relative frequencies in source texts and target requests.
    from_date :
        Start of the date window that randomly generated dates should fall within.
    to_date :
        End of the date window that randomly generated dates should fall within.
    rng :
        Optional numpy random generator object. Provide this if you want reproducibility.

    Returns
    -------
    dataset :
        Hugging Face datasets.Dataset with fields `src_texts` (the input prompts) and `tgt_texts`
        (the target outputs) for each generated example.

    Raises
    ------
    ValueError
        If any observed/target weight in `configs` is negative, or if either set of weights sums
        to zero (so that no format could be sampled).
    """
    # Create a RNG if one was not provided:
    if rng is None:
        rng = np.random.default_rng()

    # NOTE: The order of the draws below (observed formats, then target formats, then dates) is
    # what determines the output for a given seeded `rng`, so keep it stable.

    # Normalize the observed_weights and select an observed format for the `n` input texts:
    observed_weights = _normalized_weights(
        [fmt.observed_weight for fmt in configs], "observed_weights"
    )
    obs_choices = rng.choice(
        len(observed_weights),
        p=observed_weights,
        size=(n,),
        replace=True,
    )

    # Normalize the target_weights and select a requested format for the `n` prompts:
    target_weights = _normalized_weights([fmt.target_weight for fmt in configs], "target_weights")
    target_choices = rng.choice(
        len(target_weights),
        p=target_weights,
        size=(n,),
        replace=True,
    )

    # Generate the `n` prompts & answers:
    random_dates = random_times_between(from_date, to_date, n=n, rng=rng)
    prompts = []
    answers = []
    for obs_ix, target_ix, random_date in zip(obs_choices, target_choices, random_dates):
        obs_config = configs[obs_ix]
        target_config = configs[target_ix]
        observed_text = time.strftime(obs_config.format_str, random_date)
        prompts.append(f"Convert dates to {target_config.format_name}: {observed_text}")
        answers.append(time.strftime(target_config.format_str, random_date))

    return Dataset.from_dict(
        {
            "src_texts": prompts,
            "tgt_texts": answers,
        },
        info=DatasetInfo(
            description="Synthetic dataset for T5-style seq2seq normalization of dates",
        ),
    )