#
#   Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#   SPDX-License-Identifier: MIT-0
#
from typing import TypedDict, List

from api_python_client.model.form_json_schema import FormJSONSchema

from aws_lambdas.utils.textract.extraction import ordered_object_schema_property_keys


class TextractQuery(TypedDict):
    """A single textract query entry, in the shape produced by get_queries_from_schema."""

    # Identifier for the query. get_queries_from_schema sets this to the "."-joined path of property keys leading
    # to the schema property that declared the query.
    Alias: str
    # Pages the query applies to. get_queries_from_schema always uses ["*"] (all pages).
    Pages: List[str]
    # The query text, taken from the schema property's extractionMetadata.textractQuery.
    Text: str


def get_queries_from_schema(
    schema: FormJSONSchema, alias: str = ""
) -> List[TextractQuery]:
    """
    Given a document schema, find all optional textract queries specified and return as a flat list to be passed in the
    query configuration for textract.

    The schema is walked recursively based on its "typeOf":
      - "object": the queries of every property are collected, visiting properties in the order returned by
        ordered_object_schema_property_keys. The alias of a nested property is its parent's alias and its own key
        joined with "." (top-level properties use just their key).
      - "array": contributes no queries (see the comment in the body).
      - anything else is treated as a primitive, which contributes a single query when it has a non-null
        extractionMetadata.textractQuery. A missing or null extractionMetadata means "no query".

    Args:
        schema: The (sub)schema to search for queries.
        alias: Dot-separated path of property keys leading to this schema. Empty for the root schema.

    Returns:
        One TextractQuery per primitive property that specifies a query, each applying to all pages. Empty when no
        queries are specified.
    """
    schema_type = schema["typeOf"]

    if schema_type == "object":
        queries: List[TextractQuery] = []
        for property_key in ordered_object_schema_property_keys(schema):
            # Only insert the "." separator when there is a parent alias to separate from
            child_alias = f"{alias}.{property_key}" if alias else property_key
            queries.extend(
                get_queries_from_schema(schema["properties"][property_key], child_alias)
            )
        return queries

    if schema_type == "array":
        # Ignore any queries specified in array types. Queries are for a single question, single answer extraction, and
        # we don't know how many items we need to query for upfront. If min/max lengths are set for the array it might
        # be possible to ask a variant of the query for each individual item.
        return []

    # Primitive types (string, integer etc)
    # The metadata may be absent or explicitly null; both mean there is no query for this property. Checking for
    # None before the membership test avoids a TypeError on a null extractionMetadata.
    metadata = schema["extractionMetadata"] if "extractionMetadata" in schema else None
    if metadata is None:
        return []

    query_text = metadata["textractQuery"] if "textractQuery" in metadata else None
    if query_text is None:
        return []

    return [
        {
            "Alias": alias,
            "Pages": ["*"],  # all pages
            "Text": query_text,
        }
    ]