"""Preprocessing utilities for the headline classification notebooks"""
# Python Built-Ins:
import gzip
import os
import subprocess
import time
from typing import Optional

# External Dependencies:
import numpy as np
from sklearn import preprocessing
import torchtext  # NOTE: uses the legacy `torchtext.data.Field` API (torchtext < 0.9)


def wait_for_file_stable(path: str, stable_secs: int = 60, poll_secs: Optional[float] = None) -> bool:
    """Wait for a file to become stable (not recently modified) and return whether it exists

    Returns False if the file does not exist. Raises FileNotFoundError if the file is deleted
    during polling.

    When running the two notebooks in parallel, this helps to avoid errors caused by initiating
    multiple downloads/extractions/etc. of the same file at the same time.
    """
    if poll_secs is None:
        poll_secs = stable_secs / 4
    try:
        init_stat = os.stat(path)
    except FileNotFoundError:
        return False
    if (time.time() - init_stat.st_mtime) < stable_secs:
        print(f"Waiting for file to stabilize... {path}")
        while (time.time() - os.stat(path).st_mtime) < stable_secs:
            time.sleep(poll_secs)
        print("File ready")
    return True


def dummy_encode_labels(df, label):
    """One-hot encode the `label` column of DataFrame `df`

    Returns an (n_samples, n_classes) float32 one-hot matrix and the array of class names.
    """
    encoder = preprocessing.LabelEncoder()
    encoded_y = encoder.fit_transform(df[label].values)
    num_classes = len(encoder.classes_)
    # Convert integer class indices to dummy variables (i.e. one-hot encoded):
    dummy_y = np.eye(num_classes, dtype="float32")[encoded_y]
    return dummy_y, encoder.classes_


def tokenize_and_pad_docs(df, columns, max_length=40):
    """Tokenize the text in `df[columns]` and pad each document to `max_length` tokens

    Returns the numericalized (token ID) array and the fitted torchtext Field, whose vocabulary
    is needed later to look up word embeddings.
    """
    docs = df[columns].values

    # `torchtext.data.Field` was moved to `torchtext.legacy.data` in v0.9 and later removed, so
    # this module assumes an older torchtext release:
    t = torchtext.data.Field(
        lower=True,
        tokenize="basic_english",
        fix_length=max_length,
    )
    docs = list(map(t.preprocess, docs))
    padded_docs = t.pad(docs)
    t.build_vocab(padded_docs)
    print(f"Vocabulary size: {len(t.vocab)}")
    # Map each token to its integer ID in the vocabulary:
    numericalized_docs = [[t.vocab.stoi[token] for token in doc] for doc in padded_docs]
    print(f"Number of headlines: {len(numericalized_docs)}")
    return np.array(numericalized_docs), t


def get_word_embeddings(t, folder, lang="en"):
    """Download pre-trained fastText word vectors and build an embedding matrix for tokenizer `t`

    Any tokens in `t` not found in the (300-dimensional, Common Crawl) fastText vectors are
    mapped to all-zeros.
    """
    vecs_url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.{lang}.300.vec.gz"
    vecs_gz_filename = vecs_url.rpartition("/")[2]
    os.makedirs(folder, exist_ok=True)
    vecs_gz_filepath = os.path.join(folder, vecs_gz_filename)

    tokenizer_vocab_size = len(t.vocab)

    if wait_for_file_stable(vecs_gz_filepath):
        print("Using existing embeddings file")
    else:
        print("Downloading word vectors...")
        # -N only re-downloads if the remote copy is newer; -P saves into `folder`:
        subprocess.run(["wget", "-NP", folder, vecs_url], check=True)

    print("Loading into memory...")
    embeddings_index = {}
    with gzip.open(vecs_gz_filepath, "rt") as zipf:
        # First line of the .vec format is "<vocab_size> <dimension>":
        emb_vocab_size, emb_d = map(int, zipf.readline().split())
        for line in zipf:
            values = line.split()
            word = values[0]
            # Only load the subset of the embeddings recognised by the tokenizer:
            if word in t.vocab.stoi:
                coefs = np.asarray(values[1:], dtype="float32")
                embeddings_index[word] = coefs
    print(
        f"Loaded {len(embeddings_index)} of {emb_vocab_size} word vectors for tokenizer "
        f"vocabulary length {tokenizer_vocab_size}"
    )

    # Create a weight matrix for words in the training docs (rows for unknown words stay zero):
    embedding_matrix = np.zeros((tokenizer_vocab_size, emb_d))
    for word, i in t.vocab.stoi.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
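

# --- Example usage (illustrative sketch only) ---
# The CSV path and the "CATEGORY" (label) / "TITLE" (text) column names below are assumptions
# for demonstration, not defined by this module; substitute the dataset your notebooks use.
if __name__ == "__main__":
    import pandas as pd

    df = pd.read_csv("data/train.csv")  # hypothetical input file with TITLE and CATEGORY columns
    labels, class_names = dummy_encode_labels(df, "CATEGORY")
    features, tokenizer = tokenize_and_pad_docs(df, "TITLE", max_length=40)
    embedding_matrix = get_word_embeddings(tokenizer, "data/embeddings", lang="en")
    print(f"Features {features.shape}, labels {labels.shape}, embeddings {embedding_matrix.shape}")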