import csv
import re
import sys

# Supported Languages
SUPPORTED_LANGS = ["en", "tr"]

USAGE = "Usage: vocab-validation csvFilename {language}"

# Column order of the generated tab-separated table.
OUTPUT_FIELDS = ['Phrase', 'DisplayAs', 'IPA', 'SoundsLike']


def build_rules(pairs, chars, text_class):
    """Bundle the validation tables for one language.

    pairs      -- multi-character IPA symbols that must stay together
    chars      -- single-character IPA symbols that are allowed
    text_class -- body of the regex character class (without brackets)
                  accepted in the Phrase and SoundsLike columns

    Returns a dict with:
      "combos" -- (split, combo) tuples; `split` is the pair with a space
                  between its characters, i.e. what it looks like after the
                  IPA string has been exploded character by character
      "tokens" -- frozenset of every accepted IPA token (chars + pairs)
      "text"   -- compiled pattern for Phrase / SoundsLike, meant to be used
                  with fullmatch() so the WHOLE field is checked
    """
    return {
        "combos": [(" ".join(pair), pair) for pair in pairs],
        "tokens": frozenset(chars) | frozenset(pairs),
        "text": re.compile("[" + text_class + "]+"),
    }


# NOTE: in every character class below the hyphen is written LAST so that it
# is a literal "-".  Placed between two characters it would define a range
# (e.g. "'-Ç" would also accept digits and most ASCII punctuation).
LANGUAGE_RULES = {
    # English values.  The last two pairs are a base letter followed by a
    # combining diacritic, so each of them is two code points long.
    "en": build_rules(
        pairs=["aʊ", "aɪ", "eɪ", "ɔɪ", "oʊ", "n̩", "l̩"],
        # Both the ASCII "g" and the IPA "ɡ" are accepted.
        chars=["w", "ɪ", "z", "b", "æ", "d", "ð", "ŋ", "f", "ɑ", "g", "ɔ", "h", "i", "ə",
               "j", "ɛ", "k", "ɝ", "l", "ɡ", "m", "ɹ", "n", "ʃ", "ʊ", "ʌ", "p", "ʍ", "s",
               "ʒ", "t", "ʤ", "u", "ʧ", "v", "θ"],
        text_class="a-zA-Z.'-",
    ),
    # Turkish values
    "tr": build_rules(
        pairs=["aː", "eː", "iː", "oː", "uː", "yː", "øː", "ɯː"],
        chars=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
               "p", "s", "t", "u", "v", "w", "y", "z", "ø", "ŋ", "ɟ", "ɣ", "ɫ", "ɯ", "ɾ",
               "ʃ", "ʒ", "ʔ", "ʤ", "ʧ"],
        text_class="a-zA-Z.'ÇÖÜâäçèéêíîóöûüĞğİıŞşšž-",
    ),
}


def parse_args(argv):
    """Return (filename, langCode) from a sys.argv-style list.

    The language code is optional, is lower-cased, and defaults to "en".
    Prints the usage text and exits with status 2 when the argument count is
    wrong or the language is not in SUPPORTED_LANGS.
    """
    num_args = len(argv)
    if num_args < 2 or num_args > 3:
        print(USAGE)
        sys.exit(2)

    filename = argv[1]
    lang_code = argv[2].lower() if num_args == 3 else "en"
    if lang_code not in SUPPORTED_LANGS:
        print(USAGE)
        print("-- {0} is not in the supported language list: {1}".format(lang_code, SUPPORTED_LANGS))
        sys.exit(2)
    return filename, lang_code


def format_ipa(ipa_text, combos):
    """Split an IPA string into tokens, keeping known pairs together.

    Every character is first separated by a space; each (split, combo) entry
    in `combos` then glues its pair back together, in list order.  Returns
    the list of tokens, or an empty list for an empty string.
    """
    if not ipa_text:
        return []
    spaced = " ".join(ipa_text)
    for split, combo in combos:
        spaced = spaced.replace(split, combo)
    return spaced.split(" ")


def validate_row(row, rules):
    """Validate one CSV row: Phrase, DisplayAs, SoundsLike, IPA.

    Returns (entry, errors).  `errors` is a list of "Failed: ..." messages
    and is empty when the row is valid.  `entry` is the dict destined for the
    output table, or None when the row has fewer than four columns.
    """
    if len(row) < 4:
        return None, ["Failed: expected 4 columns (Phrase, DisplayAs, SoundsLike, IPA) "
                      "but found {0}: {1}".format(len(row), row)]

    phrase, display_as, sounds_like, ipa_text = row[:4]
    text_pattern = rules["text"]
    errors = []

    # Phrase: allowed characters only, and it must not end with a hyphen.
    if not text_pattern.fullmatch(phrase) or phrase.endswith("-"):
        errors.append("Failed: phrase [" + phrase + "] violates allowed characters")

    # SoundsLike is optional; when present the whole value must be made of
    # allowed characters.  Spaces get their own, more specific, message.
    if " " in sounds_like:
        errors.append("Failed: soundsLike [" + sounds_like + "] contains SPACE characters")
    elif sounds_like and not text_pattern.fullmatch(sounds_like):
        errors.append("Failed: soundsLike [" + sounds_like + "] violates allowed characters")

    # IPA is optional; it must be supplied without spaces because the token
    # separation is generated here.
    tokens = []
    if " " in ipa_text:
        errors.append("Failed: IPA phrase [" + ipa_text + "] contains SPACE characters")
    elif ipa_text:
        tokens = format_ipa(ipa_text, rules["combos"])
        for token in tokens:
            if token not in rules["tokens"]:
                errors.append("Failed: found [" + token + "] in " + phrase)

    entry = {'Phrase': phrase, 'DisplayAs': display_as,
             'SoundsLike': sounds_like, 'IPA': " ".join(tokens)}
    return entry, errors


def validate_rows(rows, rules):
    """Validate every data row, printing progress as it goes.

    `rows` is an iterable of lists of strings (e.g. a csv.reader).  The first
    row is treated as the header and skipped; completely empty rows are
    skipped too.  Processing stops at the first invalid row.

    Returns (entries, passed): the entries accepted so far, and False when a
    row failed validation.
    """
    entries = []
    for line_number, row in enumerate(rows, start=1):
        if line_number == 1 or not row:
            continue

        entry, errors = validate_row(row, rules)
        if errors:
            for message in errors:
                print(message)
            return entries, False

        print("Phrase: " + entry['Phrase'] + " : " + entry['IPA'] + entry['SoundsLike'])
        entries.append(entry)
    return entries, True


def write_table(path, entries):
    """Write the entries to `path` as a UTF-8, tab-separated table."""
    with open(path, 'w', newline='', encoding='utf-8') as table:
        table.write('\t'.join(OUTPUT_FIELDS) + '\n')
        for entry in entries:
            table.write('\t'.join(entry[name] for name in OUTPUT_FIELDS) + '\n')


def main(argv):
    """Validate the CSV named on the command line and write `<csv>.txt`.

    Exits with status 1, without writing the table, when validation fails,
    so that a truncated table is never mistaken for a complete one.
    """
    filename, lang_code = parse_args(argv)
    rules = LANGUAGE_RULES[lang_code]

    # The data contains IPA / Turkish characters, so do not depend on the
    # platform's default encoding.
    with open(filename, newline='', encoding='utf-8') as csvfile:
        entries, passed = validate_rows(csv.reader(csvfile), rules)

    if not passed:
        sys.exit(1)

    write_table(filename + '.txt', entries)


if __name__ == "__main__":
    main(sys.argv)