# OpenAI API Completions Inference for JAQKET dataset

[OpenAI API](https://platform.openai.com/) の [Completions API](https://platform.openai.com/docs/guides/gpt/completions-api) で [JAQKET](https://www.nlp.ecei.tohoku.ac.jp/projects/jaqket/) のデータセットに回答するコードです。

Notebook を動かす前に、 OpenAI API の API KEY と Organization ID の確認が必要です。いずれも Manage Account のメニューから確認できます。取得した API KEY と Organization ID は、 `os.environ` などを使い、それぞれ環境変数 `OPENAI_API_KEY` と `OPENAI_ORGANIZATION` に事前に設定しておいてください。これらは機微な情報のため、 GitHub などに誤ってコミットしないよう注意してください。

## Setup

In [None]:
!pip install openai requests

In [None]:
import os
import json
import requests
import pandas as pd
import openai


# Credentials are read from the environment so they never appear in the notebook.
OPENAI_ORGANIZATION = os.environ.get("OPENAI_ORGANIZATION")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# URL of the JAQKET development file (JSON Lines) that the notebook downloads.
JAQKET_DEV_DATASET = (
    "https://jaqket.s3.ap-northeast-1.amazonaws.com/data/aio_02/aio_02_dev_v1.0.jsonl"
)

# Stop right away when either credential is missing instead of failing later
# in the middle of an API call.
if any(value is None for value in (OPENAI_ORGANIZATION, OPENAI_API_KEY)):
    raise Exception(
        "Please set the OPENAI_ORGANIZATION and OPENAI_API_KEY environment variables for organization and api key."
    )

## Prepare dataset

In [None]:
def read_jaqket_dev() -> pd.DataFrame:
    """Load the JAQKET development set, downloading it on first use.

    The file behind ``JAQKET_DEV_DATASET`` is cached as ``data/<file name>``
    relative to the current working directory. When that file already exists
    it is read directly and nothing is downloaded.

    Returns:
        The file parsed as JSON Lines (one record per line).

    Raises:
        requests.HTTPError: If the download answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    file_name = os.path.basename(JAQKET_DEV_DATASET)
    location = f"data/{file_name}"
    if not os.path.exists(location):
        response = requests.get(JAQKET_DEV_DATASET, timeout=60)
        # Fail before writing, so an error page is never cached as the dataset.
        response.raise_for_status()
        # The cache directory may not exist yet; open() would fail without it.
        os.makedirs(os.path.dirname(location), exist_ok=True)
        with open(location, mode="wb") as f:
            f.write(response.content)

    return pd.read_json(location, lines=True)

## Answer a quiz question with the Completions API

In [None]:
import re

def _extract_answer(completion: str) -> str:
    """Return the answer quoted with 「」 in *completion*.

    The prompt ends with an opening 「, so the completion continues inside
    that bracket. The bracket is put back in front of the completion before
    searching, which keeps quoted spans from the question text out of the
    result.
    """
    quoted = re.findall(r"「(.*?)」", "「" + completion)
    if quoted:
        # Keep the last quoted span of the completion.
        return quoted[-1]
    # No closing 」 at all (for example the completion was cut off by
    # max_tokens): return the raw text rather than raising IndexError.
    return completion.strip()


def answer(model: str, question: str) -> str:
    """Ask the Completions API one quiz question and return the answer text.

    Args:
        model: Name of the completion model to query.
        question: Quiz question inserted into the prompt template.

    Returns:
        The last 「」-quoted span of the completion, or the stripped
        completion text when it contains no closing 」.
    """
    openai.organization = OPENAI_ORGANIZATION
    openai.api_key = OPENAI_API_KEY

    # The prompt deliberately ends with an open 「 so that the model writes
    # the answer first and then closes the bracket.
    template = "日本語のクイズに答えてください。\n{instruction}\n答えは「"
    prompt = template.format(instruction=question)

    response = openai.Completion.create(
        model=model,
        prompt=prompt,
        max_tokens=64,
        temperature=0,
        top_p=1,
        n=1,
        stop="\n",  # generation ends at the first newline: one-line completion
    )

    return _extract_answer(response["choices"][0]["text"])

## Answer the whole dataset

In [None]:
from tqdm import tqdm


def answer_jaqket(
    question_df: pd.DataFrame, model: str = "text-davinci-003"
) -> pd.DataFrame:
    """Answer every question in *question_df* and record whether it matched.

    Two columns are added to *question_df* in place, and the same object is
    returned: ``chatgpt_answer`` holds the text returned by ``answer`` and
    ``match`` holds whether that text is contained in the row's ``answers``.

    Args:
        question_df: Frame with a ``question`` and an ``answers`` column.
        model: Completion model name passed through to ``answer``.

    Returns:
        *question_df* itself, with the two columns added.
    """
    chatgpt_answers = []
    matches = []
    # iterrows() has no length, so the total is passed for the progress bar.
    for _, row in tqdm(question_df.iterrows(), total=len(question_df)):
        chatgpt_answer = answer(model, row["question"])
        chatgpt_answers.append(chatgpt_answer)
        # NOTE(review): presumably row["answers"] is a list of accepted
        # answers, making this an exact-match test - confirm against the data.
        matches.append(chatgpt_answer in row["answers"])

    # Assign the plain lists (positional). Wrapping them in pd.Series would
    # give them a 0..n-1 index that is aligned on labels, which yields NaN or
    # shifted values for a frame with any other index (e.g. a filtered subset).
    question_df["chatgpt_answer"] = chatgpt_answers
    question_df["match"] = matches
    return question_df


# Run the development set through the model and store the result as CSV.
answer_file_name = "jaqket_answers.csv"
dev_questions = read_jaqket_dev()
answers = answer_jaqket(dev_questions, model="text-davinci-003")
output_path = f"data/{answer_file_name}"
answers.to_csv(output_path, index=False)