{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Amazon SageMaker と Amazon QuickSight による自然言語処理ダッシュボードの作成" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "オープンソースの形態素解析ツールである GiNZA を Amazon SageMaker ノートブックに導入し、テキストからワード、係り受けを抽出、抽出した結果を Amazon QuickSight に取り込み、分析可能なダッシュボードを作成します。 詳細は以下のブログを参照してください。\n", "\n", "https://aws.amazon.com/jp/blogs/news/amazon-sagemaker-amazon-quicksight-nlp-dashboard/" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 0. 準備" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### GiNZA のインストール\n", "- 形態素解析で使用するライブラリをインストールします。インストール後にKernelを再起動します。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com\n", "Collecting ginza\n", " Using cached ginza-5.1.2-py3-none-any.whl (20 kB)\n", "Collecting ja-ginza\n", " Using cached ja_ginza-5.1.2-py3-none-any.whl (59.1 MB)\n", "Collecting spacy<3.5.0,>=3.2.0\n", " Using cached spacy-3.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)\n", "Collecting plac>=1.3.3\n", " Using cached plac-1.3.5-py2.py3-none-any.whl (22 kB)\n", "Collecting SudachiPy<0.7.0,>=0.6.2\n", " Using cached SudachiPy-0.6.6-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.2 MB)\n", "Collecting SudachiDict-core>=20210802\n", " Using cached SudachiDict-core-20221021.tar.gz (9.0 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25ldone\n", "\u001b[?25hCollecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4\n", " Using cached pydantic-1.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.6 MB)\n", "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.2.0->ginza) (4.62.3)\n", "Collecting pathy>=0.3.5\n", " Using cached pathy-0.10.0-py3-none-any.whl (48 kB)\n", "Collecting srsly<3.0.0,>=2.4.3\n", " Using cached srsly-2.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (492 kB)\n", "Requirement already satisfied: numpy>=1.15.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.2.0->ginza) (1.20.3)\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.2.0->ginza) (2.26.0)\n", "Collecting thinc<8.2.0,>=8.1.0\n", " Using cached thinc-8.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (819 kB)\n", "Requirement already satisfied: packaging>=20.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.2.0->ginza) (21.3)\n", "Collecting spacy-legacy<3.1.0,>=3.0.10\n", " Using cached spacy_legacy-3.0.10-py2.py3-none-any.whl (21 kB)\n", "Collecting spacy-loggers<2.0.0,>=1.0.0\n", " Using cached spacy_loggers-1.0.3-py3-none-any.whl (9.3 kB)\n", "Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.2.0->ginza) (59.4.0)\n", "Collecting cymem<2.1.0,>=2.0.2\n", " Using cached cymem-2.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36 kB)\n", "Collecting catalogue<2.1.0,>=2.0.6\n", " Using cached catalogue-2.0.8-py3-none-any.whl (17 kB)\n", "Collecting langcodes<4.0.0,>=3.2.0\n", " Using cached langcodes-3.3.0-py3-none-any.whl (181 kB)\n", "Collecting typer<0.8.0,>=0.3.0\n", " Using cached 
typer-0.7.0-py3-none-any.whl (38 kB)\n", "Requirement already satisfied: jinja2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.2.0->ginza) (3.0.3)\n", "Collecting wasabi<1.1.0,>=0.9.1\n", " Using cached wasabi-0.10.1-py3-none-any.whl (26 kB)\n", "Collecting murmurhash<1.1.0,>=0.28.0\n", " Using cached murmurhash-1.0.9-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21 kB)\n", "Collecting preshed<3.1.0,>=3.0.2\n", " Using cached preshed-3.0.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)\n", "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from packaging>=20.0->spacy<3.5.0,>=3.2.0->ginza) (3.0.6)\n", "Collecting smart-open<6.0.0,>=5.2.1\n", " Using cached smart_open-5.2.1-py3-none-any.whl (58 kB)\n", "Collecting typing-extensions>=4.1.0\n", " Using cached typing_extensions-4.4.0-py3-none-any.whl (26 kB)\n", "Requirement already satisfied: certifi>=2017.4.17 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.2.0->ginza) (2021.10.8)\n", "Requirement already satisfied: idna<4,>=2.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.2.0->ginza) (3.1)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.2.0->ginza) (1.26.8)\n", "Requirement already satisfied: charset-normalizer~=2.0.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.2.0->ginza) (2.0.8)\n", "Collecting blis<0.8.0,>=0.7.8\n", " Downloading blis-0.7.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.2/10.2 MB\u001b[0m \u001b[31m29.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting confection<1.0.0,>=0.0.1\n", " Downloading confection-0.0.3-py3-none-any.whl (32 kB)\n", "Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.5.0,>=3.2.0->ginza) (8.0.3)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from jinja2->spacy<3.5.0,>=3.2.0->ginza) (2.0.1)\n", "Building wheels for collected packages: SudachiDict-core\n", " Building wheel for SudachiDict-core (setup.py) ... 
\u001b[?25ldone\n", "\u001b[?25h Created wheel for SudachiDict-core: filename=SudachiDict_core-20221021-py3-none-any.whl size=71574782 sha256=423df534e443b9cd7be9b4a4d4daaab2f9ab82fc329bf17ad7fb0ac7403a34cf\n", " Stored in directory: /home/ec2-user/.cache/pip/wheels/ca/e3/ed/e78fecf6fd34349114d292242a16fc08d513fb32c2d9c5d786\n", "Successfully built SudachiDict-core\n", "Installing collected packages: wasabi, SudachiPy, plac, cymem, typing-extensions, typer, SudachiDict-core, spacy-loggers, spacy-legacy, smart-open, murmurhash, langcodes, catalogue, blis, srsly, pydantic, preshed, pathy, confection, thinc, spacy, ginza, ja-ginza\n", " Attempting uninstall: typing-extensions\n", " Found existing installation: typing_extensions 4.0.0\n", " Uninstalling typing_extensions-4.0.0:\n", " Successfully uninstalled typing_extensions-4.0.0\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "aiobotocore 2.0.1 requires botocore<1.22.9,>=1.22.8, but you have botocore 1.24.19 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed SudachiDict-core-20221021 SudachiPy-0.6.6 blis-0.7.9 catalogue-2.0.8 confection-0.0.3 cymem-2.0.7 ginza-5.1.2 ja-ginza-5.1.2 langcodes-3.3.0 murmurhash-1.0.9 pathy-0.10.0 plac-1.3.5 preshed-3.0.8 pydantic-1.10.2 smart-open-5.2.1 spacy-3.4.3 spacy-legacy-3.0.10 spacy-loggers-1.0.3 srsly-2.4.5 thinc-8.1.5 typer-0.7.0 typing-extensions-4.4.0 wasabi-0.10.1\n", "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.\n", "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", "\u001b[0mLooking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com\n", "Requirement already satisfied: awscli in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (1.27.2)\n", "Collecting awscli\n", " Using cached awscli-1.27.19-py3-none-any.whl (3.9 MB)\n", "Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from awscli) (0.6.0)\n", "Requirement already satisfied: rsa<4.8,>=3.1.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from awscli) (4.7.2)\n", "Requirement already satisfied: docutils<0.17,>=0.10 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from awscli) (0.15.2)\n", "Collecting botocore==1.29.19\n", " Using cached botocore-1.29.19-py3-none-any.whl (10.1 MB)\n", "Requirement already satisfied: PyYAML<5.5,>=3.10 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from awscli) (5.4.1)\n", "Requirement already satisfied: colorama<0.4.5,>=0.2.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from awscli) (0.4.3)\n", "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from botocore==1.29.19->awscli) (1.26.8)\n", "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from botocore==1.29.19->awscli) (2.8.2)\n", "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from botocore==1.29.19->awscli) (0.10.0)\n", "Requirement already satisfied: pyasn1>=0.1.3 in 
/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from rsa<4.8,>=3.1.2->awscli) (0.4.8)\n", "Requirement already satisfied: six>=1.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from python-dateutil<3.0.0,>=2.1->botocore==1.29.19->awscli) (1.16.0)\n", "Installing collected packages: botocore, awscli\n", " Attempting uninstall: botocore\n", " Found existing installation: botocore 1.24.19\n", " Uninstalling botocore-1.24.19:\n", " Successfully uninstalled botocore-1.24.19\n", " Attempting uninstall: awscli\n", " Found existing installation: awscli 1.27.2\n", " Uninstalling awscli-1.27.2:\n", " Successfully uninstalled awscli-1.27.2\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "aiobotocore 2.0.1 requires botocore<1.22.9,>=1.22.8, but you have botocore 1.29.19 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed awscli-1.27.19 botocore-1.29.19\n", "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.\n", "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", "\u001b[0mLooking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com\n", "Requirement already satisfied: boto3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (1.26.2)\n", "Collecting boto3\n", " Using cached boto3-1.26.19-py3-none-any.whl (132 kB)\n", "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from boto3) (0.10.0)\n", "Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from boto3) (0.6.0)\n", "Requirement already satisfied: botocore<1.30.0,>=1.29.19 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from boto3) (1.29.19)\n", "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from botocore<1.30.0,>=1.29.19->boto3) (2.8.2)\n", "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from botocore<1.30.0,>=1.29.19->boto3) (1.26.8)\n", "Requirement already satisfied: six>=1.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.30.0,>=1.29.19->boto3) (1.16.0)\n", "Installing collected packages: boto3\n", " Attempting uninstall: boto3\n", " Found existing installation: boto3 1.26.2\n", " Uninstalling boto3-1.26.2:\n", " Successfully uninstalled boto3-1.26.2\n", "Successfully installed boto3-1.26.19\n", "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.\n", "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", "\u001b[0m" ] } ], "source": [ "!pip install -U ginza ja-ginza\n", "!pip install -U awscli\n", "!pip install -U boto3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. 
データの取得\n", "- データをCSV形式で取得します。\n", "- データはテキストフィールドが1つ、日付フィールドが1つと、複数の定型フィールドがあることが前提となります。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com\n", "Requirement already satisfied: beautifulsoup4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (4.10.0)\n", "Requirement already satisfied: soupsieve>1.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from beautifulsoup4) (2.3)\n", "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.\n", "You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", "\u001b[0m" ] } ], "source": [ "!pip install beautifulsoup4" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File downloaded: data/amazon_review.tsv.gz\n", "File uznipped: data/amazon_review.tsv\n" ] } ], "source": [ "import urllib.request\n", "import os\n", "import gzip\n", "import shutil\n", "\n", "download_url = \"https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz\" \n", "dir_name = \"data\"\n", "file_name = \"amazon_review.tsv.gz\"\n", "tsv_file_name = \"amazon_review.tsv\"\n", "file_path = os.path.join(dir_name,file_name)\n", "tsv_file_path = os.path.join(dir_name,tsv_file_name)\n", "\n", "os.makedirs(dir_name, exist_ok=True)\n", "\n", "if os.path.exists(file_path):\n", " print(\"File {} already exists. Skipped download.\".format(file_name))\n", "else:\n", " urllib.request.urlretrieve(download_url, file_path)\n", " print(\"File downloaded: {}\".format(file_path))\n", " \n", "if os.path.exists(tsv_file_path):\n", " print(\"File {} already exists. Skipped unzip.\".format(tsv_file_name))\n", "else:\n", " with gzip.open(file_path, mode='rb') as fin:\n", " with open(tsv_file_path, 'wb') as fout:\n", " shutil.copyfileobj(fin, fout)\n", " print(\"File uznipped: {}\".format(tsv_file_path))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
marketplacecustomer_idreview_idproduct_idproduct_parentproduct_titleproduct_categorystar_ratinghelpful_votestotal_votesvineverified_purchasereview_headlinereview_bodyreview_date
0JP65317R33RSUD4ZTRKT7B000001GBJ957145596SONGS FROM A SECRET GARDEMusic1115NY残念ながら…残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…2012-12-05
1JP65317R2U1VB8GPZBBEHB000YPWBQ2904244932鏡の中の鏡‾ペルト作品集(SACD)(Arvo Part:Spiegel im Spiegel)Music1420NY残念ながら…残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない…2012-12-05
2JP65696R1IBRCJPPGWVJWB0002E5O9G108978277Les Miserables 10th Anniversary ConcertMusic523NYドリームキャスト素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。2013-03-02
3JP67162RL02CW5XLYONUB00004SRJ5606528497It Takes a Nation of Millions to Hold Us BackMusic569NYやっぱりマスト専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが...2013-08-11
4JP67701R2LA2SS3HU3A3LB0093H8H8I509738390Intel CPU Core I3-3225 3.3GHz 3MBキャッシュ LGA1155...PC424NYコスパ的には十分今までの環境(Core2 Duo E4600)に比べれば十分に快適になりました。<br />...2013-02-10
\n", "
" ], "text/plain": [ " marketplace customer_id review_id product_id product_parent \\\n", "0 JP 65317 R33RSUD4ZTRKT7 B000001GBJ 957145596 \n", "1 JP 65317 R2U1VB8GPZBBEH B000YPWBQ2 904244932 \n", "2 JP 65696 R1IBRCJPPGWVJW B0002E5O9G 108978277 \n", "3 JP 67162 RL02CW5XLYONU B00004SRJ5 606528497 \n", "4 JP 67701 R2LA2SS3HU3A3L B0093H8H8I 509738390 \n", "\n", " product_title product_category \\\n", "0 SONGS FROM A SECRET GARDE Music \n", "1 鏡の中の鏡‾ペルト作品集(SACD)(Arvo Part:Spiegel im Spiegel) Music \n", "2 Les Miserables 10th Anniversary Concert Music \n", "3 It Takes a Nation of Millions to Hold Us Back Music \n", "4 Intel CPU Core I3-3225 3.3GHz 3MBキャッシュ LGA1155... PC \n", "\n", " star_rating helpful_votes total_votes vine verified_purchase \\\n", "0 1 1 15 N Y \n", "1 1 4 20 N Y \n", "2 5 2 3 N Y \n", "3 5 6 9 N Y \n", "4 4 2 4 N Y \n", "\n", " review_headline review_body \\\n", "0 残念ながら… 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… \n", "1 残念ながら… 残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない… \n", "2 ドリームキャスト 素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。 \n", "3 やっぱりマスト 専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが... \n", "4 コスパ的には十分 今までの環境(Core2 Duo E4600)に比べれば十分に快適になりました。
... \n", "\n", " review_date \n", "0 2012-12-05 \n", "1 2012-12-05 \n", "2 2013-03-02 \n", "3 2013-08-11 \n", "4 2013-02-10 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(tsv_file_path, sep ='\\t')\n", "df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "レビューコメントからHTMLタグを除去する" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "\n", "def filterHtmlTag(txt):\n", " soup = BeautifulSoup(txt, 'html.parser')\n", " txt = soup.get_text(strip=True)\n", " \n", " return txt" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/bs4/__init__.py:431: MarkupResemblesLocatorWarning: \"http://www.amazon.co.jp/gp/product/B0040NOWY8/ref=cm_cr_ryp_prd_ttl_sol_3\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", " warnings.warn(\n" ] } ], "source": [ "df['review_body'] = df['review_body'].map(filterHtmlTag)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. 初期設定項目\n", "以下の項目を指定します。\n", "- project_name:適当な名前\n", "- timestamp_field: 日付フィールド\n", "- structured_fields: 定型フィールド\n", "- text_field: テキストフィールド" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import sagemaker\n", "\n", "sagemaker_session = sagemaker.Session()\n", "bucket = sagemaker_session.default_bucket()\n", "role = sagemaker.get_execution_role()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "project_name = 'amazon_review'\n", "timestamp_field = 'review_date'\n", "structured_fields = ['product_id', 'product_parent','product_title', 'product_category', 'star_rating']\n", "text_field = 'review_body'" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "num = pd.RangeIndex(start=0, stop=len(df.index), step=1)\n", "df['id'] = num\n", "all_fields = ['id'] + [timestamp_field] + [text_field] + structured_fields " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtstxtcol0col1col2col3col4
002012-12-05残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…B000001GBJ957145596SONGS FROM A SECRET GARDEMusic1
112012-12-05残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない…B000YPWBQ2904244932鏡の中の鏡‾ペルト作品集(SACD)(Arvo Part:Spiegel im Spiegel)Music1
222013-03-02素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。B0002E5O9G108978277Les Miserables 10th Anniversary ConcertMusic5
332013-08-11専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが...B00004SRJ5606528497It Takes a Nation of Millions to Hold Us BackMusic5
442013-02-10今までの環境(Core2 Duo E4600)に比べれば十分に快適になりました。動画のエンコ...B0093H8H8I509738390Intel CPU Core I3-3225 3.3GHz 3MBキャッシュ LGA1155...PC4
\n", "
" ], "text/plain": [ " id ts txt \\\n", "0 0 2012-12-05 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… \n", "1 1 2012-12-05 残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない… \n", "2 2 2013-03-02 素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。 \n", "3 3 2013-08-11 専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが... \n", "4 4 2013-02-10 今までの環境(Core2 Duo E4600)に比べれば十分に快適になりました。動画のエンコ... \n", "\n", " col0 col1 col2 \\\n", "0 B000001GBJ 957145596 SONGS FROM A SECRET GARDE \n", "1 B000YPWBQ2 904244932 鏡の中の鏡‾ペルト作品集(SACD)(Arvo Part:Spiegel im Spiegel) \n", "2 B0002E5O9G 108978277 Les Miserables 10th Anniversary Concert \n", "3 B00004SRJ5 606528497 It Takes a Nation of Millions to Hold Us Back \n", "4 B0093H8H8I 509738390 Intel CPU Core I3-3225 3.3GHz 3MBキャッシュ LGA1155... \n", "\n", " col3 col4 \n", "0 Music 1 \n", "1 Music 1 \n", "2 Music 5 \n", "3 Music 5 \n", "4 PC 4 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = df.loc[:, all_fields]\n", "df1.columns = ['id','ts', 'txt'] + list(map(lambda x: f'col{str(x)}',list(range(len(structured_fields)))))\n", "df1.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "delete: s3://sagemaker-ap-northeast-1-797821601610/amazon_review/input/input.csv\n", "delete: s3://sagemaker-ap-northeast-1-797821601610/amazon_review/output/output.csv\n" ] } ], "source": [ "if(os.path.isdir('input/') == True):\n", " shutil.rmtree('input/')\n", "if(os.path.isdir('output/') == True):\n", " shutil.rmtree('output/')\n", "if(os.path.isdir('code/') == True):\n", " shutil.rmtree('code/')\n", "\n", "!aws s3 rm s3://{bucket}/{project_name}/input --recursive\n", "!aws s3 rm s3://{bucket}/{project_name}/output --recursive" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. ワード、係り受けの抽出\n", "3.1, 3.2 のどちらかを実行して、ワード、係り受けを抽出します。\n", "- 3.1 実行中のNotebookインスタンスで処理を実行します(データが少ない場合)\n", "- 3.2 SageMaker Processingで処理を実行します(データが多い場合)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3.1. 
Notebookインスタンスで実行" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import spacy\n", "nlp = spacy.load('ja_ginza')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "df2 = df1[:20000]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "upload: input/input.csv to s3://sagemaker-ap-northeast-1-797821601610/amazon_review/input/input.csv\n" ] } ], "source": [ "os.makedirs('input', exist_ok=True)\n", "df2.to_csv(f'input/input.csv', index=False)\n", "!aws s3 cp ./input s3://{bucket}/{project_name}/input --recursive" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 13min 46s, sys: 3.7 s, total: 13min 49s\n", "Wall time: 13min 50s\n" ] } ], "source": [ "%%time\n", "pos_id = []\n", "pos_token_no = []\n", "pos_word = []\n", "pos_pos = []\n", "\n", "dep_id = []\n", "dep_token_no_1 = []\n", "dep_token_no_2 = []\n", "dep_words_pair = []\n", "dep_dep = []\n", "\n", "deptypes = {'advmod':'副詞修飾子', 'amod':'形容詞修飾子', 'nmod': '名詞修飾子', 'nsubj':'主語名詞'}\n", "\n", "for index, row in df2.iterrows():\n", " doc = nlp(row['txt'])\n", " id = row['id']\n", " for sent in doc.sents:\n", " for token in sent:\n", " lemma = token.lemma_\n", " pos = token.tag_.split('-')[0]\n", " dep = token.dep_\n", " \n", " if pos in ('名詞','動詞','形容詞','副詞'):\n", " pos_id += [id]\n", " pos_token_no += [token.i]\n", " pos_word += [lemma]\n", " pos_pos += [pos]\n", "\n", " if dep in deptypes.keys():\n", " dep_id += [id]\n", " dep_token_no_1 += [token.i]\n", " dep_token_no_2 += [token.head.i]\n", " dep_words_pair += [token.lemma_+' - '+token.head.lemma_]\n", " dep_dep += [dep]\n", "\n", "df_pos = pd.DataFrame(\n", " data = {'id':pos_id, 'token_no':pos_token_no, 'word':pos_word, 'pos':pos_pos},\n", " columns= ['id','token_no', 'word', 'pos']\n", ") \n", "\n", "df_dep = pd.DataFrame(\n", " data = {'id':dep_id, 'token_no_1':dep_token_no_1, 'token_no_2':dep_token_no_2, 'words_pair':dep_words_pair, 'dep':dep_dep},\n", " columns= ['id', 'token_no_1', 'token_no_2', 'words_pair', 'dep']\n", ") " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### データのマージ" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtstxtcol0col1col2col3col4token_nowordposwords_pairdepvariable
002012-12-05残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…B000001GBJ957145596SONGS FROM A SECRET GARDEMusic13趣味名詞NaNNaNNaN
102012-12-05残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…B000001GBJ957145596SONGS FROM A SECRET GARDEMusic16ある動詞NaNNaNNaN
202012-12-05残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…B000001GBJ957145596SONGS FROM A SECRET GARDEMusic112ケルト名詞NaNNaNNaN
302012-12-05残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…B000001GBJ957145596SONGS FROM A SECRET GARDEMusic113音楽名詞音楽 - 範疇nmodtoken_no_1
402012-12-05残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…B000001GBJ957145596SONGS FROM A SECRET GARDEMusic115範疇名詞音楽 - 範疇nmodtoken_no_2
\n", "
" ], "text/plain": [ " id ts txt col0 \\\n", "0 0 2012-12-05 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… B000001GBJ \n", "1 0 2012-12-05 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… B000001GBJ \n", "2 0 2012-12-05 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… B000001GBJ \n", "3 0 2012-12-05 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… B000001GBJ \n", "4 0 2012-12-05 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… B000001GBJ \n", "\n", " col1 col2 col3 col4 token_no word pos \\\n", "0 957145596 SONGS FROM A SECRET GARDE Music 1 3 趣味 名詞 \n", "1 957145596 SONGS FROM A SECRET GARDE Music 1 6 ある 動詞 \n", "2 957145596 SONGS FROM A SECRET GARDE Music 1 12 ケルト 名詞 \n", "3 957145596 SONGS FROM A SECRET GARDE Music 1 13 音楽 名詞 \n", "4 957145596 SONGS FROM A SECRET GARDE Music 1 15 範疇 名詞 \n", "\n", " words_pair dep variable \n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 音楽 - 範疇 nmod token_no_1 \n", "4 音楽 - 範疇 nmod token_no_2 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_dep2 = pd.melt(df_dep, id_vars=['id', 'words_pair','dep'], value_vars=['token_no_1','token_no_2'], value_name='token_no' )\n", "df_pos_dep = pd.merge(df_pos, df_dep2, how='left', on=['id','token_no'])\n", "df3 = pd.merge(df2, df_pos_dep, how='right', on=['id'])\n", "df3.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "upload: output/output.csv to s3://sagemaker-ap-northeast-1-797821601610/amazon_review/output/output.csv\n" ] } ], "source": [ "os.makedirs('output', exist_ok=True)\n", "df3.to_csv('output/output.csv', index=False)\n", "!aws s3 cp ./output s3://{bucket}/{project_name}/output --recursive" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3.2. 
SageMaker Processingで実行" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import os\n", "import boto3\n", "import datetime\n", "import sagemaker\n", "from sagemaker import get_execution_role\n", "from sagemaker.sklearn.processing import SKLearnProcessor\n", "from sagemaker.processing import FrameworkProcessor\n", "\n", "region = sagemaker.Session().boto_region_name\n", "role = get_execution_role()\n", "\n", "est_cls = sagemaker.sklearn.estimator.SKLearn\n", "framework_version_str = \"0.20.0\"\n", "base_job_name='job' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')\n", "\n", "script_processor = FrameworkProcessor(\n", " role=role,\n", " instance_count=1,\n", " instance_type=\"ml.m5.xlarge\",\n", " estimator_cls=est_cls,\n", " framework_version=framework_version_str,\n", " code_location = f's3://{bucket}/{project_name}/code',\n", " base_job_name=base_job_name\n", ")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "df2 = df1[:260000]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "k = 40000 #各CSVの行数\n", "n = df2.shape[0]\n", "dfs = [df2.loc[i:i+k-1, :] for i in range(0, n, k)]\n", "\n", "os.makedirs('input', exist_ok=True)\n", "for i,df_i in enumerate(dfs):\n", " df_i.to_csv(f'input/input_{i}.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "upload: input/input_0.csv to s3://sagemaker-ap-northeast-1-797821601610/amazon_review/input/input_0.csv\n", "upload: input/input.csv to s3://sagemaker-ap-northeast-1-797821601610/amazon_review/input/input.csv\n", "upload: input/input_1.csv to s3://sagemaker-ap-northeast-1-797821601610/amazon_review/input/input_1.csv\n", "upload: input/input_2.csv to s3://sagemaker-ap-northeast-1-797821601610/amazon_review/input/input_2.csv\n", "upload: input/input_3.csv to s3://sagemaker-ap-northeast-1-797821601610/amazon_review/input/input_3.csv\n", "upload: input/input_6.csv to s3://sagemaker-ap-northeast-1-797821601610/amazon_review/input/input_6.csv\n", "upload: input/input_4.csv to s3://sagemaker-ap-northeast-1-797821601610/amazon_review/input/input_4.csv\n", "upload: input/input_5.csv to s3://sagemaker-ap-northeast-1-797821601610/amazon_review/input/input_5.csv\n", "delete: s3://sagemaker-ap-northeast-1-797821601610/amazon_review/output/output.csv\n" ] } ], "source": [ "!aws s3 cp ./input s3://{bucket}/{project_name}/input --recursive\n", "!aws s3 rm s3://{bucket}/{project_name}/output --recursive" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "os.makedirs('code', exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing code/preprocessing.py\n" ] } ], "source": [ "%%writefile code/preprocessing.py\n", "import pandas as pd\n", "import os\n", "import pandas as pd\n", "import spacy\n", "import argparse\n", "nlp = spacy.load('ja_ginza')\n", "\n", "parser = argparse.ArgumentParser()\n", "parser.add_argument('--sequence_num') \n", "args = parser.parse_args()\n", "sequence_num = args.sequence_num\n", "input_data_path = f\"/opt/ml/processing/input/input_{sequence_num}.csv\"\n", "df2 = pd.read_csv(input_data_path)\n", "\n", "pos_id = []\n", "pos_token_no = []\n", "pos_word = []\n", "pos_pos = []\n", "\n", "dep_id = []\n", "dep_token_no_1 = []\n", "dep_token_no_2 = []\n", 
"dep_words_pair = []\n", "dep_dep = []\n", "\n", "deptypes = {'advmod':'副詞修飾子', 'amod':'形容詞修飾子', 'nmod': '名詞修飾子', 'nsubj':'主語名詞'}\n", "text_field = 'txt'\n", "\n", "for index, row in df2.iterrows():\n", " doc = nlp(str(row[text_field]))\n", " id = row['id']\n", " for sent in doc.sents:\n", " for token in sent:\n", " lemma = token.lemma_\n", " pos = token.tag_.split('-')[0]\n", " dep = token.dep_\n", " \n", " if pos in ('名詞','動詞','形容詞','副詞'):\n", " pos_id += [id]\n", " pos_token_no += [token.i]\n", " pos_word += [lemma]\n", " pos_pos += [pos]\n", "\n", " if dep in deptypes.keys():\n", " dep_id += [id]\n", " dep_token_no_1 += [token.i]\n", " dep_token_no_2 += [token.head.i]\n", " dep_words_pair += [token.lemma_+' - '+token.head.lemma_]\n", " #dep_dep += [deptypes[dep]]\n", " dep_dep += [dep]\n", "\n", "df_pos = pd.DataFrame(\n", " data = {'id':pos_id, 'token_no':pos_token_no, 'word':pos_word, 'pos':pos_pos},\n", " columns= ['id','token_no', 'word', 'pos']\n", ") \n", "\n", "df_dep = pd.DataFrame(\n", " data = {'id':dep_id, 'token_no_1':dep_token_no_1, 'token_no_2':dep_token_no_2, 'words_pair':dep_words_pair, 'dep':dep_dep},\n", " columns= ['id', 'token_no_1', 'token_no_2', 'words_pair', 'dep']\n", ") \n", "\n", "df_dep2 = pd.melt(df_dep, id_vars=['id', 'words_pair','dep'], value_vars=['token_no_1','token_no_2'], value_name='token_no' )\n", "df_pos_dep = pd.merge(df_pos, df_dep2, how='left', on=['id','token_no'])\n", "df2 = pd.merge(df2, df_pos_dep, how='right', on=['id'])\n", "df2.to_csv(f'/opt/ml/processing/output/output_{sequence_num}.csv', index=False)\n", "\n", "print(\"Completed running the processing job\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing code/requirements.txt\n" ] } ], "source": [ "%%writefile code/requirements.txt \n", "ginza\n", "ja-ginza" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "%%capture output\n", "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", "\n", "for i in range(len(dfs)):\n", " script_processor.run(\n", " code=\"preprocessing.py\",\n", " source_dir=\"code\",\n", " inputs=[ProcessingInput(source=f's3://{bucket}/{project_name}/input/input_{i}.csv', destination=\"/opt/ml/processing/input\")],\n", " outputs=[ProcessingOutput(source=\"/opt/ml/processing/output\", destination=f's3://{bucket}/{project_name}/output/')],\n", " arguments=['--sequence_num', str(i)],\n", " wait=False\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### ジョブの実行待ち\n", "以下のセルを実行し、ProcessingJobStatusがCompletedになるまで待ちます" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ProcessingJobNameProcessingJobArnCreationTimeProcessingEndTimeLastModifiedTimeProcessingJobStatus
0job20221130105714-2022-11-30-10-57-27-940arn:aws:sagemaker:ap-northeast-1:797821601610:...2022-11-30 10:57:28.968000+00:002022-11-30 11:29:36.669000+00:002022-11-30 11:29:37.040000+00:00Completed
1job20221130105714-2022-11-30-10-57-26-802arn:aws:sagemaker:ap-northeast-1:797821601610:...2022-11-30 10:57:27.780000+00:002022-11-30 11:55:39.292000+00:002022-11-30 11:55:39.653000+00:00Completed
2job20221130105714-2022-11-30-10-57-25-541arn:aws:sagemaker:ap-northeast-1:797821601610:...2022-11-30 10:57:26.634000+00:002022-11-30 11:51:51.534000+00:002022-11-30 11:51:52.082000+00:00Completed
3job20221130105714-2022-11-30-10-57-25-000arn:aws:sagemaker:ap-northeast-1:797821601610:...2022-11-30 10:57:25.368000+00:002022-11-30 11:48:22.984000+00:002022-11-30 11:48:23.376000+00:00Completed
4job20221130105714-2022-11-30-10-57-23-829arn:aws:sagemaker:ap-northeast-1:797821601610:...2022-11-30 10:57:24.890000+00:002022-11-30 11:51:01.282000+00:002022-11-30 11:51:01.816000+00:00Completed
5job20221130105714-2022-11-30-10-57-21-848arn:aws:sagemaker:ap-northeast-1:797821601610:...2022-11-30 10:57:23.631000+00:002022-11-30 11:41:58.795000+00:002022-11-30 11:41:59.304000+00:00Completed
6job20221130105714-2022-11-30-10-57-21-103arn:aws:sagemaker:ap-northeast-1:797821601610:...2022-11-30 10:57:21.681000+00:002022-11-30 11:35:00.804000+00:002022-11-30 11:35:01.425000+00:00Completed
\n", "
" ], "text/plain": [ " ProcessingJobName \\\n", "0 job20221130105714-2022-11-30-10-57-27-940 \n", "1 job20221130105714-2022-11-30-10-57-26-802 \n", "2 job20221130105714-2022-11-30-10-57-25-541 \n", "3 job20221130105714-2022-11-30-10-57-25-000 \n", "4 job20221130105714-2022-11-30-10-57-23-829 \n", "5 job20221130105714-2022-11-30-10-57-21-848 \n", "6 job20221130105714-2022-11-30-10-57-21-103 \n", "\n", " ProcessingJobArn \\\n", "0 arn:aws:sagemaker:ap-northeast-1:797821601610:... \n", "1 arn:aws:sagemaker:ap-northeast-1:797821601610:... \n", "2 arn:aws:sagemaker:ap-northeast-1:797821601610:... \n", "3 arn:aws:sagemaker:ap-northeast-1:797821601610:... \n", "4 arn:aws:sagemaker:ap-northeast-1:797821601610:... \n", "5 arn:aws:sagemaker:ap-northeast-1:797821601610:... \n", "6 arn:aws:sagemaker:ap-northeast-1:797821601610:... \n", "\n", " CreationTime ProcessingEndTime \\\n", "0 2022-11-30 10:57:28.968000+00:00 2022-11-30 11:29:36.669000+00:00 \n", "1 2022-11-30 10:57:27.780000+00:00 2022-11-30 11:55:39.292000+00:00 \n", "2 2022-11-30 10:57:26.634000+00:00 2022-11-30 11:51:51.534000+00:00 \n", "3 2022-11-30 10:57:25.368000+00:00 2022-11-30 11:48:22.984000+00:00 \n", "4 2022-11-30 10:57:24.890000+00:00 2022-11-30 11:51:01.282000+00:00 \n", "5 2022-11-30 10:57:23.631000+00:00 2022-11-30 11:41:58.795000+00:00 \n", "6 2022-11-30 10:57:21.681000+00:00 2022-11-30 11:35:00.804000+00:00 \n", "\n", " LastModifiedTime ProcessingJobStatus \n", "0 2022-11-30 11:29:37.040000+00:00 Completed \n", "1 2022-11-30 11:55:39.653000+00:00 Completed \n", "2 2022-11-30 11:51:52.082000+00:00 Completed \n", "3 2022-11-30 11:48:23.376000+00:00 Completed \n", "4 2022-11-30 11:51:01.816000+00:00 Completed \n", "5 2022-11-30 11:41:59.304000+00:00 Completed \n", "6 2022-11-30 11:35:01.425000+00:00 Completed " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sm = boto3.Session().client('sagemaker')\n", "jobs = sm.list_processing_jobs(NameContains=base_job_name)\n", "pd.DataFrame(jobs['ProcessingJobSummaries'])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "download: s3://sagemaker-ap-northeast-1-797821601610/amazon_review/output/output_0.csv to output/output_0.csv\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtstxtcol0col1col2col3col4token_nowordposwords_pairdepvariable
002012-12-05残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…B000001GBJ957145596SONGS FROM A SECRET GARDEMusic13趣味名詞NaNNaNNaN
102012-12-05残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…B000001GBJ957145596SONGS FROM A SECRET GARDEMusic16ある動詞NaNNaNNaN
202012-12-05残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…B000001GBJ957145596SONGS FROM A SECRET GARDEMusic112ケルト名詞NaNNaNNaN
302012-12-05残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…B000001GBJ957145596SONGS FROM A SECRET GARDEMusic113音楽名詞音楽 - 範疇nmodtoken_no_1
402012-12-05残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…B000001GBJ957145596SONGS FROM A SECRET GARDEMusic115範疇名詞音楽 - 範疇nmodtoken_no_2
\n", "
" ], "text/plain": [ " id ts txt col0 \\\n", "0 0 2012-12-05 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… B000001GBJ \n", "1 0 2012-12-05 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… B000001GBJ \n", "2 0 2012-12-05 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… B000001GBJ \n", "3 0 2012-12-05 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… B000001GBJ \n", "4 0 2012-12-05 残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね… B000001GBJ \n", "\n", " col1 col2 col3 col4 token_no word pos \\\n", "0 957145596 SONGS FROM A SECRET GARDE Music 1 3 趣味 名詞 \n", "1 957145596 SONGS FROM A SECRET GARDE Music 1 6 ある 動詞 \n", "2 957145596 SONGS FROM A SECRET GARDE Music 1 12 ケルト 名詞 \n", "3 957145596 SONGS FROM A SECRET GARDE Music 1 13 音楽 名詞 \n", "4 957145596 SONGS FROM A SECRET GARDE Music 1 15 範疇 名詞 \n", "\n", " words_pair dep variable \n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 音楽 - 範疇 nmod token_no_1 \n", "4 音楽 - 範疇 nmod token_no_2 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.makedirs('output', exist_ok=True)\n", "!aws s3 cp s3://{bucket}/{project_name}/output/output_0.csv ./output\n", "df3 = pd.read_csv(\"output/output_0.csv\")\n", "df3.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. QuickSightデータセットの作成" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Amazon QuickSight のサインアップ\n", "- QuickSight に初めてアクセスする際には「Sing up for QuickSight」からサインアップを実施します。\n", "- リソースへのアクセス権限の設定では、S3 バケット「sagemaker-リージョン名-アカウント名」を許可します。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### IAM 権限設定\n", "\n", "- SageMakerノートブックからQuickSightの操作を行うために 「IAM」-> 「ポリシー」->「ポリシーの作成」から以下のポリシー(sagemaker-quicksight-policy)を作成します\n", "\n", "```json\n", "{\n", " \"Version\": \"2012-10-17\",\n", " \"Statement\": [\n", " {\n", " \"Effect\": \"Allow\",\n", " \"Action\": [\n", " \"quicksight:CreateAnalysis\",\n", " \"quicksight:PassDataSet\",\n", " \"quicksight:CreateDataSet\",\n", " \"quicksight:PassDataSource\",\n", " \"quicksight:CreateDataSource\",\n", " \"quicksight:DescribeTemplate\"\n", " ],\n", " \"Resource\": \"*\"\n", " }\n", " ]\n", "}\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 作成したポリシーを SageMaker ノートブックの IAM ロール(AmazonSageMaker-ExecutionRole-xxxxx)にアタッチします。" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "import json\n", "import boto3\n", "import uuid\n", "\n", "quicksight = boto3.client('quicksight')\n", "account_id = boto3.client('sts').get_caller_identity().get('Account')" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "manifest = {\n", " \"fileLocations\": [\n", " {\n", " \"URIPrefixes\": [\n", " f's3://{bucket}/{project_name}/output/'\n", " ]\n", " }\n", " ]\n", "}" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "upload: ./manifest.json to s3://sagemaker-ap-northeast-1-797821601610/amazon_review/manifest/manifest.json\n" ] } ], "source": [ "with open('manifest.json', 'w') as f:\n", " json.dump(manifest, f)\n", "!aws s3 cp ./manifest.json s3://{bucket}/{project_name}/manifest/ " ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ResponseMetadata': {'RequestId': '7cbccc1e-3a73-4ab9-8ab4-036eeedd5671',\n", " 'HTTPStatusCode': 202,\n", " 'HTTPHeaders': {'date': 'Wed, 30 Nov 2022 12:27:06 GMT',\n", " 'content-type': 'application/json',\n", " 'content-length': '249',\n", " 
'connection': 'keep-alive',\n", " 'x-amzn-requestid': '7cbccc1e-3a73-4ab9-8ab4-036eeedd5671'},\n", " 'RetryAttempts': 0},\n", " 'Status': 202,\n", " 'Arn': 'arn:aws:quicksight:ap-northeast-1:797821601610:datasource/785e02b5-a706-4afa-bc94-bd007faa949a',\n", " 'DataSourceId': '785e02b5-a706-4afa-bc94-bd007faa949a',\n", " 'CreationStatus': 'CREATION_IN_PROGRESS',\n", " 'RequestId': '7cbccc1e-3a73-4ab9-8ab4-036eeedd5671'}" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "response = quicksight.create_data_source(\n", " AwsAccountId=account_id,\n", " DataSourceId=str(uuid.uuid4()),\n", " Name=project_name,\n", " Type='S3',\n", " DataSourceParameters={\n", " 'S3Parameters': {\n", " 'ManifestFileLocation': {\n", " 'Bucket': bucket,\n", " 'Key': f'{project_name}/manifest/manifest.json'\n", " }\n", " }\n", " }\n", ")\n", "\n", "data_source_arn = response['Arn']\n", "response" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "以下のセルの quicksight_username、quicksight_region、date_format を指定します。quicksight_username は QuickSight 画面右上のメニューから Username を確認します。" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ResponseMetadata': {'RequestId': '724dde7a-97a6-4c5a-b8a9-8302759c741f',\n", " 'HTTPStatusCode': 201,\n", " 'HTTPHeaders': {'date': 'Wed, 30 Nov 2022 12:27:32 GMT',\n", " 'content-type': 'application/json',\n", " 'content-length': '412',\n", " 'connection': 'keep-alive',\n", " 'x-amzn-requestid': '724dde7a-97a6-4c5a-b8a9-8302759c741f'},\n", " 'RetryAttempts': 0},\n", " 'Status': 201,\n", " 'Arn': 'arn:aws:quicksight:ap-northeast-1:797821601610:dataset/7fa11e12-c798-478e-ad69-027060dbc633',\n", " 'DataSetId': '7fa11e12-c798-478e-ad69-027060dbc633',\n", " 'IngestionArn': 'arn:aws:quicksight:ap-northeast-1:797821601610:dataset/7fa11e12-c798-478e-ad69-027060dbc633/ingestion/f4fa27e7-df45-4442-9a95-e4bd2fb64eec',\n", " 'IngestionId': 'f4fa27e7-df45-4442-9a95-e4bd2fb64eec',\n", " 'RequestId': '724dde7a-97a6-4c5a-b8a9-8302759c741f'}" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "quicksight_username = 'XXXX'\n", "quicksight_region = 'ap-northeast-1'\n", "date_format = 'yyyy-MM-dd'\n", "\n", "response = quicksight.create_data_set(\n", " AwsAccountId=account_id,\n", " DataSetId=str(uuid.uuid4()),\n", " Name=project_name,\n", " PhysicalTableMap={\n", " 'phsicalTable': {\n", " 'S3Source': {\n", " 'DataSourceArn': data_source_arn,\n", " 'InputColumns': list(map(lambda x: {'Name': x, 'Type': 'STRING'}, df3.columns))\n", " }\n", " }\n", " },\n", " LogicalTableMap={\n", " 'string': {\n", " 'Alias': project_name,\n", " 'DataTransforms': [\n", " {\n", " 'CastColumnTypeOperation': {\n", " 'ColumnName': 'ts',\n", " 'NewColumnType': 'DATETIME',\n", " 'Format': date_format\n", " }\n", " },\n", " {\n", " 'CastColumnTypeOperation': {\n", " 'ColumnName': 'id',\n", " 'NewColumnType': 'INTEGER'\n", " }\n", " },\n", " ],\n", " 'Source': {\n", " 'PhysicalTableId': 'phsicalTable'\n", " }\n", " }\n", " },\n", " ImportMode='SPICE',\n", " Permissions=[\n", " {\n", " 'Principal': f'arn:aws:quicksight:{quicksight_region}:{account_id}:user/default/{quicksight_username}',\n", " 'Actions': [\n", " 'quicksight:PassDataSet',\n", " 'quicksight:DescribeIngestion',\n", " 'quicksight:CreateIngestion',\n", " 'quicksight:UpdateDataSet',\n", " 'quicksight:DeleteDataSet',\n", " 'quicksight:DescribeDataSet',\n", " 'quicksight:CancelIngestion',\n", " 
'quicksight:DescribeDataSetPermissions',\n", " 'quicksight:ListIngestions',\n", " 'quicksight:UpdateDataSetPermissions'\n", " ]\n", " },\n", " ]\n", ")\n", "\n", "data_set_arn = response['Arn']\n", "response" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. QuickSight分析の作成\n", "- 別途作成済みの分析定義を元に、分析を作成します。" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "!wget -q https://raw.githubusercontent.com/aws-samples/aws-ml-jp/main/tasks/nlp/nlp_amazon_review/nlp_voc_dashboard/voc-analysis.json" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "source_str = ['$IDENTIFIER', '$DATASETARN', '$COL0', '$COL1', '$COL2', '$COL3', '$COL4']\n", "target_str = [project_name] + [data_set_arn] + structured_fields\n", "\n", "with open('voc-analysis.json') as f:\n", " voc_analysis_str = f.read()\n", " \n", "for i in range(len(source_str)):\n", " voc_analysis_str = voc_analysis_str.replace(source_str[i],target_str[i])\n", " \n", "voc_analysis_dict = json.loads(voc_analysis_str)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ResponseMetadata': {'RequestId': '421227ae-e882-4a0f-89d8-96192c36b992',\n", " 'HTTPStatusCode': 202,\n", " 'HTTPHeaders': {'date': 'Wed, 30 Nov 2022 12:27:55 GMT',\n", " 'content-type': 'application/json',\n", " 'content-length': '245',\n", " 'connection': 'keep-alive',\n", " 'x-amzn-requestid': '421227ae-e882-4a0f-89d8-96192c36b992'},\n", " 'RetryAttempts': 0},\n", " 'Status': 202,\n", " 'Arn': 'arn:aws:quicksight:ap-northeast-1:797821601610:analysis/947e497c-2733-48b4-83c2-9689ad23210e',\n", " 'AnalysisId': '947e497c-2733-48b4-83c2-9689ad23210e',\n", " 'CreationStatus': 'CREATION_IN_PROGRESS',\n", " 'RequestId': '421227ae-e882-4a0f-89d8-96192c36b992'}" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "response = quicksight.create_analysis(\n", " AwsAccountId=account_id,\n", " AnalysisId=str(uuid.uuid4()),\n", " Name=project_name,\n", " Permissions=[\n", " {\n", " 'Principal': f'arn:aws:quicksight:{quicksight_region}:{account_id}:user/default/{quicksight_username}',\n", " 'Actions': [\n", " 'quicksight:QueryAnalysis',\n", " 'quicksight:DescribeAnalysis',\n", " 'quicksight:UpdateAnalysis',\n", " 'quicksight:DeleteAnalysis',\n", " 'quicksight:RestoreAnalysis',\n", " 'quicksight:DescribeAnalysisPermissions',\n", " 'quicksight:UpdateAnalysisPermissions'\n", " ]\n", " },\n", " ],\n", " Definition = voc_analysis_dict[\"Definition\"]\n", ")\n", "response" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 4 }