{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-overview",
   "metadata": {},
   "source": [
    "# Track rating prediction with XGBoost\n",
    "\n",
    "Builds a feature table from `tracks.csv` and `ratings.csv`, fits a model through the project-local `cross_validation` helper, and reports RMSE on a held-out test split.\n",
    "\n",
    "Known caveats (left unchanged so results stay comparable):\n",
    "- The per-user 5-star profile is computed on all rows *before* the train/test split, so held-out ratings contribute to their own features.\n",
    "- Users without any 5-star rating are dropped by the inner join on `userId`.\n",
    "- `val` is created by the split but is not used by any later cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6be9e30b-ea91-46b9-ab76-f51f0ef3be8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (not !pip) installs into the environment of the running kernel.\n",
    "%pip install xgboost==1.0.1 -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "082422fd-d7de-4316-b36b-bec0252a6690",
   "metadata": {},
   "outputs": [],
   "source": [
    "from math import sqrt\n",
    "\n",
    "import pandas as pd\n",
    "import xgboost\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from my_custom_library.cross_validation_xgboost import cross_validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb9e1d61-9f86-45cb-9185-d44637998535",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dir = \"data\"\n",
    "SEED = 42  # random_state shared by both train_test_split calls\n",
    "\n",
    "df_ratings = pd.read_csv(f\"{data_dir}/ratings.csv\")\n",
    "df_tracks = pd.read_csv(f\"{data_dir}/tracks.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a3b4380-6a22-49d0-96be-d0be2d6e8827",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-hot encode the track genre into genre_* indicator columns.\n",
    "tracks_tmp = pd.get_dummies(df_tracks, columns=[\"genre\"], prefix=\"genre\")\n",
    "\n",
    "# Derived danceability score: fixed weighted sum of valence, liveness and energy.\n",
    "tracks_tmp[\"danceability\"] = (\n",
    "    0.3 * tracks_tmp.valence + 0.1 * tracks_tmp.liveness + 0.1 * tracks_tmp.energy\n",
    ")\n",
    "\n",
    "# Attach every rating row to the features of the rated track.\n",
    "tracks_rating = pd.merge(tracks_tmp, df_ratings, how=\"inner\", on=\"trackId\")\n",
    "\n",
    "# userId is listed only so it can serve as the groupby key below.\n",
    "num_feat_cols = [\n",
    "    \"userId\", \"energy\", \"acousticness\", \"valence\", \"speechiness\",\n",
    "    \"instrumentalness\", \"liveness\", \"tempo\", \"danceability\",\n",
    "    \"genre_Latin\", \"genre_Folk\", \"genre_Blues\", \"genre_Rap\", \"genre_Reggae\",\n",
    "    \"genre_Jazz\", \"genre_RnB\", \"genre_Country\", \"genre_Electronic\", \"genre_Pop_Rock\",\n",
    "]\n",
    "\n",
    "# Per-user mean of each feature over the tracks that user rated 5.\n",
    "# NOTE(review): computed on all rows before the split, so test rows leak into these features.\n",
    "fivestar_ratings = (\n",
    "    tracks_rating.loc[tracks_rating.Rating == 5, num_feat_cols]\n",
    "    .groupby(\"userId\")\n",
    "    .mean()\n",
    "    .add_suffix(\"_5star\")\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# Drop the event and identifier columns listed here before modelling.\n",
    "col_drop = [\"ratingEventId\", \"ts\", \"sessionId\", \"itemInSession\", \"trackId\"]\n",
    "tracks_rating = tracks_rating.drop(columns=col_drop)\n",
    "\n",
    "# Join the 5-star profile onto each rating row; the inner join drops users with no 5-star rating.\n",
    "df_output = pd.merge(\n",
    "    tracks_rating, fivestar_ratings, how=\"inner\", on=\"userId\"\n",
    ").drop(columns=[\"userId\"])\n",
    "\n",
    "# Move the target to column 0: the evaluation cell reads it by position.\n",
    "first_col = df_output.pop(\"Rating\")\n",
    "df_output.insert(0, \"Rating\", first_col)\n",
    "df_output = df_output.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a53692f-5e4d-4968-907d-6430568136f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 80% train; the remaining 20% is split 95/5 into val/test (test is about 1% of all rows).\n",
    "# A separate `holdout` name keeps this cell idempotent: re-running it never re-splits `val`.\n",
    "train, holdout = train_test_split(df_output, test_size=0.2, random_state=SEED)\n",
    "val, test = train_test_split(holdout, test_size=0.05, random_state=SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2319c367-0be2-479c-83b4-a627d95074f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reassign instead of inplace=True so the frames returned by the split are never mutated.\n",
    "train = train.reset_index(drop=True)\n",
    "test = test.reset_index(drop=True)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a510f9c4-f159-400d-be8c-d571e270fea3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): num_round travels inside this dict; confirm that cross_validation consumes it.\n",
    "hyperparameters = {\n",
    "    \"max_depth\": 5,\n",
    "    \"eta\": 0.2,\n",
    "    \"objective\": \"reg:squarederror\",\n",
    "    \"num_round\": 6,\n",
    "}\n",
    "\n",
    "K = 5\n",
    "\n",
    "# cross_validation is project-local; presumably it returns one RMSE per fold plus a fitted model - TODO confirm.\n",
    "rmse_list, model = cross_validation(train, K, hyperparameters)\n",
    "k_fold_avg = sum(rmse_list) / len(rmse_list)\n",
    "k_fold_avg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca2cd7ff-6c06-4ecb-b18b-c81dad4cd9aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column 0 is the target (Rating); every other column is a feature.\n",
    "# Slicing instead of dropping in place leaves `test` intact, so this cell can be re-run safely.\n",
    "y_test = test.iloc[:, 0].values\n",
    "X_test = xgboost.DMatrix(test.iloc[:, 1:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a703d7d-f7e3-4415-a00b-a41c50327e0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = model.predict(X_test)\n",
    "\n",
    "# RMSE on the held-out test rows.\n",
    "mse = mean_squared_error(y_test, predictions)\n",
    "rmse = sqrt(mse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d14d45ee-0cf1-4bc3-90d0-c55b1827e3f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"rmse: {rmse}\")"
   ]
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}