In [None]:
!pip install xgboost==1.0.1 -q

In [None]:
import pandas as pd
import xgboost
from sklearn.model_selection import train_test_split
from my_custom_library.cross_validation_xgboost import cross_validation
from math import sqrt
from sklearn.metrics import mean_squared_error

In [None]:
# Directory (relative to the notebook) that holds the input CSV files.
data_dir = "data"

# Read both tables in one pass; the tuple order fixes which frame gets which file.
df_ratings, df_tracks = (
    pd.read_csv(f"{data_dir}/{table}.csv") for table in ("ratings", "tracks")
)

In [None]:
# Expand the categorical `genre` column into `genre_*` indicator columns and
# append a derived `danceability` score: a fixed weighted sum of valence,
# liveness and energy.
tracks_tmp = pd.get_dummies(df_tracks, columns=["genre"], prefix="genre").assign(
    danceability=lambda t: 0.3 * t["valence"] + 0.1 * t["liveness"] + 0.1 * t["energy"]
)

# Pair every rating with the attributes of the rated track. The inner join
# keeps only trackIds that appear in both tables.
tracks_rating = tracks_tmp.merge(df_ratings, how="inner", on="trackId")

# Grouping key (`userId`) plus the numeric track features that are averaged
# per user. The genre_* names are hard-coded, so they must match the indicator
# columns present in `tracks_rating`.
num_feat_cols = [
    'userId',
    'energy', 'acousticness', 'valence', 'speechiness',
    'instrumentalness', 'liveness', 'tempo', 'danceability',
    'genre_Latin', 'genre_Folk', 'genre_Blues', 'genre_Rap', 'genre_Reggae',
    'genre_Jazz', 'genre_RnB', 'genre_Country', 'genre_Electronic', 'genre_Pop_Rock',
]

# Rows rated exactly 5, restricted to the columns listed above.
df_tmp = tracks_rating.loc[tracks_rating["Rating"] == 5, num_feat_cols]

# One row per user: the mean of each feature over that user's 5-star tracks,
# with every feature column renamed to `<feature>_5star`.
# NOTE(review): this profile is built from all ratings before the train/test
# split, so held-out 5-star rows contribute to their own user's features -
# confirm this is intended.
fivestar_ratings = (
    df_tmp
    .groupby("userId")
    .mean()
    .add_suffix("_5star")
    .reset_index()
)

# Identifier / bookkeeping columns that are removed before modelling.
col_drop = ["ratingEventId", "ts", "sessionId", "itemInSession", "trackId"]
tracks_rating = tracks_rating.drop(columns=col_drop)

# Attach each user's 5-star feature profile to every one of their ratings.
# The inner join discards users that have no 5-star rating; `userId` itself is
# dropped once it has served as the join key.
df_output = (
    tracks_rating
    .merge(fivestar_ratings, how="inner", on="userId")
    .drop(columns=["userId"])
)

# Move the target to column 0, then remove exact duplicate rows.
first_col = df_output.pop("Rating")
df_output.insert(0, "Rating", first_col)
df_output = df_output.drop_duplicates()

In [None]:
# Two-stage split with a fixed seed:
#   1. 80% of the rows become `train`, 20% are held out;
#   2. the held-out part is divided again, 95% -> `val`, 5% -> `test`
#      (roughly 19% and 1% of all rows respectively).
train, holdout = train_test_split(df_output, test_size=0.2, random_state=42)
val, test = train_test_split(holdout, test_size=0.05, random_state=42)

In [None]:
# Renumber rows from 0 in the train and test partitions, discarding the index
# labels inherited from `df_output` (`val` is left untouched).
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Preview the first training rows.
train.head()

In [None]:
# Settings handed to the project's cross-validation helper.
# NOTE(review): how `num_round` is consumed depends on
# my_custom_library.cross_validation_xgboost - confirm it is actually applied.
hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    objective="reg:squarederror",
    num_round=6,
)

# Second argument of cross_validation (presumably the number of folds).
K = 5

# The helper returns a pair: a list of scores (presumably one RMSE per fold)
# and a fitted model that later cells use for prediction.
rmse_list, model = cross_validation(train, K, hyperparameters)

# Arithmetic mean of the returned scores.
k_fold_avg = sum(rmse_list) / len(rmse_list)

In [None]:
# Column 0 of `test` is the target (`Rating` is moved to the front when
# `df_output` is built); all remaining columns are the model features.
#
# Both pieces are taken as slices instead of dropping the target column in
# place. With the in-place drop, running this cell a second time silently took
# a feature column as the target and removed another feature from the matrix.
# `test` is now left unmodified, so the cell can be re-run safely.
y_test = test.iloc[:, 0].values
X_test = xgboost.DMatrix(test.iloc[:, 1:])

In [None]:
# Score the held-out features with the model returned by cross_validation.
predictions = model.predict(X_test)

# Root-mean-squared error: square root of the mean squared difference between
# the true ratings and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
rmse = sqrt(mse)

In [None]:
# Show the root-mean-squared error stored in `rmse`.
print(f"rmse: {rmse}")