"""Train an XGBoost classifier on the Wisconsin Diagnostic Breast Cancer data.

The script downloads the raw ``wdbc.data`` file from the UCI repository,
attaches column names, writes a local copy to ``data.csv``, trains an
``XGBClassifier`` on an 80/20 train/test split, prints the test accuracy and
stores the fitted model with ``joblib``.
"""

import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score

data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)

# specify columns extracted from wdbc.names
data.columns = [
    "id", "diagnosis",
    "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
    "smoothness_mean", "compactness_mean", "concavity_mean",
    "concave points_mean", "symmetry_mean", "fractal_dimension_mean",
    "radius_se", "texture_se", "perimeter_se", "area_se",
    "smoothness_se", "compactness_se", "concavity_se",
    "concave points_se", "symmetry_se", "fractal_dimension_se",
    "radius_worst", "texture_worst", "perimeter_worst", "area_worst",
    "smoothness_worst", "compactness_worst", "concavity_worst",
    "concave points_worst", "symmetry_worst", "fractal_dimension_worst",
]

# save the data (diagnosis is kept in its original "M"/"B" form on disk)
data.to_csv("data.csv", sep=',', index=False)

# print the shape of the data file
print(data.shape)

# Recent XGBoost releases require integer class labels starting at 0, so the
# string diagnosis codes are encoded explicitly: B (benign) -> 0,
# M (malignant) -> 1.
y = data["diagnosis"].map({"B": 0, "M": 1})
if y.isna().any():
    # Series.map yields NaN for values missing from the mapping; fail loudly
    # instead of passing NaN labels on to the classifier.
    unexpected = sorted(data.loc[y.isna(), "diagnosis"].astype(str).unique())
    raise ValueError("Unexpected diagnosis labels: {}".format(unexpected))
y = y.astype(int)

# The id column is dropped along with the target so it is not used as a feature.
X = data.drop(["id", "diagnosis"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

model = xgb.XGBClassifier().fit(X_train, y_train)

# make prediction
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy: %.2f%%" % (accuracy * 100.0))

# save the trained model file
model_file_name = "bc-xgboost-model"
joblib.dump(model, model_file_name)
print("Model file {} saved successfully".format(model_file_name))