Code
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from packaging import version
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Default figure geometry: width and height (passed to "figure.figsize") and DPI.
FIG_W, FIG_H, FIG_DPI = 9, 5, 110

# Hex colours referenced by the plotting code.
color_blue, color_indigo, color_purple = "#033c73", "#6610f2", "#6f42c1"

# Global matplotlib defaults, applied once so every figure shares the same look.
plt.rcParams.update(
    {
        # Figure geometry
        "figure.figsize": (FIG_W, FIG_H),
        "figure.dpi": FIG_DPI,
        # Typography
        "font.family": "Ramabhadra",
        "font.weight": "bold",
        "axes.titlesize": 14,
        "axes.titleweight": "bold",
        "axes.labelsize": 12,
        "axes.labelweight": "bold",
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
        "legend.fontsize": 10,
        # Light background grid
        "axes.grid": True,
        "grid.alpha": 0.25,
    }
)

# pandas console output: 100-character width, at most 10 columns,
# floats rendered with three decimals.
pd.set_option("display.width", 100)
pd.set_option("display.max_columns", 10)
pd.set_option("display.float_format", lambda x: f"{x:0.3f}")
def make_ohe() -> OneHotEncoder:
    """Build a sparse, unknown-tolerant one-hot encoder for the installed scikit-learn.

    From scikit-learn 1.2 onward the sparsity switch is passed as
    ``sparse_output``; earlier releases receive it as ``sparse``. In both
    cases the flag is set to ``True`` and ``handle_unknown="ignore"`` is used.

    Returns:
        A new, unfitted ``OneHotEncoder``.
    """
    uses_sparse_output = version.parse(sklearn.__version__) >= version.parse("1.2")
    sparse_kwarg = "sparse_output" if uses_sparse_output else "sparse"
    return OneHotEncoder(handle_unknown="ignore", **{sparse_kwarg: True})
def section(title: str):
    """Print ``title`` as a section heading.

    The output is a blank line, the title, and a row of dashes exactly as
    long as the title.

    Args:
        title: Heading text to print.
    """
    underline = "-" * len(title)
    # Three newline-separated fields: empty (blank line), title, underline.
    print("", title, underline, sep="\n")
def fit_and_report(name, pipeline, X_train, y_train, X_test, y_test):
    """Fit ``pipeline`` on the training data and print test-set diagnostics.

    Printed, in order: the ROC AUC, a classification report table and a
    2x2 confusion matrix, each under a heading built from ``name``.

    Args:
        name: Label used in the printed section headings.
        pipeline: Estimator exposing ``fit`` and ``predict_proba``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        Tuple ``(auc, fpr, tpr, y_proba, y_pred)``: the ROC AUC, the ROC
        curve coordinates, the scores taken from column 1 of
        ``predict_proba``, and the labels obtained by thresholding those
        scores at 0.5.
    """
    section(f"{name} – fit")
    pipeline.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score; 0.5 is the decision cut-off.
    scores = pipeline.predict_proba(X_test)[:, 1]
    labels = (scores >= 0.5).astype(int)

    auc = roc_auc_score(y_test, scores)
    print(f"ROC AUC: {auc:0.3f}")

    # Classification report as a table with one row per class/average.
    report = classification_report(y_test, labels, output_dict=True, zero_division=0)
    report_columns = ["precision", "recall", "f1-score", "support"]
    report_df = pd.DataFrame(report).transpose()[report_columns]
    section(f"{name} – classification report")
    print(report_df.round(3))

    # Confusion matrix with readable row/column labels.
    matrix_df = pd.DataFrame(
        confusion_matrix(y_test, labels),
        index=["true_0", "true_1"],
        columns=["pred_0", "pred_1"],
    )
    section(f"{name} – confusion matrix")
    print(matrix_df)

    # Thresholds (third element of roc_curve's result) are not needed by callers.
    fpr, tpr = roc_curve(y_test, scores)[:2]
    return auc, fpr, tpr, scores, labels
# Location of the dataset, relative to the working directory.
data_path = Path("data", "MusicData.csv")
# The file is read as ISO-8859-1 (Latin-1) text.
df = pd.read_csv(data_path, encoding="ISO-8859-1")
print('Shape:', df.shape)
Shape: (7574, 39)
Columns (first 15): ['year', 'songtitle', 'artistname', 'songID', 'artistID', 'timesignature', 'timesignature_confidence', 'loudness', 'tempo', 'tempo_confidence', 'key', 'key_confidence', 'energy', 'pitch', 'timbre_0_min']
year songtitle artistname songID \
0 2010 This Is the House That Doubt Built A Day to Remember SOBGGAB12C5664F054
1 2010 Sticks & Bricks A Day to Remember SOPAQHU1315CD47F31
2 2010 All I Want A Day to Remember SOOIZOU1376E7C6386
artistID ... timbre_10_min timbre_10_max timbre_11_min timbre_11_max Top10
0 AROBSHL1187B9AFB01 ... -126.440 18.658 -44.770 25.989 0
1 AROBSHL1187B9AFB01 ... -103.808 121.935 -38.892 22.513 0
2 AROBSHL1187B9AFB01 ... -108.313 33.300 -43.733 25.744 0
[3 rows x 39 columns]
Code
# Identifier and free-text columns are excluded from the feature matrix.
drop_cols = ["songID", "artistID", "songtitle", "artistname"]

# errors="ignore" already skips labels that are absent, so no pre-filtering
# of drop_cols is needed.
X = df.drop(columns=["Top10", *drop_cols], errors="ignore")
y = df["Top10"].astype(int)

# Split columns by "numeric vs. everything else" rather than by
# dtype == "object": pandas string and categorical dtypes are not "object"
# and would otherwise be routed to the numeric scaler.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
_numeric = set(num_cols)
cat_cols = [c for c in X.columns if c not in _numeric]

# 70/30 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print('Train shape:', X_train.shape, '| Test shape:', X_test.shape)
Train shape: (5301, 34) | Test shape: (2273, 34)
Code
# Preprocessing: scale numeric columns without centring (with_mean=False)
# and one-hot encode categorical columns. sparse_threshold=1.0 is passed
# to the ColumnTransformer unchanged.
preprocess = ColumnTransformer(
    [
        ("num", StandardScaler(with_mean=False), num_cols),
        ("cat", make_ohe(), cat_cols),
    ],
    sparse_threshold=1.0,
)

# Baseline model: logistic regression with the SAGA solver on the
# preprocessed features.
baseline = Pipeline(
    [
        ("prep", preprocess),
        ("clf", LogisticRegression(solver="saga", max_iter=1000)),
    ]
)

# Fit on the training split and print the test-set diagnostics.
auc_base, fpr_b, tpr_b, y_proba_base, y_pred_base = fit_and_report(
    name="Baseline logistic regression",
    pipeline=baseline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
Baseline logistic regression – fit
----------------------------------
ROC AUC: 0.812
Baseline logistic regression – classification report
----------------------------------------------------
precision recall f1-score support
0 0.874 0.983 0.925 1937.000
1 0.656 0.182 0.284 336.000
accuracy 0.865 0.865 0.865 0.865
macro avg 0.765 0.583 0.605 2273.000
weighted avg 0.842 0.865 0.831 2273.000
Baseline logistic regression – confusion matrix
-----------------------------------------------
pred_0 pred_1
true_0 1905 32
true_1 275 61
Code
(0.0, 1.0)
(0.0, 1.0)
Code

Code
# L1-penalised logistic regression whose regularisation strength is picked
# by 3-fold cross-validation over 5 candidate values, scored by ROC AUC.
logit_cv = LogisticRegressionCV(
    Cs=5,
    cv=3,
    penalty="l1",
    solver="saga",
    scoring="roc_auc",
    max_iter=1500,
    refit=True,
    n_jobs=-1,
)

# Use an independent copy of the preprocessing step. Pipeline.fit fits its
# steps in place, so reusing the very same ColumnTransformer object that the
# baseline pipeline holds would silently overwrite the baseline's fitted state.
improved = Pipeline(
    steps=[
        ("prep", clone(preprocess)),
        ("clf", logit_cv),
    ]
)

# Fit on the training split and print the test-set diagnostics.
auc_imp, fpr_i, tpr_i, y_proba_imp, y_pred_imp = fit_and_report(
    "Improved L1-regularized logistic regression",
    improved,
    X_train,
    y_train,
    X_test,
    y_test,
)
Improved L1-regularized logistic regression – fit
-------------------------------------------------
ROC AUC: 0.816
Improved L1-regularized logistic regression – classification report
-------------------------------------------------------------------
precision recall f1-score support
0 0.877 0.977 0.924 1937.000
1 0.614 0.208 0.311 336.000
accuracy 0.864 0.864 0.864 0.864
macro avg 0.745 0.593 0.618 2273.000
weighted avg 0.838 0.864 0.834 2273.000
Improved L1-regularized logistic regression – confusion matrix
--------------------------------------------------------------
pred_0 pred_1
true_0 1893 44
true_1 266 70
Code
# Overlay the ROC curves of both models on a fresh figure.
plt.figure()

# Baseline model.
plt.plot(fpr_b, tpr_b, color=color_blue, linewidth=2.0, label=f"BASELINE AUC = {auc_base:0.3f}")

# Cross-validated L1 model.
plt.plot(fpr_i, tpr_i, color=color_indigo, linewidth=2.0, label=f"IMPROVED AUC = {auc_imp:0.3f}")

# Dashed diagonal from (0, 0) to (1, 1) as the chance-level reference.
plt.plot([0, 1], [0, 1], "--", color=color_purple, linewidth=1.5, label="CHANCE")
plt.xlim(0, 1)
(0.0, 1.0)
(0.0, 1.0)
Code

Code
# Fitted preprocessing step of the improved pipeline; it supplies the
# names of the one-hot encoded columns.
prep = improved.named_steps["prep"]

cat_names = []
if cat_cols:
    enc = prep.named_transformers_["cat"]
    try:
        cat_names = list(enc.get_feature_names_out(cat_cols))
    except AttributeError:
        # Fallback for encoders without get_feature_names_out. The result is
        # wrapped in list() as well: an ndarray here would make
        # `num_cols + cat_names` below an elementwise operation instead of a
        # list concatenation.
        cat_names = list(enc.get_feature_names(cat_cols))

# Feature order mirrors the ColumnTransformer definition: numeric, then categorical.
features = num_cols + cat_names
coefs = improved.named_steps["clf"].coef_.ravel()

# One row per feature: coefficient and its exponential (odds multiplier).
coef_table = pd.DataFrame(
    {
        "feature": features,
        "beta": coefs,
        "odds_multiplier": np.exp(coefs),
    }
)

# Order rows by absolute coefficient size, largest first.
coef_table = coef_table.reindex(
    coef_table["beta"].abs().sort_values(ascending=False).index
)
print(coef_table.head(15).round({"beta": 3, "odds_multiplier": 3}))
feature beta odds_multiplier
11 timbre_0_max -1.066 0.344
3 loudness 0.902 2.464
9 pitch -0.621 0.537
22 timbre_6_min -0.422 0.656
12 timbre_1_min 0.393 1.481
17 timbre_3_max -0.356 0.700
32 timbre_11_min -0.293 0.746
10 timbre_0_min 0.260 1.297
33 timbre_11_max 0.248 1.282
2 timesignature_confidence 0.229 1.258
18 timbre_4_min 0.212 1.236
20 timbre_5_min -0.180 0.836
19 timbre_4_max 0.169 1.185
8 energy -0.160 0.852
5 tempo_confidence 0.150 1.161
