Ashish Srikanth - Predicting Top-10 Songs: A Logistic + LASSO Walkthrough in Python

Project Spotlight

Code

import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix

from packaging import version
import sklearn

# Default figure geometry (inches and dots-per-inch) for every plot below.
FIG_W, FIG_H = 9, 5
FIG_DPI = 110

# Hex colours used for the curves drawn in the ROC figures.
color_blue = "#033c73"
color_indigo = "#6610f2"
color_purple = "#6f42c1"

# Global matplotlib style, grouped by concern and applied in one update.
_rc_style = {
    # figure
    "figure.figsize": (FIG_W, FIG_H),
    "figure.dpi": FIG_DPI,
    # fonts
    "font.family": "Ramabhadra",
    "font.weight": "bold",
    # axes titles and labels
    "axes.titlesize": 14,
    "axes.titleweight": "bold",
    "axes.labelsize": 12,
    "axes.labelweight": "bold",
    # ticks and legend
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "legend.fontsize": 10,
    # light background grid
    "axes.grid": True,
    "grid.alpha": 0.25,
}
plt.rcParams.update(_rc_style)

# Console rendering of DataFrames: bounded width/column count and floats
# printed with three decimals.
pd.set_option("display.width", 100)
pd.set_option("display.max_columns", 10)
pd.set_option("display.float_format", "{:0.3f}".format)


def make_ohe() -> OneHotEncoder:
    """Build a sparse, unknown-tolerant one-hot encoder for any sklearn version.

    The keyword that requests sparse output is ``sparse_output`` from
    scikit-learn 1.2 onwards and ``sparse`` before that, so the keyword name
    is picked from the installed version.

    Returns:
        An unfitted ``OneHotEncoder`` with ``handle_unknown="ignore"`` and
        sparse output enabled.
    """
    uses_new_keyword = version.parse(sklearn.__version__) >= version.parse("1.2")
    sparse_keyword = "sparse_output" if uses_new_keyword else "sparse"
    return OneHotEncoder(handle_unknown="ignore", **{sparse_keyword: True})


def section(title: str) -> None:
    """Print ``title`` after a blank line, underlined with dashes.

    Args:
        title: Heading text; the underline has the same number of characters.
    """
    underline = "-" * len(title)
    # An empty first item yields the leading blank line.
    print("", title, underline, sep="\n")


def fit_and_report(name, pipeline, X_train, y_train, X_test, y_test, threshold=0.5):
    """Fit ``pipeline`` and print and return its binary-classification metrics.

    The pipeline is fitted on the training data, scored on the test data, and
    three sections are printed: ROC AUC, a classification report, and a
    confusion matrix.

    Args:
        name: Label used in the printed section headings.
        pipeline: Estimator exposing ``fit`` and ``predict_proba``; column 1
            of ``predict_proba`` is used as the positive-class score.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels; the confusion matrix assumes they are coded
            as 0 and 1.
        threshold: Probability cutoff at or above which an observation is
            predicted as class 1. Defaults to 0.5.

    Returns:
        Tuple ``(auc, fpr, tpr, y_proba, y_pred)``: the ROC AUC, the ROC curve
        coordinates, the positive-class probabilities, and the thresholded
        0/1 predictions.
    """
    section(f"{name} – fit")
    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)
    auc = roc_auc_score(y_test, y_proba)
    print(f"ROC AUC: {auc:0.3f}")

    cr = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    # .copy() so the in-place row fix below never writes through a view.
    cr_df = pd.DataFrame(cr).T[["precision", "recall", "f1-score", "support"]].copy()
    if "accuracy" in cr_df.index:
        # "accuracy" is a single scalar in the report dict, so pandas
        # broadcasts it across every column. Keep it only under f1-score
        # (as scikit-learn's text report does) and show the real sample count
        # as support instead of repeating the accuracy value.
        cr_df.loc["accuracy", ["precision", "recall"]] = float("nan")
        cr_df.loc["accuracy", "support"] = len(y_test)
    section(f"{name} – classification report")
    print(cr_df.round(3))

    # Pin the label order so the matrix is always 2x2 and lines up with the
    # hard-coded row/column names, even if one class never appears.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    cm_df = pd.DataFrame(cm, index=["true_0", "true_1"], columns=["pred_0", "pred_1"])
    section(f"{name} – confusion matrix")
    print(cm_df)

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    return auc, fpr, tpr, y_proba, y_pred


# Location of the song dataset, relative to the working directory.
data_path = Path("data", "MusicData.csv")

# The file is decoded as ISO-8859-1 rather than pandas' default encoding.
df = pd.read_csv(data_path, encoding="ISO-8859-1")

# Quick sanity check: (rows, columns).
print("Shape:", df.shape)

Shape: (7574, 39)

Code

# Peek at the first fifteen column names as a plain Python list.
print("Columns (first 15):", df.columns[:15].tolist())

Columns (first 15): ['year', 'songtitle', 'artistname', 'songID', 'artistID', 'timesignature', 'timesignature_confidence', 'loudness', 'tempo', 'tempo_confidence', 'key', 'key_confidence', 'energy', 'pitch', 'timbre_0_min']

Code

# Show the first three rows (rendered as the cell's output).
df.head(n=3)

   year                           songtitle         artistname              songID  \
0  2010  This Is the House That Doubt Built  A Day to Remember  SOBGGAB12C5664F054   
1  2010                     Sticks & Bricks  A Day to Remember  SOPAQHU1315CD47F31   
2  2010                          All I Want  A Day to Remember  SOOIZOU1376E7C6386   

             artistID  ...  timbre_10_min  timbre_10_max  timbre_11_min  timbre_11_max  Top10  
0  AROBSHL1187B9AFB01  ...       -126.440         18.658        -44.770         25.989      0  
1  AROBSHL1187B9AFB01  ...       -103.808        121.935        -38.892         22.513      0  
2  AROBSHL1187B9AFB01  ...       -108.313         33.300        -43.733         25.744      0  

[3 rows x 39 columns]

Code

# Identifier and free-text columns are excluded from the feature matrix.
drop_cols = ["songID", "artistID", "songtitle", "artistname"]

# errors="ignore" already skips names that are absent, so no pre-filtering
# of drop_cols is needed.
X = df.drop(columns=["Top10", *drop_cols], errors="ignore")
y = df["Top10"].astype(int)

# Classify columns by numeric-ness instead of `dtype == "object"`: pandas
# "string" and "category" columns are not object-dtype and would otherwise be
# sent to the numeric scaler.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
_numeric = set(num_cols)
cat_cols = [c for c in X.columns if c not in _numeric]

# 70/30 split, stratified on the target so both parts keep the same class
# balance; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print('Train shape:', X_train.shape, '| Test shape:', X_test.shape)

Train shape: (5301, 34) | Test shape: (2273, 34)

Code

# Column-wise preprocessing: numeric columns are scaled to unit variance
# without centering (with_mean=False), non-numeric columns are one-hot
# encoded. sparse_threshold=1.0 raises the density cutoff below which
# ColumnTransformer stacks sparse transformer outputs as a sparse matrix.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), num_cols),
        ("cat", make_ohe(), cat_cols),
    ],
    sparse_threshold=1.0,
)

# Baseline: logistic regression with scikit-learn's default penalty settings.
# The saga solver shuffles the data, so it is seeded with the same value as
# the train/test split to make the reported metrics repeatable.
baseline = Pipeline(
    steps=[
        ("prep", preprocess),
        ("clf", LogisticRegression(max_iter=1000, solver="saga", random_state=42)),
    ]
)

# Fit on the training split and report hold-out metrics; the ROC coordinates
# are kept for the figures below.
auc_base, fpr_b, tpr_b, y_proba_base, y_pred_base = fit_and_report(
    "Baseline logistic regression",
    baseline,
    X_train,
    y_train,
    X_test,
    y_test,
)


Baseline logistic regression – fit
----------------------------------
ROC AUC: 0.812

Baseline logistic regression – classification report
----------------------------------------------------
              precision  recall  f1-score  support
0                 0.874   0.983     0.925 1937.000
1                 0.656   0.182     0.284  336.000
accuracy          0.865   0.865     0.865    0.865
macro avg         0.765   0.583     0.605 2273.000
weighted avg      0.842   0.865     0.831 2273.000

Baseline logistic regression – confusion matrix
-----------------------------------------------
        pred_0  pred_1
true_0    1905      32
true_1     275      61

Code

# New figure holding the baseline model's ROC curve.
plt.figure()

baseline_label = f"BASELINE AUC = {auc_base:0.3f}"
plt.plot(fpr_b, tpr_b, color=color_blue, linewidth=2.0, label=baseline_label)

# Dashed diagonal used as the "CHANCE" reference line.
plt.plot(
    [0, 1],
    [0, 1],
    linestyle="--",
    color=color_indigo,
    linewidth=1.5,
    label="CHANCE",
)

# Clamp the x axis to the [0, 1] range of a rate.
plt.xlim(left=0, right=1)

(0.0, 1.0)

Code

# Clamp the y axis to the [0, 1] range of a rate.
plt.ylim(bottom=0, top=1)

(0.0, 1.0)

Code

# Title and axis captions for the baseline ROC figure.
plt.title("ROC CURVE – BASELINE LOGISTIC REGRESSION")
plt.ylabel("TRUE POSITIVE RATE")
plt.xlabel("FALSE POSITIVE RATE")

# Legend goes in the lower-right corner.
plt.legend(loc="lower right")

# Tighten the layout, then render.
plt.tight_layout()
plt.show()

Code

# L1-penalised (LASSO-style) logistic regression. Regularisation strength is
# chosen from a grid of 5 values of C by 3-fold cross-validation on ROC AUC,
# and the model is then refitted on the full training data. The saga solver
# shuffles the data, so it is seeded with the same value as the train/test
# split to make the selected C and the reported metrics repeatable.
logit_cv = LogisticRegressionCV(
    Cs=5,
    cv=3,
    penalty="l1",
    solver="saga",
    scoring="roc_auc",
    max_iter=1500,
    refit=True,
    n_jobs=-1,
    random_state=42,
)

# Same preprocessing as the baseline, with the cross-validated classifier.
improved = Pipeline(
    steps=[
        ("prep", preprocess),
        ("clf", logit_cv),
    ]
)

# Fit on the training split and report hold-out metrics; the ROC coordinates
# are kept for the comparison figure below.
auc_imp, fpr_i, tpr_i, y_proba_imp, y_pred_imp = fit_and_report(
    "Improved L1-regularized logistic regression",
    improved,
    X_train,
    y_train,
    X_test,
    y_test,
)


Improved L1-regularized logistic regression – fit
-------------------------------------------------
ROC AUC: 0.816

Improved L1-regularized logistic regression – classification report
-------------------------------------------------------------------
              precision  recall  f1-score  support
0                 0.877   0.977     0.924 1937.000
1                 0.614   0.208     0.311  336.000
accuracy          0.864   0.864     0.864    0.864
macro avg         0.745   0.593     0.618 2273.000
weighted avg      0.838   0.864     0.834 2273.000

Improved L1-regularized logistic regression – confusion matrix
--------------------------------------------------------------
        pred_0  pred_1
true_0    1893      44
true_1     266      70

Code

# New figure overlaying the baseline and the L1-regularised ROC curves.
plt.figure()

baseline_label = f"BASELINE AUC = {auc_base:0.3f}"
improved_label = f"IMPROVED AUC = {auc_imp:0.3f}"
plt.plot(fpr_b, tpr_b, color=color_blue, linewidth=2.0, label=baseline_label)
plt.plot(fpr_i, tpr_i, color=color_indigo, linewidth=2.0, label=improved_label)

# Dashed diagonal used as the "CHANCE" reference line.
plt.plot(
    [0, 1],
    [0, 1],
    linestyle="--",
    color=color_purple,
    linewidth=1.5,
    label="CHANCE",
)

# Clamp the x axis to the [0, 1] range of a rate.
plt.xlim(left=0, right=1)

(0.0, 1.0)

Code

# Clamp the y axis to the [0, 1] range of a rate.
plt.ylim(bottom=0, top=1)

(0.0, 1.0)

Code

# Title and axis captions for the comparison ROC figure.
plt.title("ROC CURVE – BASELINE VS IMPROVED")
plt.ylabel("TRUE POSITIVE RATE")
plt.xlabel("FALSE POSITIVE RATE")

# Legend goes in the lower-right corner.
plt.legend(loc="lower right")

# Tighten the layout, then render.
plt.tight_layout()
plt.show()

Code

# Fitted preprocessing step of the improved pipeline; used to map coefficient
# positions back to column names.
prep = improved.named_steps["prep"]
cat_names = []

if cat_cols:
    enc = prep.named_transformers_["cat"]
    try:
        cat_names = list(enc.get_feature_names_out(cat_cols))
    except AttributeError:
        # Older scikit-learn only offers get_feature_names(), which returns
        # an ndarray. Convert it so `num_cols + cat_names` below stays a
        # plain list concatenation instead of a NumPy broadcast.
        cat_names = list(enc.get_feature_names(cat_cols))

# The ColumnTransformer emits numeric columns first, then the one-hot columns,
# so the names are concatenated in that order.
features = num_cols + cat_names
coefs = improved.named_steps["clf"].coef_.ravel()

# exp(beta) is the multiplicative change in the odds for a one-unit increase
# of the preprocessed feature (numeric features were scaled beforehand).
coef_table = pd.DataFrame(
    {
        "feature": features,
        "beta": coefs,
        "odds_multiplier": np.exp(coefs),
    }
)

# Order rows by absolute coefficient size, largest first.
coef_table = coef_table.reindex(
    coef_table["beta"].abs().sort_values(ascending=False).index
)

print(coef_table.head(15).round({"beta": 3, "odds_multiplier": 3}))

                     feature   beta  odds_multiplier
11              timbre_0_max -1.066            0.344
3                   loudness  0.902            2.464
9                      pitch -0.621            0.537
22              timbre_6_min -0.422            0.656
12              timbre_1_min  0.393            1.481
17              timbre_3_max -0.356            0.700
32             timbre_11_min -0.293            0.746
10              timbre_0_min  0.260            1.297
33             timbre_11_max  0.248            1.282
2   timesignature_confidence  0.229            1.258
18              timbre_4_min  0.212            1.236
20              timbre_5_min -0.180            0.836
19              timbre_4_max  0.169            1.185
8                     energy -0.160            0.852
5           tempo_confidence  0.150            1.161