Predicting Top-10 Songs: A Logistic + LASSO Walkthrough in Python

From music features to hit probabilities

predictive analytics
music
classification
glmnet
ROC
Python
Author

A. Srikanth

Published

December 5, 2025

Project Spotlight

Code
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from packaging import version
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Figure geometry (inches / dots-per-inch) applied to every plot below.
FIG_W, FIG_H = 9, 5
FIG_DPI = 110

# Line colours referenced by the ROC plots.
color_blue = "#033c73"
color_indigo = "#6610f2"
color_purple = "#6f42c1"

# Global matplotlib defaults: figure size, bold typography, faint grid.
plt.rcParams.update(
    {
        # figure
        "figure.figsize": (FIG_W, FIG_H),
        "figure.dpi": FIG_DPI,
        # fonts
        "font.family": "Ramabhadra",
        "font.weight": "bold",
        # axes titles and labels
        "axes.titlesize": 14,
        "axes.titleweight": "bold",
        "axes.labelsize": 12,
        "axes.labelweight": "bold",
        # ticks and legend
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
        "legend.fontsize": 10,
        # grid
        "axes.grid": True,
        "grid.alpha": 0.25,
    }
)

# Console rendering of DataFrames: 100-char lines, at most 10 columns,
# floats shown with three decimals.
pd.options.display.width = 100
pd.options.display.max_columns = 10
pd.options.display.float_format = "{:0.3f}".format


def make_ohe() -> OneHotEncoder:
    """Return a sparse-output ``OneHotEncoder`` that ignores unseen categories.

    The keyword that requests sparse output depends on the installed
    scikit-learn: ``sparse_output`` is used on 1.2 and newer, ``sparse``
    on anything older.
    """
    is_modern = version.parse(sklearn.__version__) >= version.parse("1.2")
    sparse_kwarg = "sparse_output" if is_modern else "sparse"
    return OneHotEncoder(handle_unknown="ignore", **{sparse_kwarg: True})


def section(title: str):
    """Print *title* after a blank line, underlined with one dash per character."""
    underline = "".join("-" for _ in title)
    print("", title, underline, sep="\n")


def fit_and_report(
    name, pipeline, X_train, y_train, X_test, y_test, *, threshold=0.5
):
    """Fit *pipeline*, print test-set diagnostics, and return the ROC artefacts.

    Prints, each under its own ``section`` heading: the ROC AUC, a
    per-class classification report, and a 2x2 confusion matrix.

    Args:
        name: Human-readable model name used in the printed headings.
        pipeline: Estimator exposing ``fit`` and ``predict_proba``.
        X_train, y_train: Training features and 0/1 labels.
        X_test, y_test: Held-out features and 0/1 labels.
        threshold: Probability at or above which an observation is
            predicted as class 1. Affects the classification report and
            confusion matrix only; the AUC and ROC curve are computed
            from the raw probabilities.

    Returns:
        Tuple ``(auc, fpr, tpr, y_proba, y_pred)``.
    """
    section(f"{name} – fit")
    pipeline.fit(X_train, y_train)

    # Second column of predict_proba is taken as the positive-class score.
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    auc = roc_auc_score(y_test, y_proba)
    print(f"ROC AUC: {auc:0.3f}")

    # zero_division=0 keeps the report quiet when a class is never predicted.
    cr = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    cr_df = pd.DataFrame(cr).T[["precision", "recall", "f1-score", "support"]]
    section(f"{name} – classification report")
    print(cr_df.round(3))

    # Pin the label order so the matrix is always 2x2 and lines up with the
    # hard-coded true_0/true_1 and pred_0/pred_1 names below.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    cm_df = pd.DataFrame(cm, index=["true_0", "true_1"], columns=["pred_0", "pred_1"])
    section(f"{name} – confusion matrix")
    print(cm_df)

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    return auc, fpr, tpr, y_proba, y_pred


# Load the song-level dataset; the file is read as ISO-8859-1 text.
data_path = Path("data", "MusicData.csv")
df = pd.read_csv(data_path, encoding="ISO-8859-1")

n_rows, n_cols = df.shape
print('Shape:', (n_rows, n_cols))
Shape: (7574, 39)
Code
# Show the first 15 column names.
print('Columns (first 15):', list(df.columns[:15]))
Columns (first 15): ['year', 'songtitle', 'artistname', 'songID', 'artistID', 'timesignature', 'timesignature_confidence', 'loudness', 'tempo', 'tempo_confidence', 'key', 'key_confidence', 'energy', 'pitch', 'timbre_0_min']
Code
# Preview the first three rows.
df.iloc[:3]
   year                           songtitle         artistname              songID  \
0  2010  This Is the House That Doubt Built  A Day to Remember  SOBGGAB12C5664F054   
1  2010                     Sticks & Bricks  A Day to Remember  SOPAQHU1315CD47F31   
2  2010                          All I Want  A Day to Remember  SOOIZOU1376E7C6386   

             artistID  ...  timbre_10_min  timbre_10_max  timbre_11_min  timbre_11_max  Top10  
0  AROBSHL1187B9AFB01  ...       -126.440         18.658        -44.770         25.989      0  
1  AROBSHL1187B9AFB01  ...       -103.808        121.935        -38.892         22.513      0  
2  AROBSHL1187B9AFB01  ...       -108.313         33.300        -43.733         25.744      0  

[3 rows x 39 columns]
Code
# Identifier and free-text columns are excluded from the feature matrix;
# errors="ignore" tolerates any of them being absent from the file.
drop_cols = ["songID", "artistID", "songtitle", "artistname"]
X = df.drop(columns=["Top10", *drop_cols], errors="ignore")
y = df["Top10"].astype(int)

# Route columns by dtype: whatever pandas considers numeric is scaled, and
# everything else is one-hot encoded. Testing only for dtype == "object" would
# send string-, category- or datetime-typed columns to the scaler.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

# 70/30 split, stratified on the target so both parts keep the same share of
# Top-10 songs; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print('Train shape:', X_train.shape, '| Test shape:', X_test.shape)
Train shape: (5301, 34) | Test shape: (2273, 34)
Code
# Shared preprocessing recipe: numeric columns are scaled without centering
# (with_mean=False) and categorical columns are one-hot encoded.
# sparse_threshold=1.0 asks the ColumnTransformer to return a sparse result
# whenever any transformer emits sparse output.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), num_cols),
        ("cat", make_ohe(), cat_cols),
    ],
    sparse_threshold=1.0,
)

# Baseline: logistic regression with the estimator's default penalty settings.
# The saga solver shuffles the data, so random_state is fixed to make the
# fitted coefficients (and the metrics below) reproducible between runs.
baseline = Pipeline(
    steps=[
        ("prep", preprocess),
        ("clf", LogisticRegression(max_iter=1000, solver="saga", random_state=42)),
    ]
)

auc_base, fpr_b, tpr_b, y_proba_base, y_pred_base = fit_and_report(
    "Baseline logistic regression",
    baseline,
    X_train,
    y_train,
    X_test,
    y_test,
)

Baseline logistic regression – fit
----------------------------------
ROC AUC: 0.812

Baseline logistic regression – classification report
----------------------------------------------------
              precision  recall  f1-score  support
0                 0.874   0.983     0.925 1937.000
1                 0.656   0.182     0.284  336.000
accuracy          0.865   0.865     0.865    0.865
macro avg         0.765   0.583     0.605 2273.000
weighted avg      0.842   0.865     0.831 2273.000

Baseline logistic regression – confusion matrix
-----------------------------------------------
        pred_0  pred_1
true_0    1905      32
true_1     275      61
Code
# Baseline ROC curve, drawn against the diagonal chance line.
plt.figure()
plt.plot(fpr_b, tpr_b, color=color_blue, linewidth=2.0, label=f"BASELINE AUC = {auc_base:0.3f}")
plt.plot([0, 1], [0, 1], "--", color=color_indigo, linewidth=1.5, label="CHANCE")
plt.xlim(left=0, right=1)
(0.0, 1.0)
Code
# Clamp the y-axis to the [0, 1] range.
plt.ylim(bottom=0, top=1)
(0.0, 1.0)
Code
# Title, axis labels and legend, then render the baseline figure.
plt.title("ROC CURVE – BASELINE LOGISTIC REGRESSION")
plt.xlabel("FALSE POSITIVE RATE")
plt.ylabel("TRUE POSITIVE RATE")
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()

Code
# L1-penalised logistic regression; the penalty strength is chosen from a grid
# of 5 C values by 3-fold cross-validation scored on ROC AUC, then refit.
# random_state pins the saga solver's data shuffling so results are repeatable.
logit_cv = LogisticRegressionCV(
    Cs=5,
    cv=3,
    penalty="l1",
    solver="saga",
    scoring="roc_auc",
    max_iter=1500,
    refit=True,
    n_jobs=-1,
    random_state=42,
)

# Use an unfitted copy of the preprocessing recipe: passing the same
# `preprocess` object that the baseline pipeline holds would refit the
# baseline's transformer in place when this pipeline is fitted.
improved = Pipeline(
    steps=[
        ("prep", clone(preprocess)),
        ("clf", logit_cv),
    ]
)

auc_imp, fpr_i, tpr_i, y_proba_imp, y_pred_imp = fit_and_report(
    "Improved L1-regularized logistic regression",
    improved,
    X_train,
    y_train,
    X_test,
    y_test,
)

Improved L1-regularized logistic regression – fit
-------------------------------------------------
ROC AUC: 0.816

Improved L1-regularized logistic regression – classification report
-------------------------------------------------------------------
              precision  recall  f1-score  support
0                 0.877   0.977     0.924 1937.000
1                 0.614   0.208     0.311  336.000
accuracy          0.864   0.864     0.864    0.864
macro avg         0.745   0.593     0.618 2273.000
weighted avg      0.838   0.864     0.834 2273.000

Improved L1-regularized logistic regression – confusion matrix
--------------------------------------------------------------
        pred_0  pred_1
true_0    1893      44
true_1     266      70
Code
# Overlay the baseline and improved ROC curves against the chance diagonal.
plt.figure()
plt.plot(fpr_b, tpr_b, color=color_blue, linewidth=2.0, label=f"BASELINE AUC = {auc_base:0.3f}")
plt.plot(fpr_i, tpr_i, color=color_indigo, linewidth=2.0, label=f"IMPROVED AUC = {auc_imp:0.3f}")
plt.plot([0, 1], [0, 1], "--", color=color_purple, linewidth=1.5, label="CHANCE")
plt.xlim(left=0, right=1)
(0.0, 1.0)
Code
# Clamp the y-axis to the [0, 1] range.
plt.ylim(bottom=0, top=1)
(0.0, 1.0)
Code
# Title, axis labels and legend, then render the comparison figure.
plt.title("ROC CURVE – BASELINE VS IMPROVED")
plt.xlabel("FALSE POSITIVE RATE")
plt.ylabel("TRUE POSITIVE RATE")
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()

Code
# Recover the feature names in the order the fitted ColumnTransformer emits
# them: numeric columns first, then the expanded one-hot columns.
prep = improved.named_steps["prep"]
cat_names = []

if cat_cols:
    enc = prep.named_transformers_["cat"]
    try:
        cat_names = list(enc.get_feature_names_out(cat_cols))
    except AttributeError:
        # Encoders without get_feature_names_out only offer
        # get_feature_names, which returns an array; convert it so the `+`
        # below is list concatenation rather than array arithmetic.
        cat_names = list(enc.get_feature_names(cat_cols))

features = num_cols + cat_names
coefs = improved.named_steps["clf"].coef_.ravel()

# Numeric inputs were scaled by StandardScaler before fitting, so each beta
# (and exp(beta), the odds multiplier) is expressed per scaled unit of the
# feature rather than per raw unit.
coef_table = pd.DataFrame(
    {
        "feature": features,
        "beta": coefs,
        "odds_multiplier": np.exp(coefs),
    }
)

# Order rows by coefficient magnitude, largest first.
coef_table = coef_table.reindex(
    coef_table["beta"].abs().sort_values(ascending=False).index
)

print(coef_table.head(15).round({"beta": 3, "odds_multiplier": 3}))
                     feature   beta  odds_multiplier
11              timbre_0_max -1.066            0.344
3                   loudness  0.902            2.464
9                      pitch -0.621            0.537
22              timbre_6_min -0.422            0.656
12              timbre_1_min  0.393            1.481
17              timbre_3_max -0.356            0.700
32             timbre_11_min -0.293            0.746
10              timbre_0_min  0.260            1.297
33             timbre_11_max  0.248            1.282
2   timesignature_confidence  0.229            1.258
18              timbre_4_min  0.212            1.236
20              timbre_5_min -0.180            0.836
19              timbre_4_max  0.169            1.185
8                     energy -0.160            0.852
5           tempo_confidence  0.150            1.161