EDA: Match Histories

Exploring the scraped eloratings.net data and prototyping the per-team model

Author

Miguel R.

Published

July 5, 2026

The snapshot dataset turned out to be a map rather than a model, so we went to the source: eloratings.net publishes the complete match history of every national team, with the Elo rating before and after each match, the score, the match type and the location. A small scraper (elo_scraper.js) downloads one TSV per qualified team.

This notebook is the first contact with that data, and it doubles as the prototype of the modeling approach used by the predictor: the time-decay weighting and the per-team Poisson regression both started here.

1. Loading and reshaping

tsv_loader parses a team’s raw TSV; tsv_converter reshapes it into the model’s view of the world: one row per match from that team’s perspective — its Elo going in, the opponent’s Elo, the match type, the location, and how many goals it scored.

Code
import numpy as np
import pandas as pd

from tsv_utils import tsv_converter, tsv_loader
Code
# INFO: every entry of a group must be a (file name, country code) tuple, e.g. ('Mexico', 'MX').


def load_group(group: list, date: str = None) -> list[pd.DataFrame]:
    """Load and reshape the match history of every team in *group*.

    Each entry of *group* is unpacked as ``(country, code)``: ``country`` is
    handed to ``tsv_loader`` and ``code`` to ``tsv_converter``. When *date*
    is given, rows whose ``'date'`` is later than it are dropped before the
    conversion. The returned list follows the order of *group*.
    """

    def _load_team(country, code):
        matches = tsv_loader(country)
        if date is not None:
            # Keep only the matches played on or before the cutoff.
            not_after_cutoff = matches['date'] <= pd.to_datetime(date)
            matches = matches[not_after_cutoff]
        return tsv_converter(matches, code)

    return [_load_team(country, code) for country, code in group]

2. Group A up close

Group A (the Mexico group) is the guinea pig: load all four teams and look at the Elo trajectories since the 2018 World Cup.

Code
from tsv_utils import GROUP_A
# One converted frame per team; the four-name unpacking requires GROUP_A to hold
# exactly four entries (presumably Mexico, South Africa, South Korea, Czechia in
# that order -- confirm against tsv_utils.GROUP_A).
mexico, south_africa, south_korea, czechia = load_group(GROUP_A)
Code
# explore the data
import seaborn as sns
import matplotlib.pyplot as plt

# Lower date cutoff used by the plots and the Poisson prototype.
# NOTE(review): the name says "2016" but the value is 2018-06-14, which matches the
# "since the 2018 World Cup" wording of the surrounding text -- consider renaming.
wrc_2016_first_match_dt = pd.to_datetime("2018-06-14")
# NOTE(review): not referenced anywhere in the visible notebook code; purpose unclear.
wrc_2008_first_match_dt = pd.to_datetime("2008-12-11")

# group A
Code
import math


def plot_group(group, since=None):
    """Plot the Elo trajectory of every team in *group* after a cutoff date.

    Parameters
    ----------
    group : sequence of (file name, country code) tuples
        Passed as-is to ``load_group``; the file name doubles as the legend
        label of the team.
    since : date-like, optional
        Only matches strictly after this date are plotted. Defaults to the
        module-level ``wrc_2016_first_match_dt`` cutoff.

    Raises
    ------
    ValueError
        If *group* is empty (there would be nothing to plot).

    Notes
    -----
    The x-axis label reports two summary numbers computed from each team's
    most recent Elo in the plotted window: the mean over all teams and the
    mean over the three highest-rated teams.
    """
    if not group:
        raise ValueError("plot_group needs at least one (file name, country code) entry")

    cutoff = wrc_2016_first_match_dt if since is None else pd.to_datetime(since)

    frames = []
    for (team_name, _code), team_df in zip(group, load_group(group)):
        # assign() returns a new frame, so the loaded frame is not mutated.
        team_df = team_df.assign(team=team_name).reset_index()
        frames.append(team_df[team_df['date'] > cutoff])
    plot_data = pd.concat(frames)

    # Newest match first within each team; the first row per team is its latest Elo.
    newest_first = plot_data.sort_values(by=['team', 'date'], ascending=False)
    latest_elo = (newest_first
                  .drop_duplicates(subset='team')
                  .set_index('team')['current_team_elo'])
    three_best_elo = latest_elo.sort_values(ascending=False).head(3)

    ax = sns.lineplot(data=plot_data, x='date', y='current_team_elo', hue='team')
    # Put the legend outside the axes so it does not cover the lines.
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    ax.set_ylabel('Elo Rating')

    ax.set_xlabel(
        f"avg elo:{math.floor(latest_elo.mean())} - top 3 avg:{math.floor(three_best_elo.mean())}")
    plt.show()

3. Prototype: time decay + Poisson

The final piece explored here became the core of the predictor. Two ideas:

  • Time decay. A match from six years ago should not weigh as much as one from last month. Each match gets an exponential weight with a two-year half-life.
  • Poisson regression on goals. Goals in football are classic Poisson territory: small counts, roughly constant intensity. We fit goals ~ opponent Elo + match type + location, weighted by recency, for a single team - Mexico - as a proof of concept.

The productionized version of this cell (with categorical-level guards, scaling and NaN handling) lives in model_utils.py.

Code
from enum import Enum

# Prediction workflows accepted by predict_match. In the visible code only
# ATK_DEF is checked explicitly; every other value falls through to the plain
# Poisson prediction.
Workflow = Enum("Workflow", [("NAIVE", 1), ("ATK_DEF", 2), ("HIST", 3)])
from elo_utils import get_elo_atk_def
import statsmodels.api as sm
import statsmodels.formula.api as smf

# GROUP_A = [('Mexico', 'MX'), ('South_Africa', 'ZA'), ('South_Korea', 'KR'), ('Czechia', 'CZ')]

# Reload the group so this cell does not depend on the frames created earlier.
mx_df, za_df, kr_df, cz_df = load_group(GROUP_A)

# Despite the "2016" in its name, this cutoff holds 2018-06-14.
from_date = wrc_2016_first_match_dt

# reset_index() exposes the index as a column so the 'date' filter below works
# (presumably tsv_converter returns frames indexed by date -- confirm).
# NOTE(review): only mx_df is reset and filtered; za_df, kr_df and cz_df keep
# their full history and original index.
mx_df = mx_df.reset_index()
mx_df = mx_df[mx_df['date'] > from_date]


# create the weighted column

def add_time_decay_weights(
        df: pd.DataFrame,
        date_col: str = "date",
        half_life_days: float = 365 * 2
) -> pd.DataFrame:
    """
    Add exponential time-decay weights to a dataframe.

    More recent matches have higher weights.

    The reference date is not today, it's the most recent date in the data,
    so the newest match always gets a weight of exactly 1.

    Parameters
    ----------
    df : DataFrame
        Match data with a date column. It is not modified; a copy is returned.

    date_col : str
        Column containing match dates (anything ``pd.to_datetime`` accepts).

    half_life_days : float
        How fast old matches lose importance. Must be strictly positive.
        Example:
        - 365*2 → every 2 years weight halves
        - smaller → faster adaptation

    Returns
    -------
    DataFrame
        Copy of ``df`` with ``date_col`` converted to datetime and two extra
        columns: ``days_since`` (whole days before the most recent match) and
        ``time_weight`` (``0.5 ** (days_since / half_life_days)``).

    Raises
    ------
    ValueError
        If ``half_life_days`` is not strictly positive.
    """
    # A zero half-life divides by zero (NaN weights) and a negative one would
    # make old matches weigh more than recent ones, so reject both up front.
    if not half_life_days > 0:
        raise ValueError(f"half_life_days must be positive, got {half_life_days!r}")

    df = df.copy()

    # Ensure datetime format
    df[date_col] = pd.to_datetime(df[date_col])

    # Use most recent match as reference point
    max_date = df[date_col].max()

    # Compute time difference in whole days
    df["days_since"] = (max_date - df[date_col]).dt.days

    # Exponential decay:
    # weight = exp(-lambda * t)
    # lambda = ln(2) / half_life
    decay_rate = np.log(2) / half_life_days
    df["time_weight"] = np.exp(-decay_rate * df["days_since"])
    return df


def train_poisson_model(team_df: pd.DataFrame):
    """Fit a recency-weighted Poisson GLM of goals scored for a single team.

    Parameters
    ----------
    team_df : DataFrame
        One row per match with a ``'date'`` column, the ``'goals_converted'``
        target and the regressor columns. It is not modified.

    Returns
    -------
    The fitted statsmodels GLM results object.

    Notes
    -----
    Every column except the target, ``'date'`` and the weight column enters
    the formula as a regressor. That includes ``'days_since'`` (added by
    ``add_time_decay_weights``), which ``predict_match`` sets to 0 when
    predicting.
    """
    weight_col = 'time_weight'
    dep_var = 'goals_converted'

    # add_time_decay_weights already works on its own copy of the frame.
    weighted = add_time_decay_weights(team_df, date_col="date")

    # 'date' is dropped because 'days_since' carries the same information.
    excluded = {dep_var, weight_col, 'date'}
    regressors = [col for col in weighted.columns if col not in excluded]
    formula = f"{dep_var} ~ {' + '.join(regressors)}"

    # The decay weights must be handed to the GLM explicitly; without them the
    # fit treats a six-year-old match exactly like last month's.
    model = smf.glm(
        formula=formula,
        data=weighted,
        family=sm.families.Poisson(),
        var_weights=weighted[weight_col],
    ).fit()
    return model



def predict_match(
        team_1: pd.DataFrame,
        team_2: pd.DataFrame,
        workflow: Workflow = Workflow.NAIVE,
        team_1_name: str = 'Mexico',
        team_2_name: str = 'South Africa',
):
    """
    With two dataframes, predicts the result of the match.

    A Poisson model is trained per team; each model is then evaluated on the
    most recent row of the *other* team with ``days_since`` set to 0.

    NOTE(review): that row describes the other team's latest match from its
    own perspective, so any opponent-related column refers to that team's last
    opponent rather than to the team being modelled -- confirm this is the
    intended feature row.

    :param team_1: pandas dataframe with the match history of the first team
    :param team_2: pandas dataframe with the match history of the second team
    :param workflow: Workflow.ATK_DEF rescales the predictions with the
        attack/defence values from get_elo_atk_def; any other value returns
        the raw Poisson predictions
    :param team_1_name: name passed to get_elo_atk_def for the first team
        (only used by Workflow.ATK_DEF)
    :param team_2_name: name passed to get_elo_atk_def for the second team
        (only used by Workflow.ATK_DEF)
    :return: tuple (goals_1, goals_2)
    """
    dep_var = 'goals_converted'

    def _with_date_column(frame):
        # Reset only when 'date' is not a column yet: resetting an already
        # reset frame would add an 'index' column that then leaks into the
        # model formula as a regressor.
        frame = frame.copy()
        return frame if 'date' in frame.columns else frame.reset_index()

    def _latest_features(history):
        # Most recent match, treated as "played today" (days_since = 0).
        latest = (history
                  .sort_values(by='date', ascending=False)
                  .head(1)
                  .reset_index(drop=True))
        latest['days_since'] = 0
        return latest.drop([dep_var, 'date'], axis=1)

    history_1 = _with_date_column(team_1)
    history_2 = _with_date_column(team_2)

    model_1 = train_poisson_model(history_1)
    model_2 = train_poisson_model(history_2)

    # Positional access, so the result does not depend on the index labels.
    goals_1 = np.asarray(model_1.predict(_latest_features(history_2)))[0]
    goals_2 = np.asarray(model_2.predict(_latest_features(history_1)))[0]

    if workflow == Workflow.ATK_DEF:
        # Scale each side's goals by its attack over the opponent's defence.
        team_1_atk, team_1_def = get_elo_atk_def(team_1_name)
        team_2_atk, team_2_def = get_elo_atk_def(team_2_name)
        return (goals_1 * (team_1_atk / team_2_def),
                goals_2 * (team_2_atk / team_1_def))

    return goals_1, goals_2





# print("NAIVE")
# goals_mx, goals_za = predict_match(mx_df, za_df)
# print(f"mx goals: {goals_mx} za goals: {goals_za}")
# goals_kr, goals_cz = predict_match(kr_df, cz_df)
# print(f"kr goals: {goals_kr} cz goals: {goals_cz}")
#
# print("ATK DEF")
# goals_mx, goals_za = predict_match(mx_df, za_df, workflow=Workflow.ATK_DEF)
# print(f"mx goals: {goals_mx} za goals: {goals_za}")
# goals_kr, goals_cz = predict_match(kr_df, cz_df, workflow=Workflow.ATK_DEF)
# print(f"kr goals: {goals_kr} cz goals: {goals_cz}")