Code
import numpy as np
import pandas as pd
from tsv_utils import tsv_converter, tsv_loader
Exploring the scraped eloratings.net data and prototyping the per-team model
Miguel R.
July 5, 2026
The snapshot dataset turned out to be a map rather than a model, so we went to the source: eloratings.net publishes the complete match history of every national team, with the Elo rating before and after each match, the score, the match type and the location. A small scraper (elo_scraper.js) downloads one TSV per qualified team.
This notebook is the first contact with that data, and it doubles as the prototype of the modeling approach used by the predictor: the time-decay weighting and the per-team Poisson regression both started here.
tsv_loader parses a team’s raw TSV; tsv_converter reshapes it into the model’s view of the world: one row per match from that team’s perspective - its Elo going in, the opponent’s Elo, the match type, the location, and how many goals it scored.
# INFO: every value should be a tuple with the name of the file and the country code.
def load_group(group: list, date: str = None) -> list[pd.DataFrame]:
    """Load and convert the match history of every team in a group.

    Parameters
    ----------
    group : list
        ``(country, code)`` pairs. ``country`` is passed to ``tsv_loader``
        and ``code`` is passed to ``tsv_converter``.
    date : str, optional
        When given, only rows whose ``'date'`` is on or before this value
        (parsed with ``pd.to_datetime``) are kept before conversion.

    Returns
    -------
    list[pd.DataFrame]
        One converted frame per entry of ``group``, in the same order.
    """
    # Parse the cutoff once instead of once per team.
    cutoff = pd.to_datetime(date) if date is not None else None

    dfs: list[pd.DataFrame] = []
    for country, code in group:
        df = tsv_loader(country)
        if cutoff is not None:
            df = df[df['date'] <= cutoff]
        dfs.append(tsv_converter(df, code))
    return dfs


# Group A (the Mexico group) is the guinea pig: load all four teams and look
# at the Elo trajectories since the 2018 World Cup.
import math
def plot_group(group):
    """Plot the Elo trajectory of every team in ``group`` on a single chart.

    Each team's history is loaded through ``load_group``, restricted to
    matches played after ``wrc_2016_first_match_dt`` and drawn as one line per
    team. The x-axis label reports the mean of the teams' most recent Elo
    ratings and the mean of the three highest of those ratings.

    Relies on ``sns``, ``plt`` and ``wrc_2016_first_match_dt`` being defined
    at module level.

    Parameters
    ----------
    group : list
        ``(country, code)`` pairs, as accepted by ``load_group``. The first
        element of each pair is used as the team label in the legend.

    Raises
    ------
    ValueError
        If ``group`` is empty.
    """
    if not group:
        raise ValueError("group must contain at least one team")

    frames = []
    for entry, team_df in zip(group, load_group(group)):
        team_df = team_df.assign(team=entry[0]).reset_index()
        # NOTE(review): the constant is named "2016" while the notebook text
        # introducing this cell says "since the 2018 World Cup" - confirm.
        frames.append(team_df[team_df['date'] > wrc_2016_first_match_dt])
    # Concatenate once rather than growing the frame on every iteration.
    plot_data = pd.concat(frames)

    # Most recent rating of each team: newest rows first, keep the first row
    # seen for every team.
    latest_elo = (
        plot_data
        .sort_values(by='date', ascending=False)
        .drop_duplicates(subset='team')
        .set_index('team')['current_team_elo']
    )
    three_best_elo = latest_elo.sort_values(ascending=False).head(3)

    ax = sns.lineplot(data=plot_data, x='date', y='current_team_elo', hue='team')
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    ax.set_ylabel('Elo Rating')
    ax.set_xlabel(
        f"avg elo:{math.floor(latest_elo.mean())} - top 3 avg:{math.floor(three_best_elo.mean())}")
    plt.show()
The final piece explored here became the core of the predictor. Two ideas:
goals ~ opponent Elo + match type + location, weighted by recency, for a single team - Mexico - as a proof of concept. The productionized version of this cell (with categorical-level guards, scaling and NaN handling) lives in model_utils.py.
from enum import Enum
class Workflow(Enum):
    """Prediction strategies selectable in ``predict_match``."""

    NAIVE = 1
    ATK_DEF = 2
    HIST = 3
from elo_utils import get_elo_atk_def
import statsmodels.api as sm
import statsmodels.formula.api as smf
# GROUP_A is expected to be defined before this cell runs, e.g.:
# GROUP_A = [('Mexico', 'MX'), ('South_Africa', 'ZA'), ('South_Korea', 'KR'), ('Czechia', 'CZ')]

# Lower bound (exclusive) on match dates used for the Mexico prototype.
from_date = wrc_2016_first_match_dt

# One converted frame per team, in GROUP_A order.
mx_df, za_df, kr_df, cz_df = load_group(GROUP_A)

# Expose the index as a regular column, then keep only matches after from_date.
mx_df = mx_df.reset_index().loc[lambda frame: frame['date'] > from_date]
# create the weighted column
def add_time_decay_weights(
        df: pd.DataFrame,
        date_col: str = "date",
        half_life_days: float = 365 * 2
) -> pd.DataFrame:
    """
    Add exponential time-decay weights to a copy of a dataframe.

    More recent matches get higher weights. The reference point is the most
    recent date present in the data, not today's date, so the newest match
    always has weight 1.

    Parameters
    ----------
    df : DataFrame
        Match data with a date column. It is not modified.
    date_col : str
        Column containing match dates (anything ``pd.to_datetime`` accepts).
    half_life_days : float
        How fast old matches lose importance. Must be strictly positive.

        Example:
        - 365*2 -> weight halves every 2 years
        - smaller -> faster adaptation

    Returns
    -------
    DataFrame
        Copy of ``df`` with ``date_col`` converted to datetime and two extra
        columns: ``days_since`` (whole days between each match and the most
        recent one) and ``time_weight`` (``0.5 ** (days_since / half_life_days)``).

    Raises
    ------
    ValueError
        If ``half_life_days`` is not strictly positive.
    """
    # A zero half-life divides by zero and a negative one makes old matches
    # weigh more than recent ones; written this way the check also rejects NaN.
    if not half_life_days > 0:
        raise ValueError(
            f"half_life_days must be a positive number, got {half_life_days!r}")

    df = df.copy()

    # Ensure datetime format
    df[date_col] = pd.to_datetime(df[date_col])

    # Use the most recent match as the reference point
    max_date = df[date_col].max()

    # Time difference in whole days
    df["days_since"] = (max_date - df[date_col]).dt.days

    # Exponential decay:
    #   weight = exp(-lambda * t), with lambda = ln(2) / half_life
    decay_rate = np.log(2) / half_life_days
    df["time_weight"] = np.exp(-decay_rate * df["days_since"])
    return df
def train_poisson_model(team_df: pd.DataFrame):
    """Fit a recency-weighted Poisson GLM of goals scored for one team.

    Time-decay weights are added with ``add_time_decay_weights`` and passed to
    the GLM as variance weights, so recent matches influence the fit more than
    old ones. Every column other than the target, the weight column and
    ``'date'`` is used as a regressor; this includes the ``days_since`` column
    that ``add_time_decay_weights`` creates.

    Parameters
    ----------
    team_df : DataFrame
        One row per match with a ``'date'`` column, a ``'goals_converted'``
        column (the target) and the regressor columns. It is not modified.

    Returns
    -------
    The fitted statsmodels GLM results object.
    """
    weight_col = 'time_weight'
    dep_var = 'goals_converted'

    # add_time_decay_weights works on a copy, so team_df is left untouched.
    weighted_df = add_time_decay_weights(team_df, date_col="date")

    # 'date' is left out because days_since already carries the same
    # information (the original note mentions a possible VIF check).
    excluded = {dep_var, weight_col, 'date'}
    ind_vars = ' + '.join(col for col in weighted_df.columns if col not in excluded)
    formula = f'{dep_var} ~ {ind_vars}'

    model = smf.glm(
        formula=formula,
        data=weighted_df,
        family=sm.families.Poisson(),
        # Previously the weights were computed but never handed to the model,
        # so every match counted equally.
        var_weights=weighted_df[weight_col].to_numpy(),
    ).fit()
    return model
def predict_match(
        team_1: pd.DataFrame,
        team_2: pd.DataFrame,
        workflow: Workflow = Workflow.NAIVE,
        team_1_name: str = 'Mexico',
        team_2_name: str = 'South Africa',
):
    """
    With two dataframes, predicts the result of the match.

    A Poisson model is trained per team with ``train_poisson_model``. Each
    model then predicts from the opposing team's most recent match row, with
    ``days_since`` set to 0.

    With ``Workflow.ATK_DEF`` the two predictions are additionally scaled by
    the attack/defence ratios returned by ``get_elo_atk_def`` for the two team
    names. Any other workflow value returns the unscaled predictions.

    :param team_1: pandas dataframe with the first team's matches
    :param team_2: pandas dataframe with the second team's matches
    :param workflow: prediction strategy, defaults to ``Workflow.NAIVE``
    :param team_1_name: name passed to ``get_elo_atk_def`` for team 1; only
        used by ``Workflow.ATK_DEF``. The default keeps the previously
        hard-coded value.
    :param team_2_name: name passed to ``get_elo_atk_def`` for team 2; only
        used by ``Workflow.ATK_DEF``. The default keeps the previously
        hard-coded value.
    :return: tuple (goals_1, goals_2)
    """
    dep_var = 'goals_converted'

    # reset_index returns new frames, so the caller's data is never mutated.
    team_1_copy = team_1.reset_index()
    team_2_copy = team_2.reset_index()

    model_1 = train_poisson_model(team_1_copy)
    model_2 = train_poisson_model(team_2_copy)

    # Latest match of each team, evaluated "as of now" (days_since = 0).
    current_2 = (team_2_copy.sort_values(by='date', ascending=False)
                 .head(1)
                 .reset_index(drop=True))
    current_2['days_since'] = 0
    current_1 = (team_1_copy.sort_values(by='date', ascending=False)
                 .head(1)
                 .reset_index(drop=True))
    current_1['days_since'] = 0

    # NOTE(review): each model is fed the *other* team's latest row as-is, so
    # the regressors describe that team's last match rather than this
    # fixture - confirm this is the intended "naive" behaviour.
    # .iloc[0] selects the single prediction by position, independent of the
    # index labels.
    goals_1 = model_1.predict(current_2.drop([dep_var, 'date'], axis=1)).iloc[0]
    goals_2 = model_2.predict(current_1.drop([dep_var, 'date'], axis=1)).iloc[0]

    if workflow == Workflow.ATK_DEF:
        team_1_atk, team_1_def = get_elo_atk_def(team_1_name)
        team_2_atk, team_2_def = get_elo_atk_def(team_2_name)
        return (goals_1 * (team_1_atk / team_2_def),
                goals_2 * (team_2_atk / team_1_def))

    return goals_1, goals_2
# print("NAIVE")
# goals_mx, goals_za = predict_match(mx_df, za_df)
# print(f"mx goals: {goals_mx} za goals: {goals_za}")
# goals_kr, goals_cz = predict_match(kr_df, cz_df)
# print(f"kr goals: {goals_kr} cz goals: {goals_cz}")
#
# print("ATK DEF")
# goals_mx, goals_za = predict_match(mx_df, za_df, workflow=Workflow.ATK_DEF)
# print(f"mx goals: {goals_mx} za goals: {goals_za}")
# goals_kr, goals_cz = predict_match(kr_df, cz_df, workflow=Workflow.ATK_DEF)
# print(f"kr goals: {goals_kr} cz goals: {goals_cz}")