# Source code for pyTSPA.metrics

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler


[docs]
def result_stats(df: pd.DataFrame) -> dict:
    """
    Tallies full-time results: how many matches ended as a home win, a draw, or an away win.

    Args:
        df (pd.DataFrame): match data; must contain the column 'FTR' with the codes
            - 'H' for Home Win
            - 'D' for Draw
            - 'A' for Away Win

    Returns:
        dict: counts keyed by 'Home Wins', 'Draws' and 'Away Wins' (in that order);
            a code that never occurs in 'FTR' is reported as 0

    Raises:
        ValueError: if the DataFrame has no 'FTR' column
    """
    if 'FTR' not in df.columns:
        raise ValueError("Column 'FTR' (full-time result) not found.")

    # Output label -> result code used in the 'FTR' column.
    code_for_label = {
        'Home Wins': 'H',
        'Draws': 'D',
        'Away Wins': 'A',
    }
    tally = df['FTR'].value_counts().to_dict()
    return {label: tally.get(code, 0) for label, code in code_for_label.items()}



[docs]
def team_performance(df: pd.DataFrame, team_name: str) -> dict:
    """
    Builds the league-table line for one team from every match it appears in.

    Each match where the team is the home or the away side contributes to the
    win/draw/loss tally and to the goals scored and conceded. A win is worth
    three points and a draw one point.

    Args:
        df (pd.DataFrame): match data with the columns 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR'
        team_name (str): team whose record is computed

    Returns:
        dict: the team's record with the keys
            'Team', 'Matches', 'Wins', 'Draws', 'Losses', 'Goals For',
            'Goals Against', 'Goal Difference', 'Points'

    Raises:
        ValueError: if any of 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR' is missing
    """
    needed = ('HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR')
    absent = [name for name in needed if name not in df.columns]
    if absent:
        raise ValueError(f"Missing required columns: {absent}")

    involved = (df['HomeTeam'] == team_name) | (df['AwayTeam'] == team_name)
    matches = df[involved]

    tally = {'win': 0, 'draw': 0, 'loss': 0}
    scored = 0
    conceded = 0

    fixtures = zip(matches['HomeTeam'], matches['FTHG'], matches['FTAG'], matches['FTR'])
    for home_side, home_goals, away_goals, outcome in fixtures:
        if home_side == team_name:
            own_goals, opp_goals, winning_code = home_goals, away_goals, 'H'
        else:  # the team played away
            own_goals, opp_goals, winning_code = away_goals, home_goals, 'A'

        # Anything that is neither the team's winning code nor a draw counts as a loss.
        if outcome == winning_code:
            tally['win'] += 1
        elif outcome == 'D':
            tally['draw'] += 1
        else:
            tally['loss'] += 1

        scored += own_goals
        conceded += opp_goals

    return {
        'Team': team_name,
        'Matches': len(matches),
        'Wins': tally['win'],
        'Draws': tally['draw'],
        'Losses': tally['loss'],
        'Goals For': scored,
        'Goals Against': conceded,
        'Goal Difference': scored - conceded,
        'Points': 3 * tally['win'] + tally['draw']
    }



[docs]
def get_all_teams(df: pd.DataFrame) -> np.ndarray:
    """
    Extracts every distinct team name found in the 'HomeTeam' and 'AwayTeam' columns.

    Names are converted to strings and the union of both columns is returned.
    Missing entries (NaN/None, e.g. from blank rows in the input) are ignored
    and never reported as a team.

    Args:
        df (pd.DataFrame): match data with 'HomeTeam' and 'AwayTeam' columns

    Returns:
        np.ndarray: a sorted array of unique team names

    Raises:
        ValueError: if either 'HomeTeam' or 'AwayTeam' columns are missing
    """
    required_columns = ['HomeTeam', 'AwayTeam']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    # Drop missing values *before* the string cast: casting first would turn
    # NaN/None into the literal names 'nan'/'None' and create a phantom team.
    home_teams = df['HomeTeam'].dropna().astype(str).unique()
    away_teams = df['AwayTeam'].dropna().astype(str).unique()

    # np.union1d returns the sorted, de-duplicated union of both arrays.
    return np.union1d(home_teams, away_teams)



[docs]
def each_team_performance(df: pd.DataFrame) -> pd.DataFrame:
    """
    Builds a league table: one performance summary row per team in the dataset.

    The teams come from `get_all_teams()` and each row is the dictionary
    returned by `team_performance()` for that team.

    Args:
        df (pd.DataFrame): match data with 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR' columns

    Returns:
        pd.DataFrame: one row per team, ordered by 'Points' from highest to lowest,
        with a fresh 0-based index and the columns
            - 'Team': Team name
            - 'Matches': Matches played
            - 'Wins': Wins
            - 'Draws': Draws
            - 'Losses': Losses
            - 'Goals For': Goals scored
            - 'Goals Against': Goals conceded
            - 'Goal Difference': Goal difference
            - 'Points': Points accumulated
    """
    rows = []
    for name in get_all_teams(df):
        rows.append(team_performance(df, name))

    table = pd.DataFrame(rows)
    ranked = table.sort_values(by='Points', ascending=False)
    return ranked.reset_index(drop=True)



[docs]
def win_percentage(df: pd.DataFrame, team_name: str) -> float:
    """
    Calculates the share of a team's matches that it won.

    A match counts as a win when the team is the home side and 'FTR' is 'H',
    or the team is the away side and 'FTR' is 'A'.

    Args:
        df (pd.DataFrame): DataFrame containing match data with 'HomeTeam', 'AwayTeam', and 'FTR' columns
        team_name (str): the name of the team to calculate win percentage for

    Returns:
        float: wins divided by matches played, a value between 0 and 1;
            0.0 when the team does not appear in any match

    Raises:
        ValueError: if any required column is missing (the message lists the missing ones)
    """
    required_columns = ['HomeTeam', 'AwayTeam', 'FTR']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        # Report only what is actually absent, not the whole requirement list.
        raise ValueError(f"Missing required columns: {missing_columns}")

    is_home = df['HomeTeam'] == team_name
    is_away = df['AwayTeam'] == team_name

    total_matches = int((is_home | is_away).sum())
    if total_matches == 0:
        return 0.0

    home_wins = int((is_home & (df['FTR'] == 'H')).sum())
    away_wins = int((is_away & (df['FTR'] == 'A')).sum())
    return (home_wins + away_wins) / total_matches



[docs]
def each_win_percentage(df: pd.DataFrame) -> pd.DataFrame:
    """
    Tabulates the win percentage of every team appearing in the match data.

    Teams are collected from the 'HomeTeam' column followed by the 'AwayTeam'
    column, in order of first appearance, and `win_percentage()` is evaluated
    once per team.

    Args:
        df (pd.DataFrame): DataFrame containing match data with 'HomeTeam', 'AwayTeam', and 'FTR' columns

    Returns:
        pd.DataFrame: one row per team with the columns 'Team' and 'WinPercentage'
    """
    every_side = pd.concat([df['HomeTeam'], df['AwayTeam']])
    records = [(name, win_percentage(df, name)) for name in every_side.unique()]
    return pd.DataFrame(records, columns=['Team', 'WinPercentage'])



[docs]
def pythagorean_expectation(df: pd.DataFrame, team_name: str, exponent: float = 2.0) -> float:
    """
    Calculates the Pythagorean Expectation for a specified team using match data.

    With GF the goals the team scored and GA the goals it conceded over all of
    its matches, the value is ``GF**exponent / (GF**exponent + GA**exponent)``.

    Args:
        df (pd.DataFrame): DataFrame containing match data with 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG' columns
        team_name (str): the name of the team to calculate the Pythagorean Expectation for
        exponent (float): exponent value for the calculation, default is 2.0

    Returns:
        float: the Pythagorean Expectation as a value between 0 and 1;
            0.0 when goals scored plus goals conceded is zero (which includes
            a team that does not appear in the data)

    Raises:
        ValueError: if any required column is missing (the message lists the missing ones)
    """
    required_columns = ['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        # Report only what is actually absent, not the whole requirement list.
        raise ValueError(f"Missing required columns: {missing_columns}")

    # Build each side's row mask once and reuse it for goals scored and conceded.
    is_home = df['HomeTeam'] == team_name
    is_away = df['AwayTeam'] == team_name

    # skipna=False: a missing score makes the result NaN instead of silently
    # dropping that match from the totals.
    goals_for = float(
        df.loc[is_home, 'FTHG'].sum(skipna=False) + df.loc[is_away, 'FTAG'].sum(skipna=False)
    )
    goals_against = float(
        df.loc[is_home, 'FTAG'].sum(skipna=False) + df.loc[is_away, 'FTHG'].sum(skipna=False)
    )

    if goals_for + goals_against == 0:
        return 0.0

    gf_exp = goals_for ** exponent
    ga_exp = goals_against ** exponent
    return gf_exp / (gf_exp + ga_exp)



[docs]
def each_pythagorean_expectation(df: pd.DataFrame, exponent: float = 2.0) -> pd.DataFrame:
    """
    Tabulates the Pythagorean Expectation of every team appearing in the match data.

    Teams are collected from the 'HomeTeam' column followed by the 'AwayTeam'
    column, in order of first appearance, and `pythagorean_expectation()` is
    evaluated once per team with the given exponent.

    Args:
        df (pd.DataFrame): DataFrame containing match data with 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG' columns
        exponent (float): exponent forwarded to the calculation, default is 2.0

    Returns:
        pd.DataFrame: one row per team with the columns 'Team' and 'PythagoreanExpectation'
    """
    every_side = pd.concat([df['HomeTeam'], df['AwayTeam']])
    records = [
        (name, pythagorean_expectation(df, name, exponent))
        for name in every_side.unique()
    ]
    return pd.DataFrame(records, columns=['Team', 'PythagoreanExpectation'])



[docs]
def logistic_regression_prediction(df: pd.DataFrame) -> dict:
    """
    Trains and evaluates a logistic regression classifier of match outcomes (home win / draw / away win).

    Per-team win percentage and Pythagorean Expectation are computed from the
    whole DataFrame and attached to every match for both the home and the away
    side. The five resulting features are standardized, the classes are
    balanced with SMOTE, and the resampled data is split 70/30 into a training
    and a test set on which the model is scored.

    Args:
        df (pd.DataFrame): match data with the columns 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG' and 'FTR';
            every 'FTR' value must be 'H', 'D' or 'A' (any other value maps to NaN and the
            integer cast of the target fails)

    Returns:
        dict: a dictionary with the keys
            - 'accuracy': accuracy on the test split
            - 'confusion_matrix': nested list, rows/columns ordered as labels [2, 1, 0]
              (home win, draw, away win)
            - 'predictions': DataFrame of the standardized test features plus 'Actual' and 'Predicted'
            - 'model': the fitted LogisticRegression
            - 'scaler': the fitted StandardScaler; the model was trained on standardized
              features, so new feature rows must go through ``scaler.transform`` before
              being passed to ``model.predict`` / ``model.predict_proba``
    """
    feature_columns = [
        "Home_WinPercentage",
        "Away_WinPercentage",
        "Home_PythagoreanExpectation",
        "Away_PythagoreanExpectation",
        "GoalDifference",
    ]

    wpc = each_win_percentage(df)
    pyth = each_pythagorean_expectation(df)

    # Attach each team's season-level stats to the match rows, once for the
    # home side and once for the away side.
    team_stats = pd.merge(wpc, pyth, on="Team", how="left")
    df = df.merge(team_stats, left_on="HomeTeam", right_on="Team", how="left").rename(
        columns={
            "WinPercentage": "Home_WinPercentage",
            "PythagoreanExpectation": "Home_PythagoreanExpectation"
        }
    )
    df = df.merge(team_stats, left_on="AwayTeam", right_on="Team", how="left").rename(
        columns={
            "WinPercentage": "Away_WinPercentage",
            "PythagoreanExpectation": "Away_PythagoreanExpectation"
        }
    )

    # Despite its name this column is the difference in Pythagorean Expectation,
    # not a difference in goals.
    df['GoalDifference'] = df['Home_PythagoreanExpectation'] - df['Away_PythagoreanExpectation']

    # Target variable: 2 for Home Win, 1 for Draw, 0 for Away Win
    df["Target"] = df["FTR"].map({"H": 2, "D": 1, "A": 0})

    # Features and target
    X = df[feature_columns]
    y = df["Target"].astype(int)

    # Standardize the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Apply SMOTE for oversampling the minority classes
    # NOTE(review): scaling and SMOTE both run before the train/test split, so
    # the test split contains synthetic samples and statistics fitted on all
    # rows; the reported accuracy is therefore likely optimistic.
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

    # Logistic Regression model with regularization
    model = LogisticRegression(solver='lbfgs', max_iter=1000, C=1.0)
    model.fit(X_train, y_train)

    # Predictions and evaluation
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions, labels=[2, 1, 0])

    # Create a DataFrame with predictions and actual results
    prediction_df = pd.DataFrame(X_test, columns=feature_columns)
    prediction_df["Actual"] = y_test.values
    prediction_df["Predicted"] = predictions

    return {
        "accuracy": accuracy,
        "confusion_matrix": conf_matrix.tolist(),
        "predictions": prediction_df.reset_index(drop=True),
        "model": model,
        # Returned so callers can standardize new inputs exactly like the training data.
        "scaler": scaler
    }



[docs]
def predict_match_outcome(home_team: str, away_team: str, model: LogisticRegression, df: pd.DataFrame, scaler=None) -> dict:
    """
    Predicts the outcome of a specific match between two teams using a trained logistic regression model.

    The feature row is built from each team's win percentage and Pythagorean
    Expectation computed over ``df``: the two win percentages, the two
    expectations, and the home-minus-away expectation difference
    ('GoalDifference').

    Args:
        home_team (str): name of the home team
        away_team (str): name of the away team
        model (LogisticRegression): trained model whose classes are 2 (home win), 1 (draw), 0 (away win)
        df (pd.DataFrame): DataFrame containing the match data
        scaler: optional fitted transformer (e.g. the StandardScaler that was applied to the
            training features). When given, the feature row is passed through
            ``scaler.transform`` before prediction. Pass it whenever the model was trained on
            standardized features; with the default ``None`` the raw features are used.

    Returns:
        dict: 'predicted_outcome' ("Home Win", "Draw" or "Away Win") and 'probabilities',
            a dict with the probability of each outcome rounded to 3 decimals
            (0.0 for an outcome the model has no class for)

    Raises:
        ValueError: if either team does not appear in ``df``
    """
    wpc = each_win_percentage(df)
    pyth = each_pythagorean_expectation(df)
    team_stats = pd.merge(wpc, pyth, on="Team", how="left")

    home_stats = team_stats[team_stats["Team"] == home_team]
    away_stats = team_stats[team_stats["Team"] == away_team]

    if home_stats.empty or away_stats.empty:
        raise ValueError("One or both teams not found in the dataset!")

    home_wpc = home_stats["WinPercentage"].values[0]
    away_wpc = away_stats["WinPercentage"].values[0]
    home_pyth = home_stats["PythagoreanExpectation"].values[0]
    away_pyth = away_stats["PythagoreanExpectation"].values[0]

    X_new = pd.DataFrame({
        "Home_WinPercentage": [home_wpc],
        "Away_WinPercentage": [away_wpc],
        "Home_PythagoreanExpectation": [home_pyth],
        "Away_PythagoreanExpectation": [away_pyth],
        "GoalDifference": [home_pyth - away_pyth]
    })

    # A model fitted on standardized inputs gives meaningless output on raw
    # values, so apply the training-time scaling when the caller supplies it.
    features = scaler.transform(X_new) if scaler is not None else X_new

    probabilities = model.predict_proba(features)[0]
    prediction = model.predict(features)[0]

    outcome_map = {2: "Home Win", 1: "Draw", 0: "Away Win"}
    predicted_outcome = outcome_map[prediction]

    # predict_proba columns follow model.classes_; look them up by label rather
    # than by position so a model missing one class is not misread.
    probability_of = dict(zip(model.classes_, probabilities))

    return {
        "predicted_outcome": predicted_outcome,
        "probabilities": {
            "Home Win": round(float(probability_of.get(2, 0.0)), 3),
            "Draw": round(float(probability_of.get(1, 0.0)), 3),
            "Away Win": round(float(probability_of.get(0, 0.0)), 3)
        }
    }



[docs]
def season_half_prediction(df: pd.DataFrame) -> pd.DataFrame:
    """
    Predicts second-half results from first-half Pythagorean Expectation.

    The matches are sorted by 'Date' and cut at the midpoint. Pythagorean
    Expectation is computed per team from the first half only; for every
    second-half match the side with the higher value is predicted to win, and
    anything else (equal values, or a value missing because the team did not
    play in the first half) is predicted as a draw.

    Args:
        df (pd.DataFrame): match data with 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG' and 'FTR' columns.
            NOTE(review): the chronological split relies on 'Date' sorting correctly — presumably
            it should already be a datetime column rather than text; confirm with callers.

    Returns:
        pd.DataFrame: the second-half matches with the columns 'Date', 'HomeTeam', 'AwayTeam', 'FTR',
            'Home_PythagoreanExpectation', 'Away_PythagoreanExpectation' and
            'PredictedOutcome' ("Home Win", "Away Win" or "Draw")
    """
    ordered = df.sort_values(by="Date")
    cut = len(ordered) // 2
    earlier, later = ordered.iloc[:cut], ordered.iloc[cut:]

    # Team strength is estimated from the first half only.
    expectations = each_pythagorean_expectation(earlier)

    # Attach the expectation to the second-half matches, home side first, then away side.
    for side in ("Home", "Away"):
        later = later.merge(
            expectations, left_on=f"{side}Team", right_on="Team", how="left"
        ).rename(columns={"PythagoreanExpectation": f"{side}_PythagoreanExpectation"})

    home_strength = later["Home_PythagoreanExpectation"]
    away_strength = later["Away_PythagoreanExpectation"]

    # Comparisons involving NaN are False both ways, so such rows fall through to "Draw".
    away_or_draw = np.where(home_strength < away_strength, "Away Win", "Draw")
    later["PredictedOutcome"] = np.where(home_strength > away_strength, "Home Win", away_or_draw)

    output_columns = [
        "Date",
        "HomeTeam",
        "AwayTeam",
        "FTR",
        "Home_PythagoreanExpectation",
        "Away_PythagoreanExpectation",
        "PredictedOutcome",
    ]
    return later[output_columns]