Source code for pyTSPA.data

import pandas as pd
import os
from typing import Literal

def load_match_data(filepath: str) -> pd.DataFrame:
    """
    Loads match data from a CSV or Excel file into a DataFrame.

    The reader is chosen from the file extension, compared case-insensitively
    (so "Matches.CSV" is read as CSV).

    Args:
        filepath (str): path to the CSV (.csv) or Excel (.xlsx, .xls) file

    Returns:
        pd.DataFrame: loaded match data

    Raises:
        ValueError: if file extension is unsupported or loading fails
            (the underlying error is attached as ``__cause__``)
    """
    # Extension -> pandas reader. Anything not listed here is rejected.
    readers = {
        ".csv": pd.read_csv,
        ".xlsx": pd.read_excel,
        ".xls": pd.read_excel,
    }

    try:
        # os.fspath accepts str and path-like objects; it raises TypeError for
        # anything else, which we report as ValueError like every other failure.
        _, ext = os.path.splitext(os.fspath(filepath))
    except TypeError as e:
        raise ValueError(f"Failed to load file '{filepath}': {e}") from e
    ext = ext.lower()

    # Validate the extension OUTSIDE the loading try-block so this error is not
    # caught and re-wrapped as a "Failed to load file" error.
    reader = readers.get(ext)
    if reader is None:
        raise ValueError(f"Unsupported file format: {ext}")

    try:
        return reader(filepath)
    except Exception as e:
        # Normalize every loading problem (missing file, parse error, missing
        # Excel engine, ...) to ValueError while keeping the original cause.
        raise ValueError(f"Failed to load file '{filepath}': {e}") from e
from typing import Literal import pandas as pd
def clean_data(
    df: pd.DataFrame,
    missing_strategy: Literal["fill", "drop", "none"] = "fill",
) -> pd.DataFrame:
    """
    Cleans match data: handles missing values and converts date columns.

    Every column whose name contains "date" (case-insensitive) is converted
    with ``pd.to_datetime(..., errors="coerce")``, so unparseable entries
    become NaT. The input DataFrame is not modified; a cleaned copy is
    returned.

    Args:
        df (pd.DataFrame): raw data to be cleaned
        missing_strategy (str): strategy for handling missing values
            - "fill": fill numeric missing values with column mean (default)
            - "drop": drop rows with a missing value in at least one column
            - "none": leave missing values untouched

    Returns:
        pd.DataFrame: cleaned DataFrame

    Raises:
        ValueError: if an unknown missing_strategy is given
    """
    # Validate first so an invalid call fails before any work is done.
    if missing_strategy not in ("fill", "drop", "none"):
        raise ValueError(f"Invalid missing_strategy: '{missing_strategy}'. Use 'fill', 'drop', or 'none'.")

    # Work on a copy: the caller's DataFrame must not change as a side effect.
    cleaned = df.copy()

    # Column labels are not guaranteed to be strings (e.g. default integer
    # labels), so only string labels are checked for the "date" marker.
    date_cols = [
        col for col in cleaned.columns
        if isinstance(col, str) and "date" in col.lower()
    ]
    for col in date_cols:
        try:
            cleaned[col] = pd.to_datetime(cleaned[col], errors="coerce")
        except Exception as e:
            # Best effort: a column that cannot be converted is left as it was.
            print(f"Warning: failed to parse date column '{col}': {e}")

    if missing_strategy == "fill":
        for col in cleaned.select_dtypes(include=["number"]).columns:
            column = cleaned[col]
            if column.isnull().any():
                # Cast to float so the mean can be stored even in integer-like columns.
                cleaned[col] = column.astype(float).fillna(column.mean())
    elif missing_strategy == "drop":
        cleaned = cleaned.dropna()
    # "none": nothing to do.

    return cleaned
def data_profiling(df: pd.DataFrame) -> None:
    """
    Prints basic information about the DataFrame: column names, types,
    number of rows and columns, missing values and basic statistics.

    For object/category columns, up to the 10 most frequent values are
    listed, with each value label truncated to 20 characters.

    Args:
        df (pd.DataFrame): the DataFrame to analyze

    Returns:
        None: the function only prints information to the console
    """
    print("Basic information about the DataFrame:\n")

    # Column names and types
    print("Columns and their types:")
    print(df.dtypes)
    print("\n")

    # Number of rows
    print(f"Number of rows: {df.shape[0]}")
    # Number of columns
    print(f"Number of columns: {df.shape[1]}")
    print("\n")

    # Missing values
    missing_values = df.isnull().sum()
    print("Missing values (per column):")
    if missing_values.sum() == 0:
        print("No missing values found.\n")
    else:
        # Only list the columns that actually have gaps.
        print(missing_values[missing_values > 0], "\n")

    # Basic statistics for numeric columns
    print("Basic statistics for numeric columns:")
    if df.shape[1] == 0:
        # DataFrame.describe() raises ValueError on a frame without columns.
        print("No columns to describe.")
    else:
        print(df.describe())
    print("\n")

    # Basic statistics for categorical columns
    categorical_columns = df.select_dtypes(include=["object", "category"]).columns
    if len(categorical_columns) > 0:
        print("Basic statistics for categorical columns (showing up to top 10 most frequent values):")
        for col in categorical_columns:
            print(f"\n {col} (unique: {df[col].nunique()}):")
            top_values = df[col].value_counts().head(10)
            for idx, val in top_values.items():
                # Truncate and left-pad the label so the counts line up.
                print(f" - {str(idx)[:20]:<20} {val}")
        print("\n")
if __name__ == "__main__":
    # This submodule is meant to be imported; when executed directly it only
    # prints a short hint on how to use it.
    for message in (
        "Data submodule of our pyTSPA toolbox",
        "Does nothing when run, please import it in your code, for example: 'import pyTSPA.data'",
    ):
        print(message)