diff --git a/Scripts/main_RF.py b/Scripts/main_RF.py index 79401a50a4a04ec9810d4ef0793e24def2deaf7e..1b58cd3f933899d431cd9dc98189ef008d94909f 100644 --- a/Scripts/main_RF.py +++ b/Scripts/main_RF.py @@ -8,13 +8,13 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report, confusion_matrix -# === 1. 读取数据 === + df = pd.read_csv("../Dataset/Linhac24-25_Sportlogiq.csv", encoding="utf-8") -# === 2. 创建目标变量 === + df['is_goal'] = (df['eventname'] == 'goal').astype(int) -# === 3. 选择特征 === +#选择特征 features = [ 'xg_allattempts', 'compiledgametime', 'period', 'scoredifferential', 'teamskatersonicecount', 'opposingteamskatersonicecount', @@ -25,17 +25,17 @@ features = [ df = df[features + ['is_goal']].copy() -# === 4. 数据预处理 === + df.fillna({'xg_allattempts': 0, 'compiledgametime': 0, 'scoredifferential': 0, 'teamskatersonicecount': 5, 'opposingteamskatersonicecount': 5, 'xadjcoord': 0, 'yadjcoord': 0}, inplace=True) -# 编码分类变量 + categorical_cols = ['teaminpossession', 'currentpossession', 'manpowersituation', 'ishomegame', 'playerprimaryposition', 'type', 'outcome'] df = pd.get_dummies(df, columns=categorical_cols, drop_first=True) -# === 5. 拆分数据集 === +#拆分数据集 X = df.drop(columns=['is_goal']) y = df['is_goal'] @@ -43,23 +43,23 @@ X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, stratify=y, random_state=42 ) -# === 6. 模型路径设置 === +#模型路径 model_dir = os.path.join(os.path.dirname(__file__), "../Model") os.makedirs(model_dir, exist_ok=True) model_path = os.path.join(model_dir, "RF1.pkl") -# === 7. 加载或训练模型 === + if os.path.exists(model_path): - print("✔ 模型已存在,正在加载模型...") + print("模型已存在,正在加载模型...") rf = joblib.load(model_path) else: - print("⚙️ 正在训练模型...") + print("正在训练模型...") rf = RandomForestClassifier(n_estimators=100, random_state=42) rf.fit(X_train, y_train) joblib.dump(rf, model_path) - print(f"✅ 模型已保存到: {model_path}") + print(f"模型已保存到: {model_path}") -# === 8. 模型评估 === +#模型评估 y_pred = rf.predict(X_test) print("\n=== Classification Report ===") @@ -68,7 +68,6 @@ print(classification_report(y_test, y_pred)) print("=== Confusion Matrix ===") print(confusion_matrix(y_test, y_pred)) -# === 9. 特征重要性可视化 === importances = pd.Series(rf.feature_importances_, index=X.columns) top_importances = importances.sort_values(ascending=False).head(20) @@ -79,9 +78,8 @@ plt.title("Top 20 Important Features for Predicting Goals") plt.xlabel("Feature Importance") plt.tight_layout() -# 👉 保存图表到指定位置(例如模型文件夹) plot_path = os.path.join("../Graph", "RF1.jpg") plt.savefig(plot_path, dpi=300, bbox_inches='tight') -print(f"📊 图表已保存到: {plot_path}") +print(f"图表已保存到: {plot_path}") plt.show() \ No newline at end of file diff --git a/Scripts/main_RF2.py b/Scripts/main_RF2.py index 29fd0452f3ed75f67f08f8f225622528b54791ac..a15f04991ce1808afb4cdab7750b367a636b2c1e 100644 --- a/Scripts/main_RF2.py +++ b/Scripts/main_RF2.py @@ -6,43 +6,37 @@ from sklearn.metrics import classification_report, confusion_matrix from sklearn.preprocessing import LabelEncoder import os -# 1. 读取数据 -df = pd.read_csv("../Dataset/Linhac24-25_Sportlogiq.csv", encoding="utf-8") # 修改为你的文件名和分隔符 -# 2. 目标列:是否为进球 +df = pd.read_csv("../Dataset/Linhac24-25_Sportlogiq.csv", encoding="utf-8") + df['is_goal'] = (df['eventname'] == 'goal').astype(int) -# 3. 删除无用或无关紧要的列(如gameid、playerid等) +#删除无用或无关紧要的列(如gameid、playerid等) drop_cols = ['gameid', 'playerid', 'eventname', 'compiledgametime'] df.drop(columns=drop_cols, inplace=True, errors='ignore') -# 4. 编码分类变量 categorical_cols = df.select_dtypes(include='object').columns for col in categorical_cols: df[col] = LabelEncoder().fit_transform(df[col].astype(str)) -# 5. 分离特征与标签 X = df.drop(columns=['is_goal']) y = df['is_goal'] -# 6. 分割训练集与测试集 + X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, stratify=y, random_state=42 ) - - -# 7. 建立模型,使用 class_weight 处理不平衡 +# 使用 class_weight 处理不平衡 model = RandomForestClassifier( n_estimators=100, random_state=42, - class_weight='balanced', # 核心修改 + class_weight='balanced', n_jobs=-1 ) model.fit(X_train, y_train) -#保存模型 joblib.dump(model, "../Model/RF2.pkl") print("模型已保存") @@ -51,7 +45,6 @@ print("模型已保存") model = joblib.load("../Model/RF2.pkl") -# 8. 预测与评估 y_pred = model.predict(X_test) print("=== Classification Report ===") @@ -60,7 +53,6 @@ print(classification_report(y_test, y_pred, digits=4)) print("\n=== Confusion Matrix ===") print(confusion_matrix(y_test, y_pred)) -# 9. 特征重要性(可选) importances = pd.Series(model.feature_importances_, index=X.columns) print("\n=== Top 10 Important Features ===") print(importances.sort_values(ascending=False).head(10)) diff --git a/Scripts/main_RF3_visual.py b/Scripts/main_RF3_visual.py index b61b56f6577410d7ac3192e401b51c6c4323bb03..61544b9eb67bbb67e47a7fc9587ddef968356a6c 100644 --- a/Scripts/main_RF3_visual.py +++ b/Scripts/main_RF3_visual.py @@ -7,31 +7,25 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import classification_report, confusion_matrix from sklearn.preprocessing import LabelEncoder -# 1. 读取数据 + df = pd.read_csv("../Dataset/Linhac24-25_Sportlogiq.csv", encoding="utf-8") -# 2. 设置目标列:是否为进球 df['is_goal'] = (df['eventname'] == 'goal').astype(int) -# 3. 删除无用列 drop_cols = ['gameid', 'playerid', 'eventname', 'compiledgametime'] df.drop(columns=drop_cols, inplace=True, errors='ignore') -# 4. 编码分类变量 categorical_cols = df.select_dtypes(include='object').columns for col in categorical_cols: df[col] = LabelEncoder().fit_transform(df[col].astype(str)) -# 5. 分离特征与标签 X = df.drop(columns=['is_goal']) y = df['is_goal'] -# 6. 划分训练集与测试集 X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, stratify=y, random_state=42 ) -# 7. 建立并训练随机森林模型(处理类别不平衡) model = RandomForestClassifier( n_estimators=100, random_state=42, @@ -40,7 +34,6 @@ model = RandomForestClassifier( ) model.fit(X_train, y_train) -# 9. 加载模型并进行评估 model = joblib.load('../Model/RF2.pkl') y_pred = model.predict(X_test) @@ -50,17 +43,15 @@ print(classification_report(y_test, y_pred, digits=4)) print("\n=== Confusion Matrix ===") print(confusion_matrix(y_test, y_pred)) -# 10. 特征重要性 +# 特征重要性 importances = pd.Series(model.feature_importances_, index=X.columns) print("\n=== Top 10 Important Features ===") print(importances.sort_values(ascending=False).head(10)) -# 11. 可视化进球位置 goal_data = df[df['is_goal'] == 1] -# ——【方式一】KDE 热力图(平滑)—— -# ——【方式一】KDE 热力图(平滑)—— +#KDE 热力图 plt.figure(figsize=(10, 6)) sns.kdeplot( x=goal_data['xadjcoord'], @@ -79,14 +70,13 @@ plt.ylabel('Y Coordinate') plt.grid(True) plt.legend() -# ✅ 保存 KDE 图像 plt.tight_layout() plt.savefig("../Graph/goal_heatmap_KDE.png", dpi=300, bbox_inches='tight') -print("✅ KDE 热力图已保存为: ../Model/goal_heatmap_KDE.png") +print("KDE 热力图已保存为: ../Model/goal_heatmap_KDE.png") plt.show() -# ——【方式二】Hexbin 热力图(像素块)—— +# Hexbin 热力图 plt.figure(figsize=(10, 6)) plt.hexbin( goal_data['xadjcoord'], @@ -105,9 +95,8 @@ plt.ylabel('Y Coordinate') plt.grid(True) plt.legend() -# ✅ 保存 Hexbin 图像 plt.tight_layout() plt.savefig("../Graph/goal_heatmap_Hexbin.png", dpi=300, bbox_inches='tight') -print("✅ Hexbin 热力图已保存为: ../Model/goal_heatmap_Hexbin.png") +print("Hexbin 热力图已保存为: ../Model/goal_heatmap_Hexbin.png") plt.show() diff --git a/Scripts/main_XGB.py b/Scripts/main_XGB.py index e264e59ee7d26ab8ef258cbe9af5870fb3ba7441..824cf6b41c353dbd16d35836b286cffa27757f49 100644 --- a/Scripts/main_XGB.py +++ b/Scripts/main_XGB.py @@ -7,16 +7,13 @@ import joblib from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report, confusion_matrix -plt.rcParams['font.family'] = 'Microsoft YaHei' # 设置中文字体为微软雅黑 -plt.rcParams['axes.unicode_minus'] = False # 正确显示负号 +plt.rcParams['font.family'] = 'Microsoft YaHei' +plt.rcParams['axes.unicode_minus'] = False -# === 1. 读取数据 === df = pd.read_csv("../Dataset/Linhac24-25_Sportlogiq.csv") -# === 2. 构造目标变量 === df["is_goal"] = (df["eventname"] == "goal").astype(int) -# === 3. 选择特征 === feature_cols = [ "xadjcoord", "yadjcoord", "compiledgametime", "scoredifferential", "teamskatersonicecount", "opposingteamskatersonicecount", @@ -24,37 +21,32 @@ feature_cols = [ "type", "manpowersituation", "playerprimaryposition", "outcome" ] -# === 4. 预处理 === df["xg_allattempts"] = df["xg_allattempts"].fillna(0) df["compiledgametime"] = df["compiledgametime"].fillna(0) # One-hot 编码 df_encoded = pd.get_dummies(df[feature_cols], drop_first=True) -# 特征 & 标签 +#特征 X = df_encoded y = df["is_goal"] -# === 5. 划分训练和测试集 === X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) -# === 6. 模型保存路径 === model_dir = os.path.join(os.path.dirname(__file__), "../Model") os.makedirs(model_dir, exist_ok=True) model_path = os.path.join(model_dir, "XGB.pkl") -# === 7. 加载或训练模型 === if os.path.exists(model_path): - print("✔ 已检测到已有模型,正在加载中...") + print("已检测到已有模型,正在加载中...") model = joblib.load(model_path) else: - print("⚙️ 正在训练 XGBoost 模型...") + print("正在训练 XGBoost 模型...") model = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False) model.fit(X_train, y_train) joblib.dump(model, model_path) - print(f"✅ 模型已保存到: {model_path}") + print(f"模型已保存到: {model_path}") -# === 8. 模型评估 === y_pred = model.predict(X_test) print("\n=== 分类报告 ===") print(classification_report(y_test, y_pred)) diff --git a/Scripts/main_XGB2.py b/Scripts/main_XGB2.py index 5f3d0e5c27e675cedefdd6a6094c75826b1e80bf..4c94e417482f4c7d0b5fd95b11af47431fbde028 100644 --- a/Scripts/main_XGB2.py +++ b/Scripts/main_XGB2.py @@ -6,17 +6,14 @@ import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report, confusion_matrix -# 设置中文字体 plt.rcParams['font.family'] = 'Microsoft YaHei' plt.rcParams['axes.unicode_minus'] = False -# 1. 读取数据 df = pd.read_csv("../Dataset/Linhac24-25_Sportlogiq.csv") -# 2. 构造目标变量 df["is_goal"] = (df["eventname"] == "goal").astype(int) -# 3. 选择特征列 + feature_cols = [ "xadjcoord", "yadjcoord", "compiledgametime", "scoredifferential", "teamskatersonicecount", "opposingteamskatersonicecount", @@ -24,41 +21,33 @@ feature_cols = [ "type", "manpowersituation", "playerprimaryposition", "outcome" ] -# 4. 预处理 df["xg_allattempts"] = df["xg_allattempts"].fillna(0) df["compiledgametime"] = df["compiledgametime"].fillna(0) -# 类别变量 one-hot 编码 df_encoded = pd.get_dummies(df[feature_cols], drop_first=True) -# 特征和标签 X = df_encoded y = df["is_goal"] -# 5. 划分训练/测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) -# 6. 训练模型 model = xgb.XGBClassifier(eval_metric='logloss') model.fit(X_train, y_train) -# ✅ 保存模型 #model.save_model("../Model/XGB2.pkl") -# ✅ 加载模型 loaded_model = xgb.XGBClassifier() loaded_model.load_model("../Model/XGB2.pkl") -# 7. 模型评估 y_pred = loaded_model.predict(X_test) print(classification_report(y_test, y_pred)) print("混淆矩阵:") print(confusion_matrix(y_test, y_pred)) -# 8. 获取预测概率 + y_prob = loaded_model.predict_proba(X_test)[:, 1] -# 9. 组合 df_test 用于分析 +#组合 df_test 用于分析 df_test = X_test.copy() df_test['is_goal'] = y_test df_test['goal_prob'] = y_prob @@ -67,7 +56,7 @@ df_test['yadjcoord'] = df.loc[df_test.index, 'yadjcoord'] df_test['manpowersituation'] = df.loc[df_test.index, 'manpowersituation'] df_test['scoredifferential'] = df.loc[df_test.index, 'scoredifferential'] -# 10. 进球位置与进球概率关系图 +#进球位置与进球概率关系图 plt.figure(figsize=(10, 6)) plt.scatter(df_test['xadjcoord'], df_test['yadjcoord'], c=df_test['goal_prob'], cmap='coolwarm', alpha=0.5) plt.colorbar(label="Goal Probability") @@ -82,7 +71,7 @@ plt.show() -# 11. 局势 vs 进球概率 +#局势 vs 进球概率 plt.figure(figsize=(12, 6)) sns.boxplot(x=df_test['manpowersituation'], y=df_test['goal_prob']) plt.title("Goal Probability Across Different Manpower Situations") @@ -94,7 +83,7 @@ plt.savefig("../Graph/goal_prob_by_manpowersituation.png", dpi=300, bbox_inches= plt.show() -# 12. 比分差 vs 进球概率 +# 比分差 vs 进球概率 plt.figure(figsize=(10, 6)) sns.lineplot(x=df_test['scoredifferential'], y=df_test['goal_prob'], marker='o') plt.title("Goal Probability vs Score Differential") diff --git a/Scripts/process_1.py b/Scripts/process_1.py index 84c1e8de038cbb27de6b38bf14a0f8c9c90609a4..840e7b71591406da3c7d48c8538015053b7e70a5 100644 --- a/Scripts/process_1.py +++ b/Scripts/process_1.py @@ -1,38 +1,35 @@ import pandas as pd import matplotlib.pyplot as plt import seaborn as sns -plt.rcParams['font.family'] = 'Microsoft YaHei' # 设置中文字体为微软雅黑 -plt.rcParams['axes.unicode_minus'] = False # 正确显示负号 -# 读取数据(假设你已经保存为 CSV 文件) +plt.rcParams['font.family'] = 'Microsoft YaHei' +plt.rcParams['axes.unicode_minus'] = False + df = pd.read_csv("Linhac24-25_Sportlogiq.csv") -# 显示基本信息 + print(df.info()) print(df.head()) - event_counts = df['eventname'].value_counts() print("事件类型统计:\n", event_counts) - team_possession_counts = df['teaminpossession'].value_counts() print("控球队出现次数:\n", team_possession_counts) - df['xg_allattempts'] = pd.to_numeric(df['xg_allattempts'], errors='coerce') xg_by_team = df.groupby('teamid')['xg_allattempts'].sum() print("每支球队的总xG:\n", xg_by_team) - player_actions = df['playerid'].value_counts().head(10) print("参与最多事件的前10位球员:\n", player_actions) - - success_rate = df['outcome'].value_counts(normalize=True) print("事件成功与失败比例:\n", success_rate) + + + print("\n简单总结:") print(f"总事件数: {len(df)}") print(f"总xG: {df['xg_allattempts'].sum():.2f}") diff --git a/Scripts/process_2.py b/Scripts/process_2.py index c5018e18c8967cbf8e777aed901baa9446631f54..c025647c9c7b385305ecf69758f11d22fa63d06e 100644 --- a/Scripts/process_2.py +++ b/Scripts/process_2.py @@ -10,12 +10,12 @@ df = pd.read_csv("Linhac24-25_Sportlogiq.csv") sns.set(style="whitegrid") -print("📌 数据基本信息:") +print("数据基本信息:") print(df.info()) -print("\n📈 描述性统计:") +print("\n描述性统计:") print(df.describe()) -print("\n📊 不同事件类型分布:") +print("\n不同事件类型分布:") print(df['eventname'].value_counts()) plt.figure(figsize=(12, 5))