95 lines
4.0 KiB
Python
Executable File
95 lines
4.0 KiB
Python
Executable File
|
|
import os
|
|
import sys
|
|
import psycopg2
|
|
import pandas as pd
|
|
from datetime import datetime
|
|
|
|
# Database connection string, read from the DATABASE_URL environment variable.
# SECURITY NOTE(review): the fallback below embeds a plaintext password in
# source control. It is kept only so existing invocations keep working;
# prefer always supplying DATABASE_URL and rotating this credential.
DSN = os.getenv(
    'DATABASE_URL',
    'postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db',
)

# Drop everything from the first '?' onward so only the bare URL is handed to
# psycopg2. Presumably this strips ORM-specific query parameters that libpq
# would reject -- verify against how DATABASE_URL is populated.
DSN = DSN.partition('?')[0]
|
|
|
|
def _fetch_count(cursor, sql):
    """Execute *sql* and return the first column of the first result row."""
    cursor.execute(sql)
    return cursor.fetchone()[0]


def _pct(part, total):
    """Return *part* as a percentage of *total* (0.0 when *total* is zero)."""
    return part / total * 100 if total else 0.0


def diagnose():
    """Print a coverage report for finished football matches.

    Connects to the database named by the module-level ``DSN`` and prints:

    * the number of finished football matches (``score_home IS NOT NULL``);
    * how many distinct ``match_id`` values appear in ``match_team_stats``,
      ``match_player_participation`` and ``match_officials``, each shown as a
      percentage of the finished-football total;
    * how many finished football matches appear in all three tables;
    * the same three coverage figures restricted to the most recent finished
      football matches (up to 1000, ordered by ``mst_utc`` descending).

    Returns ``None``. Any exception is reported on stdout as ``Error: ...``
    rather than propagated; the connection is always closed if it was opened.
    """
    # Bind before the try so the finally clause is safe even when connect()
    # itself raises (otherwise the real error is masked by UnboundLocalError).
    conn = None
    try:
        conn = psycopg2.connect(DSN)
        # The context manager closes the cursor when the block exits.
        with conn.cursor() as cursor:
            print("🔍 DIAGNOSTIC REPORT: AI Data Coverage")
            print("=======================================")

            # 1. Total Football Matches (Finished)
            total_matches = _fetch_count(
                cursor,
                "SELECT COUNT(*) FROM matches WHERE sport='football' AND score_home IS NOT NULL",
            )
            print(f"Total Finished Football Matches: {total_matches:,}")

            # Nothing to compare against; also avoids dividing by zero below.
            if total_matches == 0:
                print("❌ No matches found!")
                return

            # NOTE(review): sections 2-4 count every match_id in the detail
            # tables, not only finished football matches, so a percentage can
            # exceed 100% if those tables hold other rows -- confirm intent.

            # 2. Stats Coverage (match_team_stats)
            stats_count = _fetch_count(
                cursor, "SELECT COUNT(DISTINCT match_id) FROM match_team_stats"
            )
            print(f"Matches with Team Stats: {stats_count:,} ({_pct(stats_count, total_matches):.1f}%)")

            # 3. Squad Coverage (match_player_participation)
            squad_count = _fetch_count(
                cursor, "SELECT COUNT(DISTINCT match_id) FROM match_player_participation"
            )
            print(f"Matches with Lineups (Squad): {squad_count:,} ({_pct(squad_count, total_matches):.1f}%)")

            # 4. Officials Coverage
            officials_count = _fetch_count(
                cursor, "SELECT COUNT(DISTINCT match_id) FROM match_officials"
            )
            print(f"Matches with Officials: {officials_count:,} ({_pct(officials_count, total_matches):.1f}%)")

            # 5. Overlap (Gold Standard Data)
            # INTERSECT yields distinct match ids present in every input, so
            # each match is counted once. (An earlier JOIN-based query whose
            # result was never read has been dropped.)
            overlap_count = _fetch_count(
                cursor,
                """
                SELECT COUNT(*) FROM (
                    SELECT id FROM matches WHERE sport='football' AND score_home IS NOT NULL
                    INTERSECT
                    SELECT match_id FROM match_team_stats
                    INTERSECT
                    SELECT match_id FROM match_player_participation
                    INTERSECT
                    SELECT match_id FROM match_officials
                ) as overlap
                """,
            )
            print(f"Matches with ALL Data (Golden): {overlap_count:,} ({_pct(overlap_count, total_matches):.1f}%)")

            print("\n🔍 RECENT DATA QUALITY (Last 1000 Matches)")
            print("==========================================")
            # Check last 1000 matches specifically. The size of the sample is
            # selected too, so percentages stay correct when fewer than 1000
            # finished matches exist.
            cursor.execute("""
                WITH recent AS (
                    SELECT id FROM matches
                    WHERE sport='football' AND score_home IS NOT NULL
                    ORDER BY mst_utc DESC LIMIT 1000
                )
                SELECT
                    (SELECT COUNT(*) FROM recent) as recent_total,
                    (SELECT COUNT(DISTINCT match_id) FROM match_team_stats WHERE match_id IN (SELECT id FROM recent)) as has_stats,
                    (SELECT COUNT(DISTINCT match_id) FROM match_player_participation WHERE match_id IN (SELECT id FROM recent)) as has_squad,
                    (SELECT COUNT(DISTINCT match_id) FROM match_officials WHERE match_id IN (SELECT id FROM recent)) as has_officials
            """)
            recent_total, has_stats, has_squad, has_officials = cursor.fetchone()
            print(f"Has Stats: {_pct(has_stats, recent_total):.1f}%")
            print(f"Has Lineups: {_pct(has_squad, recent_total):.1f}%")
            print(f"Has Officials: {_pct(has_officials, recent_total):.1f}%")

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and fall through.
        print(f"Error: {e}")
    finally:
        if conn is not None:
            conn.close()
|
|
|
|
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    diagnose()
|