import os
import sys
from datetime import datetime

import pandas as pd
import psycopg2

# Database Connection
# NOTE(review): the fallback DSN below embeds a username and password in the
# source. Prefer requiring DATABASE_URL and dropping the default -- confirm
# that nothing relies on the fallback before removing it.
DSN = os.getenv('DATABASE_URL', 'postgresql://suggestbet:SuGGesT2026SecuRe@localhost:15432/boilerplate_db')
# Keep only the part of the URL before the first "?" (presumably to drop
# ORM-style query parameters that psycopg2 would not accept -- TODO confirm).
DSN = DSN.split('?', 1)[0]


def _fetch_count(cursor, query):
    """Execute *query* and return the first column of its first row."""
    cursor.execute(query)
    return cursor.fetchone()[0]


def _pct(part, whole):
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is 0."""
    return part / whole * 100 if whole else 0.0


def diagnose():
    """Print a data-coverage report for finished football matches.

    Connects to the database named by ``DSN`` and prints:

    * the number of finished football matches (``score_home IS NOT NULL``);
    * how many distinct matches appear in ``match_team_stats``,
      ``match_player_participation`` and ``match_officials``, each as a
      percentage of the finished-match total;
    * how many finished football matches appear in all three tables;
    * the same three coverage figures restricted to the most recent
      finished matches (up to 1000, ordered by ``mst_utc``).

    Any exception is reported on stdout as ``Error: ...`` instead of being
    propagated; the connection is always closed when one was opened.
    """
    # Bind before the try so the finally clause is safe when connect() fails.
    conn = None
    try:
        conn = psycopg2.connect(DSN)
        with conn.cursor() as cursor:
            print("šŸ” DIAGNOSTIC REPORT: AI Data Coverage")
            print("=======================================")

            # 1. Total Football Matches (Finished)
            total_matches = _fetch_count(
                cursor,
                "SELECT COUNT(*) FROM matches WHERE sport='football' AND score_home IS NOT NULL",
            )
            print(f"Total Finished Football Matches: {total_matches:,}")

            if total_matches == 0:
                print("āŒ No matches found!")
                return

            # 2-4. Per-table coverage: team stats, squads, officials.
            # NOTE(review): these counts are not filtered by sport or by
            # finished status, so a percentage can exceed 100% if the tables
            # hold rows for other matches -- confirm whether that is intended.
            coverage_queries = (
                ("Matches with Team Stats",
                 "SELECT COUNT(DISTINCT match_id) FROM match_team_stats"),
                ("Matches with Lineups (Squad)",
                 "SELECT COUNT(DISTINCT match_id) FROM match_player_participation"),
                ("Matches with Officials",
                 "SELECT COUNT(DISTINCT match_id) FROM match_officials"),
            )
            for label, query in coverage_queries:
                count = _fetch_count(cursor, query)
                print(f"{label}: {count:,} ({_pct(count, total_matches):.1f}%)")

            # 5. Overlap (Gold Standard Data)
            # INTERSECT works on distinct IDs, so each match is counted once
            # no matter how many stat/participation/official rows it has.
            overlap_count = _fetch_count(cursor, """
                SELECT COUNT(*) FROM (
                    SELECT id FROM matches WHERE sport='football' AND score_home IS NOT NULL
                    INTERSECT
                    SELECT match_id FROM match_team_stats
                    INTERSECT
                    SELECT match_id FROM match_player_participation
                    INTERSECT
                    SELECT match_id FROM match_officials
                ) as overlap
            """)
            print(f"Matches with ALL Data (Golden): {overlap_count:,} ({_pct(overlap_count, total_matches):.1f}%)")

            print("\nšŸ” RECENT DATA QUALITY (Last 1000 Matches)")
            print("==========================================")

            # Check the most recent finished matches specifically. The size
            # of the sample is selected too, so the percentages stay correct
            # when fewer than 1000 finished matches exist.
            cursor.execute("""
                WITH recent AS (
                    SELECT id FROM matches
                    WHERE sport='football' AND score_home IS NOT NULL
                    ORDER BY mst_utc DESC
                    LIMIT 1000
                )
                SELECT
                    (SELECT COUNT(*) FROM recent) as recent_total,
                    (SELECT COUNT(DISTINCT match_id) FROM match_team_stats WHERE match_id IN (SELECT id FROM recent)) as has_stats,
                    (SELECT COUNT(DISTINCT match_id) FROM match_player_participation WHERE match_id IN (SELECT id FROM recent)) as has_squad,
                    (SELECT COUNT(DISTINCT match_id) FROM match_officials WHERE match_id IN (SELECT id FROM recent)) as has_officials
            """)
            recent_total, has_stats, has_squad, has_officials = cursor.fetchone()
            print(f"Has Stats: {_pct(has_stats, recent_total):.1f}%")
            print(f"Has Lineups: {_pct(has_squad, recent_total):.1f}%")
            print(f"Has Officials: {_pct(has_officials, recent_total):.1f}%")

    except Exception as e:
        # Script-level boundary: report the failure and fall through to cleanup.
        print(f"Error: {e}")
    finally:
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    diagnose()