The Pokémon VGC (Video Game Championships) dataset contains competitive double-battle team data. The examples below walk through loading, cleaning, analyzing, visualizing, and modeling it with pandas and scikit-learn.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from wordcloud import WordCloud
# Load the dataset
df = pd.read_csv('Pokemon_Double_Teams_DB.csv')

# --- Lists: column selection, iteration, data creation ---

# Column selection for numerical stats
num_cols = ['Total Stats', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
stats_df = df[num_cols]  # multi-column selection

# Binary columns: '✅'/'❌' markers become 1/0 integers.
# NOTE(review): this turns 'Speed Control' into ints, so later code that does
# string matching on that column must convert back with astype(str) -- confirm
# that is the intended pipeline order.
binary_cols = ['Offensive Cores', 'Support Roles', 'Speed Control', 'Defensive Backbone']
for col in binary_cols:
    df[col] = df[col].apply(lambda x: 1 if '✅' in str(x) else 0)

# Creating new DataFrames from lists
pokemon_list = ['Jellicent', 'Cofagrigus', 'Gothitelle']
roles_list = ['Bulky Support', 'Bulky Support', 'Bulky Support']
new_df = pd.DataFrame({'Pokemon': pokemon_list, 'Role': roles_list})
# --- Dicts: value mapping, DataFrame creation, aggregation ---

# Collapse detailed role labels into broad categories; roles with no mapping
# keep their original label via fillna.
role_mapping = {
    'Bulky Support / Trick Room Enabler': 'Support',
    'Trick Room Attacker': 'Attacker',
    'Weather Setter': 'Utility'
}
df['Simple_Role'] = df['Role'].map(role_mapping).fillna(df['Role'])

# Creating DataFrames from dictionaries
data_dict = {
    'Pokemon': ['Porygon2', 'Farigiraf'],
    'Role': ['Bulky Support', 'Bulky Support'],
    'Total Stats': [515, 520]
}
support_df = pd.DataFrame(data_dict)

# Aggregation spec as a dict: per-column aggregation functions.
agg_dict = {
    'HP': ['mean', 'std'],
    'Attack': ['min', 'max'],
    'Speed': 'median'
}
role_stats = df.groupby('Role').agg(agg_dict)
# --- Sets: unique values, fast membership filtering, set algebra ---

# Finding unique roles
unique_roles = set(df['Role'].dropna())

# Fast filtering with sets
support_roles = {'Bulky Support', 'Utility Support', 'Defensive Support'}
support_pokemon = df[df['Role'].isin(support_roles)]

# Set operations for analysis.
# BUG FIX: 'Speed Control' may already be 0/1-encoded by the binary-encoding
# pass above, in which case `.str` on an integer column raises; astype(str)
# keeps this lookup safe either way.
trick_room_users = set(
    df[df['Speed Control'].astype(str).str.contains('Trick Room', na=False)]['Pokemon']
)
weather_setters = set(df[df['Role'].str.contains('Weather', na=False)]['Pokemon'])
dual_utility = trick_room_users & weather_setters  # intersection of both sets

# --- Tuples: multi-indexing, grouping, fixed-size records ---
index_tuples = [('Water', 'Jellicent'), ('Ghost', 'Cofagrigus')]
multi_index = pd.MultiIndex.from_tuples(index_tuples, names=['Type', 'Pokemon'])

# Grouping by multiple columns (group keys become tuples).
grouped = df.groupby(['Cluster', 'Role'])['Speed'].mean()

# Plot dimensions: a (width, height) tuple.
plt.figure(figsize=(12, 6))
# --- Basic dataset overview ---
print("Dataset shape:", df.shape)
overview_sections = [
    ("\nColumn names:", df.columns.tolist()),
    ("\nData types:", df.dtypes),
    ("\nMissing values:", df.isnull().sum()),
    ("\nFirst 5 rows:", df.head()),
]
for label, content in overview_sections:
    print(label)
    print(content)

# --- Numerical summary ---
print("Statistical summary:")
print(df[num_cols].describe())

# --- Categorical breakdowns ---
for label, counts in [
    ("\nRole distribution:", df['Role'].value_counts()),
    ("\nSpeed control distribution:", df['Speed Control'].value_counts()),
]:
    print(label)
    print(counts)
# --- Cleaning and feature engineering ---

# Fill missing numeric stats with each column's median.
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Binary encoding for analysis (idempotent).
# BUG FIX: the original unconditionally re-applied the '✅' test here; on an
# already-encoded column the 0/1 integers contain no '✅', so every flag was
# reset to 0. Only encode columns that are still raw text. (The unused
# `binary_mapping` dict is dropped.)
for col in binary_cols:
    if df[col].dtype == object:
        df[col] = df[col].map(lambda x: 1 if '✅' in str(x) else 0)

# Derived fields: mean offensive/defensive pressure and their skew.
df['Offensive_Power'] = (df['Attack'] + df['Sp. Atk']) / 2
df['Defensive_Power'] = (df['Defense'] + df['Sp. Def']) / 2
df['Stat_Balance'] = df['Offensive_Power'] - df['Defensive_Power']

# Statistical ratios
df['BST_Ratio'] = df['Total Stats'] / df['Total Stats'].max()
df['Speed_Percentile'] = df['Speed'].rank(pct=True)

# Categorical speed tiers.
df['Speed_Tier'] = pd.cut(
    df['Speed'],
    bins=[0, 50, 100, 150, 200],
    labels=['Slow', 'Medium', 'Fast', 'Extreme']
)

# Coarse role buckets.
# BUG FIX: na=False -- str.contains returns NaN for missing roles, which
# np.where cannot use as a boolean mask.
df['Role_Type'] = np.where(
    df['Role'].str.contains('Support|Utility', na=False), 'Support',
    np.where(df['Role'].str.contains('Attacker', na=False), 'Offense', 'Other')
)
# --- Column ordering, sorting, and enrichment merges ---

# Put the most-read columns first.
# Robustness fix: only keep priority columns that actually exist, so the
# reorder cannot raise KeyError if e.g. 'Cluster' is absent from the CSV.
priority_cols = ['Pokemon', 'Role', 'Total Stats', 'Speed_Tier', 'Cluster']
front_cols = [col for col in priority_cols if col in df.columns]
remaining_cols = [col for col in df.columns if col not in front_cols]
df = df[front_cols + remaining_cols]

# Multi-level sort: speed tier ascending, total stats descending, name ascending.
df = df.sort_values(
    by=['Speed_Tier', 'Total Stats', 'Pokemon'],
    ascending=[True, False, True]
).reset_index(drop=True)

# Supplementary lookup table merged onto the main frame.
type_effectiveness = pd.DataFrame({
    'Pokemon': ['Jellicent', 'Cofagrigus', 'Gothitelle'],
    'Primary_Type': ['Water', 'Ghost', 'Psychic'],
    'Weakness_Count': [4, 2, 3]
})
df = df.merge(type_effectiveness, on='Pokemon', how='left')

# Self-merge: attach each row's cluster-average Total Stats.
cluster_avg = df.groupby('Cluster')['Total Stats'].mean().reset_index()
cluster_avg.columns = ['Cluster', 'Cluster_Avg_Stats']
df = df.merge(cluster_avg, on='Cluster', how='left')
# --- Role vs Cluster analysis via pivot tables ---
role_cluster_pivot = pd.pivot_table(
    df, index='Role', columns='Cluster',
    values='Total Stats', aggfunc='mean', fill_value=0,
)

# Two row keys, two value columns, per-column aggregators.
complex_pivot = pd.pivot_table(
    df, index=['Role', 'Speed_Tier'], columns='Cluster',
    values=['HP', 'Speed'], aggfunc={'HP': 'mean', 'Speed': 'median'},
)

# --- Wide -> long: one row per (pokemon, stat) pair, handy for plotting ---
stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
melted_stats = df.melt(
    id_vars=['Pokemon', 'Role', 'Cluster'],
    value_vars=stat_columns,
    var_name='Stat_Type',
    value_name='Stat_Value',
)
melted_stats.head()

# --- Stack / unstack round trip on a (Role, Pokemon) multi-index ---
df_multi = df.set_index(['Role', 'Pokemon'])
stacked = df_multi[['Offensive_Power', 'Defensive_Power']].stack()
stacked.name = 'Power_Value'
unstacked = stacked.unstack(level=0)  # Role becomes the column axis
# --- Correlation analysis ---
correlation_matrix = df[num_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Pokémon Stat Correlations')
plt.tight_layout()
plt.show()

# Strongest correlations.
# BUG FIX: the unstacked matrix lists every pair twice -- (A, B) and (B, A) --
# and the old `!= 1.0` filter would also drop genuinely perfectly-correlated
# distinct pairs. Keeping only pairs with col1 < col2 removes both self-pairs
# and mirror duplicates without losing real |r| == 1 pairs.
pair_corr = correlation_matrix.abs().unstack()
keep = pair_corr.index.get_level_values(0) < pair_corr.index.get_level_values(1)
high_corr = pair_corr[keep].sort_values(ascending=False)
print("Highest correlations:")
print(high_corr.head(10))
# --- Statistical tests and distribution comparisons ---
from scipy import stats

# Shapiro-Wilk normality test on a few key stats.
for col in ['HP', 'Attack', 'Speed']:
    stat, p_value = stats.shapiro(df[col].dropna())
    print(f"{col}: Shapiro-Wilk p-value = {p_value:.4f}")

# Summarize Speed per role, skipping roles with too few samples.
def _role_speed_summary(role):
    """Return a summary dict for one role, or None when n <= 5."""
    speeds = df[df['Role'] == role]['Speed'].dropna()
    if len(speeds) <= 5:  # minimum sample size
        return None
    return {
        'Role': role,
        'Mean_Speed': speeds.mean(),
        'Std_Speed': speeds.std(),
        'Sample_Size': len(speeds),
    }

role_speed_comparison = [
    summary for summary in map(_role_speed_summary, df['Role'].unique())
    if summary is not None
]
speed_comparison_df = pd.DataFrame(role_speed_comparison)
print(speed_comparison_df.sort_values('Mean_Speed', ascending=False))
# --- K-means clustering on base stats ---
features = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
X = df[features].dropna()

# Standardize so every stat contributes equally to the distance metric.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: inertia for k = 1..10.
# n_init is pinned explicitly because its scikit-learn default changed across
# versions; 10 matches the historical default.
inertias = []
k_range = range(1, 11)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(k_range, inertias, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.show()

# Fit the final model.
optimal_k = 5  # chosen from the elbow curve above
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
# BUG FIX: align on X.index -- if dropna() removed any rows, assigning the raw
# prediction array to the full frame would raise a length-mismatch error.
df.loc[X.index, 'Stat_Cluster'] = kmeans.fit_predict(X_scaled)

# Analyze clusters
cluster_analysis = df.groupby('Stat_Cluster')[features].mean()
print("Cluster characteristics:")
print(cluster_analysis)
# --- Role prediction with a Random Forest ---

# Simplified target labels; anything unmapped becomes 'Other'.
role_mapping = {
    'Bulky Support': 'Support',
    'Trick Room Attacker': 'Attacker',
    'Weather Setter': 'Utility'
}
df['Role_Category'] = df['Role'].map(role_mapping).fillna('Other')

# Feature matrix and index-aligned target vector.
X = df[features].dropna()
y = df.loc[X.index, 'Role_Category']

# Stratified hold-out split keeps class proportions intact.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Hold-out evaluation.
from sklearn.metrics import classification_report, confusion_matrix
y_pred = rf_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Which stats drive the prediction?
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)
# Basic text processing on the free-text 'Why Fun' column (NaN rows dropped).
fun_descriptions = df['Why Fun'].dropna()
# Word frequency analysis
from collections import Counter
import re
def extract_words(text):
    """Tokenize *text*: lowercase word characters only, words of length <= 2 dropped."""
    tokens = re.findall(r'\b\w+\b', text.lower())
    return [token for token in tokens if len(token) > 2]
# Flatten every description into one token list and count frequencies.
all_words = [word for desc in fun_descriptions for word in extract_words(desc)]
word_freq = Counter(all_words)

print("Most common words:")
for word, count in word_freq.most_common(20):
    print(f"{word}: {count}")

# Word cloud rendered straight from the frequency table.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis'
).generate_from_frequencies(word_freq)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Common Themes in "Why Fun" Descriptions')
plt.show()
# Tiny keyword lexicons for a crude sentiment score.
positive_words = ['fun', 'powerful', 'versatile', 'reliable', 'effective']
negative_words = ['slow', 'weak', 'limited', 'fragile']

def calculate_sentiment(text):
    """Lexicon score: +1 per positive keyword present, -1 per negative; 0 for NaN."""
    if pd.isna(text):
        return 0
    lowered = text.lower()
    positives = sum(word in lowered for word in positive_words)
    negatives = sum(word in lowered for word in negative_words)
    return positives - negatives
# Score every description and attach the result to the main frame.
df['Fun_Sentiment'] = df['Why Fun'].apply(calculate_sentiment)
# Analyze sentiment by role
sentiment_by_role = df.groupby('Role')['Fun_Sentiment'].mean().sort_values(ascending=False)
print("Average sentiment by role:")
print(sentiment_by_role)
# --- Stat distributions, one histogram per subplot ---
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
stat_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
# axes.flat walks the grid in row-major order (same layout as i//3, i%3).
for ax, col in zip(axes.flat, stat_cols):
    df[col].hist(bins=20, ax=ax, alpha=0.7)
    ax.set_title(f'{col} Distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

# Speed spread per role as box plots.
plt.figure(figsize=(12, 8))
df.boxplot(column='Speed', by='Role', ax=plt.gca())
plt.title('Speed Distribution by Role')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# --- Pairwise stat relationships ---
selected_stats = ['HP', 'Attack', 'Defense', 'Speed']
scatter_matrix = pd.plotting.scatter_matrix(
    df[selected_stats], figsize=(12, 12), alpha=0.6, diagonal='hist',
)
plt.suptitle('Pokémon Stat Relationships', y=0.95)
plt.show()

# Annotated heatmap; the upper triangle is masked so each pair appears once.
plt.figure(figsize=(10, 8))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
)
plt.title('Pokémon Stat Correlation Matrix')
plt.show()
def find_team_synergies(df, focus_role='Bulky Support'):
    """For each Pokémon in *focus_role*, list its three lowest-scoring partners.

    A candidate partner must have a different role and a different raw Speed
    value; Synergy_Score is a stat-gap heuristic where lower means better.
    Returns a DataFrame with Focus_Pokemon, Best_Partners, Synergy_Scores.
    """
    focus_pokemon = df[df['Role'] == focus_role]
    team_suggestions = []
    for _, focus in focus_pokemon.iterrows():
        # NOTE(review): this excludes only exact Speed ties, not whole speed
        # tiers as the original comment claimed -- confirm intent.
        candidates = df[
            (df['Role'] != focus_role) &
            (df['Speed'] != focus['Speed'])
        ].copy()
        # Heuristic: offensive power close to the focus's defensive power,
        # plus a small speed gap.
        candidates['Synergy_Score'] = (
            abs(candidates['Offensive_Power'] - focus['Defensive_Power'])
            + abs(candidates['Speed'] - focus['Speed'])
        )
        top_three = candidates.nsmallest(3, 'Synergy_Score')
        team_suggestions.append({
            'Focus_Pokemon': focus['Pokemon'],
            'Best_Partners': top_three['Pokemon'].tolist(),
            'Synergy_Scores': top_three['Synergy_Score'].tolist(),
        })
    return pd.DataFrame(team_suggestions)
# Run the synergy search for the default focus role ('Bulky Support').
synergies = find_team_synergies(df)
print("Team synergy suggestions:")
print(synergies.head())
def analyze_meta_trends(df):
    """Summarize the competitive meta.

    Returns a dict with speed-tier counts, role popularity, mean stats, and
    the offensive-vs-defensive balance of the pool (including their ratio).
    """
    # Speed tier distribution and role popularity.
    tier_counts = df['Speed_Tier'].value_counts()
    role_usage = df['Role'].value_counts()
    # Mean of the numeric stat columns.
    # NOTE(review): num_cols is a module-level list defined earlier in the file.
    avg_stats = df[num_cols].mean()
    # Offensively vs defensively skewed Pokémon counts.
    offensive_count = len(df[df['Offensive_Power'] > df['Defensive_Power']])
    defensive_count = len(df[df['Defensive_Power'] > df['Offensive_Power']])
    return {
        'Speed_Tier_Distribution': tier_counts,
        'Role_Popularity': role_usage,
        'Average_Stats': avg_stats,
        'Offensive_vs_Defensive': {
            'Offensive': offensive_count,
            'Defensive': defensive_count,
            'Ratio': offensive_count / defensive_count,
        },
    }
# Print every section of the meta report.
meta_analysis = analyze_meta_trends(df)
print("Meta Analysis Results:")
for section, result in meta_analysis.items():
    print(f"\n{section}:")
    print(result)
def create_battle_predictor(df):
    """Score each Pokémon's expected battle performance from its stats.

    Adds Battle_Rating (weighted stat blend, min-max normalized to 0-100) and
    Battle_Tier (categorical bucket) to *df* in place, then returns a ranking
    DataFrame sorted by Battle_Rating descending.
    """
    # Weighted blend of bulk, speed, and offensive/defensive pressure.
    df['Battle_Rating'] = (
        df['Total Stats'] * 0.3 +
        df['Speed'] * 0.2 +
        df['Offensive_Power'] * 0.25 +
        df['Defensive_Power'] * 0.25
    )
    # Min-max normalize to a 0-100 scale.
    rating_min = df['Battle_Rating'].min()
    rating_max = df['Battle_Rating'].max()
    df['Battle_Rating'] = (df['Battle_Rating'] - rating_min) / (rating_max - rating_min) * 100
    # Tier buckets.
    # BUG FIX: include_lowest=True -- pd.cut bins are left-open by default, so
    # the weakest Pokémon (rating exactly 0 after normalization) previously
    # fell outside every bin and got a NaN tier.
    df['Battle_Tier'] = pd.cut(
        df['Battle_Rating'],
        bins=[0, 25, 50, 75, 100],
        labels=['Low', 'Medium', 'High', 'Elite'],
        include_lowest=True
    )
    return df[['Pokemon', 'Role', 'Battle_Rating', 'Battle_Tier']].sort_values(
        'Battle_Rating', ascending=False
    )
# Rank on a copy so the main frame keeps its original columns untouched.
battle_rankings = create_battle_predictor(df.copy())
print("Battle Performance Rankings:")
print(battle_rankings.head(10))
def validate_pokemon_data(df):
    """Run data-quality checks; return a list of human-readable issue strings.

    Checks: negative stat values, 1.5*IQR outliers per numeric column, and
    duplicate Pokémon names. An empty list means all checks passed.
    """
    issues = []
    # Impossible (negative) values anywhere in the numeric stat columns.
    # NOTE(review): num_cols is a module-level list defined earlier in the file.
    if (df[num_cols] < 0).any().any():
        issues.append("Negative stat values found")
    # Extreme outliers via the 1.5*IQR fence, per column.
    for col in num_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        fence = 1.5 * (q3 - q1)
        outlier_mask = (df[col] < q1 - fence) | (df[col] > q3 + fence)
        outlier_count = int(outlier_mask.sum())
        if outlier_count > 0:
            issues.append(f"Extreme outliers in {col}: {outlier_count} cases")
    # Duplicate Pokémon names.
    if df['Pokemon'].duplicated().any():
        issues.append("Duplicate Pokémon entries found")
    return issues
# Report the validation outcome.
data_issues = validate_pokemon_data(df)
if not data_issues:
    print("Data quality checks passed!")
else:
    print("Data Quality Issues:")
    for issue in data_issues:
        print(f"- {issue}")
def optimize_dataframe(df):
    """Shrink DataFrame memory usage.

    Low-cardinality text columns (< 50% unique values) become 'category';
    64-bit numerics are downcast to the narrowest type that fits the data.
    Mutates *df* in place and also returns it.
    """
    # Object columns with few distinct values compress well as 'category'.
    text_columns = df.select_dtypes(include=['object']).columns
    for col in text_columns:
        if df[col].nunique() / len(df) < 0.5:
            df[col] = df[col].astype('category')
    # Downcast 64-bit integers and floats.
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df
# Apply optimization to a copy and report the bytes saved.
optimized_df = optimize_dataframe(df.copy())
print(f"Memory usage reduction: {df.memory_usage().sum() - optimized_df.memory_usage().sum()} bytes")
class PokemonAnalyzer:
    """Reusable analysis pipeline for the Pokémon doubles dataset.

    Loads the CSV on construction, cleans it, and exposes role-distribution
    and stat-similarity queries.
    """

    def __init__(self, data_path):
        self.df = pd.read_csv(data_path)
        self.clean_data()

    def clean_data(self):
        """Fill missing stats with column medians and add power columns."""
        num_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
        self.df[num_cols] = self.df[num_cols].fillna(self.df[num_cols].median())
        # Derived aggregates of the raw stats.
        self.df['Offensive_Power'] = (self.df['Attack'] + self.df['Sp. Atk']) / 2
        self.df['Defensive_Power'] = (self.df['Defense'] + self.df['Sp. Def']) / 2

    def analyze_role_distribution(self):
        """Per-role summary: Total Stats spread, median Speed, mean powers."""
        agg_spec = {
            'Total Stats': ['mean', 'std', 'count'],
            'Speed': 'median',
            'Offensive_Power': 'mean',
            'Defensive_Power': 'mean',
        }
        return self.df.groupby('Role').agg(agg_spec)

    def find_similar_pokemon(self, pokemon_name, top_n=5):
        """Return the *top_n* Pokémon closest to *pokemon_name* in stat space.

        Distance is plain Euclidean over the six base stats. Returns an error
        string (not an exception) when the name is unknown.
        """
        if pokemon_name not in self.df['Pokemon'].values:
            return f"Pokémon '{pokemon_name}' not found in dataset"
        stat_cols = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
        target_stats = self.df.loc[self.df['Pokemon'] == pokemon_name, stat_cols].iloc[0]
        scores = []
        for _, row in self.df.iterrows():
            if row['Pokemon'] == pokemon_name:
                continue
            # Euclidean distance in raw (unstandardized) stat space.
            distance = np.sqrt(sum((row[col] - target_stats[col]) ** 2 for col in stat_cols))
            scores.append({
                'Pokemon': row['Pokemon'],
                'Role': row['Role'],
                'Distance': distance,
            })
        return pd.DataFrame(scores).nsmallest(top_n, 'Distance')
# Usage example
analyzer = PokemonAnalyzer('Pokemon_Double_Teams_DB.csv')
role_analysis = analyzer.analyze_role_distribution()
similar_pokemon = analyzer.find_similar_pokemon('Jellicent')
# This guide covers the essential techniques for analyzing Pokémon VGC data,
# from basic data manipulation to advanced machine learning applications.
The techniques demonstrated here are applicable to any structured dataset and form the foundation for data-driven decision making in competitive gaming analysis.
This guide serves as a practical reference for analysts working with Pokémon VGC data or similar competitive gaming datasets. Each code example is designed to be educational while providing real analytical value.