import pandas as pd
# Location of the raw competitor catalogue (machine-specific absolute path).
csv_path = r"C:\Users\lakshita rawat\Downloads\Project_2\Competitor.csv"
df = pd.read_csv(csv_path)
# Preview the first rows; only rendered when this is the last expression of a notebook cell.
df.head()

# Quick structural overview of the freshly loaded catalogue.
print(df.shape)  # (rows, columns)
print()

print(df.columns)
print()

print(df.isnull().sum())  # missing values per column
print()

print(df.duplicated().sum())  # fully duplicated rows (only counted here, not removed)
print()

print(df.describe())  # summary statistics of the numeric columns
print()

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

(1006, 8)

Index(['Brand', 'Model', 'Type', 'Gender', 'Size', 'Color', 'Material',
       'Price (USD)'],
      dtype='object')

Brand          0
Model          0
Type           0
Gender         0
Size           0
Color          0
Material       0
Price (USD)    0
dtype: int64

81

       Price (USD)
count  1006.000000
mean    101.306163
std      39.215436
min      25.000000
25%      70.000000
50%      90.000000
75%     130.000000
max     250.000000

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006 entries, 0 to 1005
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Brand        1006 non-null   object
 1   Model        1006 non-null   object
 2   Type         1006 non-null   object
 3   Gender       1006 non-null   object
 4   Size         1006 non-null   object
 5   Color        1006 non-null   object
 6   Material     1006 non-null   object
 7   Price (USD)  1006 non-null   int64 
dtypes: int64(1), object(7)
memory usage: 63.0+ KB
None

import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn theme applied to the charts that follow
sns.set(style="whitegrid", palette="muted", font_scale=1.1)

# 1. Total products -- one row per listing
total_products = len(df)
print("Total products:", total_products)
print()

# 2. Unique brands -- distinct non-null values of the Brand column
unique_brands = df['Brand'].nunique()
print("Unique brands:", unique_brands)


# 3. Brand with largest product portfolio
# value_counts() sorts descending, so the first bar is the largest portfolio.
brand_portfolio = df['Brand'].value_counts()

plt.figure(figsize=(12,6))
ax = brand_portfolio.plot(kind='bar', color='skyblue')
ax.grid(False)
# Each row is a catalogue listing, not a sale, so the title must not claim
# units "sold".
plt.title("Number of Products per Brand")
plt.ylabel("Number of Products")
plt.xlabel("Brand")
plt.xticks(rotation=45)

# Add values on bars (bar i sits at x == i on a pandas bar plot)
for i, v in enumerate(brand_portfolio):
    ax.text(i, v + 0.5, str(v), ha='center', fontweight='bold')

plt.show()



# 4. Product type distribution
# The raw column spells some types inconsistently (e.g. 'CrossFit' vs
# 'Crossfit'), which would otherwise be counted as separate categories.
# Labels that differ only by case/surrounding whitespace are collapsed onto
# their most frequent spelling. The DataFrame itself is left untouched.
type_labels = df['Type'].dropna().astype(str).str.strip()
type_labels = type_labels.groupby(type_labels.str.casefold()).transform(
    lambda spellings: spellings.mode().iat[0]
)
type_distribution = type_labels.value_counts()

plt.figure(figsize=(12,6))
ax = type_distribution.plot(kind='bar', color='lightcoral')
ax.grid(False)
plt.title("Product Distribution by Type")
plt.ylabel("Count")
plt.xlabel("Product Type")
plt.xticks(rotation=45)

# Add values on bars (bar i sits at x == i on a pandas bar plot)
for i, v in enumerate(type_distribution):
    ax.text(i, v + 0.5, str(v), ha='center', fontweight='bold')

plt.show()


# 5. Unique product types (after merging case variants)
unique_types = type_labels.nunique()
print("Unique product types:", unique_types)

print()

# Shared handle on the price column for the three summaries below
price_col = df['Price (USD)']

# 6. Average price (overall)
avg_price = price_col.mean()
print("Average price across all products: $", round(avg_price, 2))
print()

# 7. Overall price range
min_price, max_price = price_col.min(), price_col.max()
print("Overall price range: $", min_price, "– $", max_price)
print()


# 8. Highest & lowest price brand: per-brand min/max, most expensive ceiling first
brand_price_bounds = price_col.groupby(df['Brand']).agg(['min', 'max'])
price_extremes = brand_price_bounds.sort_values('max', ascending=False)
print("Price extremes per brand:\n", price_extremes)

print()
print()
print()

# 9. Price Distribution
# Rows without a price cannot be assigned to a band.
df = df.dropna(subset=['Price (USD)'])

# Fixed band edges with an open-ended top band. The previous last edge,
# max_price + 1, made the edges non-increasing (so pd.cut raised) whenever
# the most expensive product cost less than $250.
bins = [0, 75, 150, 250, float('inf')]
# pd.cut builds right-closed intervals here, so a product priced exactly at
# an edge ($75, $150, $250) lands in the lower of the two neighbouring bands.
labels = ['Low (<$75)', 'Mid ($75–150)', 'High ($150–250)', 'Premium (>$250)']

# Create price range column
df['Price Range'] = pd.cut(df['Price (USD)'], bins=bins, labels=labels, include_lowest=True)

# Count products in each range (sort_index keeps the Low -> Premium order)
price_dist = df['Price Range'].value_counts().sort_index()
print("Price distribution by range:\n", price_dist)

# Pie chart: every band keeps its own colour, but empty bands are left out so
# a 0.0% label is not drawn on top of its neighbours.
band_colors = dict(zip(labels, ['#99c2ff','#66b3ff','#3399ff','#0066cc']))
shown = price_dist[price_dist > 0]

plt.figure(figsize=(8,8))
plt.pie(shown, labels=shown.index, autopct='%1.1f%%', startangle=140,
        colors=[band_colors[band] for band in shown.index])
plt.title("Price Distribution by Range")
plt.show()


# 10. Average price per product type
# Merge type labels that differ only by case/whitespace (e.g. 'CrossFit' vs
# 'Crossfit') onto their most frequent spelling so each type appears once.
type_labels = df['Type'].dropna().astype(str).str.strip()
type_labels = type_labels.groupby(type_labels.str.casefold()).transform(
    lambda spellings: spellings.mode().iat[0]
)
avg_price_type = df.groupby(type_labels)['Price (USD)'].mean().sort_values(ascending=False)
print("Average price per product type:\n", avg_price_type)

# Averages are not parts of a whole, so pie-slice percentages would be
# meaningless; a ranked horizontal bar chart shows the actual price levels.
ranked = avg_price_type.sort_values()  # ascending, so the priciest type is drawn on top
fig, ax = plt.subplots(figsize=(10,8))
bars = ax.barh(ranked.index, ranked.values,
               color=sns.color_palette("Set2", len(ranked)))

for rect in bars:
    w = rect.get_width()
    ax.annotate(f'{w:,.0f}', xy=(w, rect.get_y() + rect.get_height()/2),
                xytext=(3, 0), textcoords='offset points',
                ha='left', va='center', fontsize=11, fontweight='bold')

ax.set_title("Average Price by Product Type")
ax.set_xlabel("Average Price (USD)")
ax.grid(False)
plt.tight_layout()
plt.show()

Total products: 1006

Unique brands: 10

Unique product types: 18

Average price across all products: $ 101.31

Overall price range: $ 25 – $ 250

Price extremes per brand:
              min  max
Brand                
Nike          85  250
Adidas        25  220
Reebok        60  200
Asics         70  180
New Balance   80  175
Puma          65  140
Skechers      50  125
Converse      55  120
Fila          60   95
Vans          50   70


Price distribution by range:
 Price Range
Low (<$75)         357
Mid ($75–150)      522
High ($150–250)    127
Premium (>$250)      0
Name: count, dtype: int64

Average price per product type:
 Type
Weightlifting     187.500000
Cross-training    130.000000
CrossFit          130.000000
Crossfit          130.000000
Running           129.081325
Lifestyle         122.828947
Trail Running     113.333333
Basketball        112.692308
Training          110.294118
Racing            110.000000
Trail              99.166667
Retro              90.000000
Fashion            83.895349
Casual             79.938272
Hiking             77.500000
Walking            64.978261
Skate              62.200000
Slides             31.666667
Name: Price (USD), dtype: float64

import matplotlib.pyplot as plt

# 1. In which category is Puma cheapest? Most premium?
# Brand x Type table of mean prices; Puma's row, without the types it does
# not offer, sorted from cheapest to most expensive.
category_price = df.groupby(['Brand', 'Type'])['Price (USD)'].mean().unstack()
puma_category_prices = category_price.loc['Puma'].dropna().sort_values()

cheapest_category = puma_category_prices.idxmin()
most_premium_category = puma_category_prices.idxmax()

fig, ax = plt.subplots(figsize=(10,5))
bars = ax.bar(puma_category_prices.index, puma_category_prices.values, color='#1f77b4')

# Highlight the two extremes: green = cheapest, salmon = most premium
for rect, cat in zip(bars, puma_category_prices.index):
    if cat == cheapest_category:
        rect.set_color('lightgreen')
    elif cat == most_premium_category:
        rect.set_color('salmon')

# Value label above every bar
for rect in bars:
    h = rect.get_height()
    ax.annotate(f'{h:,.0f}', xy=(rect.get_x()+rect.get_width()/2, h),
                xytext=(0,3), textcoords='offset points',
                ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.set_title("Puma Average Price by Type", fontsize=15, fontweight='bold')
ax.set_ylabel("Average Price (USD)")
# Restyle the tick labels the bar plot already created. Calling
# set_xticklabels() without fixing the ticks first triggers matplotlib's
# "set_ticklabels() should only be used with a fixed number of ticks" warning.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(False)
plt.tight_layout()
plt.show()



# 2. Are Puma’s men’s products priced higher/lower than competitors’?
# Gender labels are standardised locally: the 'Gender_std' column is only
# created further down the file, so reading it here raises KeyError when the
# script runs top to bottom.
gender_labels = (
    df['Gender'].astype(str).str.strip().str.lower()
      .replace({'male': 'men', 'm': 'men', 'man': 'men',
                'female': 'women', 'f': 'women', 'woman': 'women'})
      .str.title()
)
men_prices = df[gender_labels == 'Men'].groupby('Brand')['Price (USD)'].mean().sort_values()

fig, ax = plt.subplots(figsize=(12,5))
colors = ['#ffd700' if b=='Puma' else '#1f77b4' for b in men_prices.index]  # Puma in gold
bars = ax.bar(men_prices.index, men_prices.values, color=colors)

# Value label above every bar
for rect in bars:
    h = rect.get_height()
    ax.annotate(f'{h:,.0f}', xy=(rect.get_x()+rect.get_width()/2, h),
                xytext=(0,3), textcoords='offset points',
                ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.set_title("Average Price (Men's Products)", fontsize=15, fontweight='bold')
ax.set_ylabel("Average Price (USD)")
# Restyle the existing tick labels; set_xticklabels() without fixed ticks
# triggers a matplotlib UserWarning.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(False)
plt.tight_layout()
plt.show()



# 3. Are Puma’s women’s products more affordable or expensive?
# Gender labels are standardised locally: the 'Gender_std' column is only
# created further down the file, so reading it here raises KeyError when the
# script runs top to bottom.
gender_labels = (
    df['Gender'].astype(str).str.strip().str.lower()
      .replace({'male': 'men', 'm': 'men', 'man': 'men',
                'female': 'women', 'f': 'women', 'woman': 'women'})
      .str.title()
)
women_prices = df[gender_labels == 'Women'].groupby('Brand')['Price (USD)'].mean().sort_values()

fig, ax = plt.subplots(figsize=(12,5))
colors = ['#ff69b4' if b=='Puma' else '#90ee90' for b in women_prices.index]  # Puma in pink
bars = ax.bar(women_prices.index, women_prices.values, color=colors)

# Value label above every bar
for rect in bars:
    h = rect.get_height()
    ax.annotate(f'{h:,.0f}', xy=(rect.get_x()+rect.get_width()/2, h),
                xytext=(0,3), textcoords='offset points',
                ha='center', va='bottom', fontsize=13, fontweight='bold')

ax.set_title("Average Price (Women's Products)", fontsize=14, fontweight='bold')
ax.set_ylabel("Average Price (USD)")
# Restyle the existing tick labels; set_xticklabels() without fixed ticks
# triggers a matplotlib UserWarning.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(False)
plt.tight_layout()
plt.show()



# 4.Puma’s pricing gap with rest of the Brands
import matplotlib.pyplot as plt

# Average price per brand
brand_means = df.groupby('Brand')['Price (USD)'].mean()

# Puma baseline
puma_mean = brand_means.loc['Puma']

# Gap of every brand to Puma (positive = pricier than Puma), sorted for readability
gaps = (brand_means - puma_mean).sort_values()

# Plot
fig, ax = plt.subplots(figsize=(10,6))
bars = ax.bar(gaps.index, gaps.values,
              color=['yellow' if brand=='Puma' else '#1f77b4' for brand in gaps.index])

# Annotations: place the label beyond the free end of the bar. Negative bars
# grow downwards, so their label must go below the bar end; putting it above
# (as for positive bars) draws the text inside the bar.
for rect in bars:
    h = rect.get_height()
    below = h < 0
    ax.annotate(f'{h:+.1f}',
                xy=(rect.get_x()+rect.get_width()/2, h),
                xytext=(0, -5 if below else 5), textcoords='offset points',
                ha='center', va='top' if below else 'bottom',
                fontsize=12, fontweight='bold')

# Formatting
ax.axhline(0, color='black', linewidth=1)  # zero line = Puma's own average
ax.set_title("Average Price Gap vs Puma (All Brands)", fontsize=14, fontweight='bold')
ax.set_ylabel("Difference in Avg Price (USD)")
ax.grid(False)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



# 5.Are Puma’s women’s products consistently priced lower than men’s across all categories?

# Average price by Brand, Category, Gender
gender_prices = df.groupby(['Brand','Type','Gender'])['Price (USD)'].mean().reset_index()

# Filter only Puma
puma_gender_prices = gender_prices[gender_prices['Brand'] == 'Puma']

# Pivot for Men vs Women comparison. A type sold to only one gender gets NaN
# in the other gender's column.
pivot_puma = puma_gender_prices.pivot(index='Type', columns='Gender', values='Price (USD)').reset_index()

# Plot Puma Men vs Women category prices
plt.figure(figsize=(8,5))
bar_width = 0.35
categories = pivot_puma['Type']
x = range(len(categories))

bars1 = plt.bar([i - bar_width/2 for i in x], pivot_puma['Men'], width=bar_width, color='orange', label='Puma Men')
bars2 = plt.bar([i + bar_width/2 for i in x], pivot_puma['Women'], width=bar_width, color='pink', label='Puma Women')

# Annotations: skip missing type/gender combinations. Their bar height is
# NaN, and placing text at a NaN position makes matplotlib report
# "posx and posy should be finite values".
for bars in [bars1, bars2]:
    for bar in bars:
        h = bar.get_height()
        if pd.isna(h):
            continue
        plt.text(bar.get_x() + bar.get_width()/2, h + 0.5, f"{h:.1f}",
                 ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.title("Puma: Men vs Women Average Price by Type", fontsize=14, fontweight='bold')
plt.ylabel("Average Price (USD)")
plt.xticks(x, categories, rotation=45)
plt.legend()
plt.grid(False)
plt.show()

C:\Users\lakshita rawat\AppData\Local\Temp\ipykernel_16792\2339940667.py:27: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_xticklabels(puma_category_prices.index, rotation=45, ha='right')

C:\Users\lakshita rawat\AppData\Local\Temp\ipykernel_16792\2339940667.py:49: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_xticklabels(men_prices.index, rotation=45, ha='right')

C:\Users\lakshita rawat\AppData\Local\Temp\ipykernel_16792\2339940667.py:71: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_xticklabels(women_prices.index, rotation=45, ha='right')

posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# ---------- Styling: no gridlines ----------
sns.set(style="white", rc={'axes.grid': False})
plt.rcParams.update({'figure.dpi': 100, 'axes.grid': False})

# ---------- Data cleaning & checks ----------
# Work on a private copy of the frame.
df = df.copy()

# Fail on the first required column that is absent.
required_cols = ['Brand','Model','Type','Gender','Price (USD)']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"Required column missing: {missing_cols[0]}")

# A listing without brand or model cannot be attributed; drop it, then trim
# stray whitespace from the text columns.
df = df.dropna(subset=['Brand','Model'])
for text_col in ('Brand', 'Model', 'Type', 'Gender'):
    df[text_col] = df[text_col].astype(str).str.strip()

# Non-numeric prices become NaN and are dropped.
df['Price (USD)'] = pd.to_numeric(df['Price (USD)'], errors='coerce')
df = df.dropna(subset=['Price (USD)'])

has_puma = bool(df['Brand'].eq('Puma').any())

# ---------- Metrics ----------
# One row per brand: catalogue size and price statistics, rounded to cents.
metric_spec = {
    'num_models': ('Model', 'nunique'),
    'listings': ('Model', 'count'),
    'avg_price': ('Price (USD)', 'mean'),
    'median_price': ('Price (USD)', 'median'),
    'min_price': ('Price (USD)', 'min'),
    'max_price': ('Price (USD)', 'max'),
}
brand_metrics = df.groupby('Brand').agg(**metric_spec).round(2)

# Two rankings of the same table: by portfolio size and by average price.
brand_counts = brand_metrics.sort_values('num_models', ascending=False)
brand_by_price = brand_metrics.sort_values('avg_price', ascending=False)

# The top_n brands by listing count; Puma replaces the last entry if it did
# not make the cut.
top_n = 6
top_brands = list(df['Brand'].value_counts().nlargest(top_n).index)
if has_puma and 'Puma' not in top_brands:
    top_brands[-1] = 'Puma'


# Portfolio Strength: unique models per brand, Puma in its own colour
fig, ax = plt.subplots(figsize=(12,6))
order = brand_counts.index
colors = ["#33c4f0" if b == 'Puma' else "#98da5f" for b in order]
bars = ax.bar(order, brand_counts['num_models'], color=colors)
ax.grid(False)

ax.set_title("Portfolio Strength — Unique Models per Brand")
ax.set_ylabel("Unique Models")
ax.set_xlabel("Brand")
plt.xticks(rotation=45, ha='right')

# Count label slightly above each bar
for rect in bars:
    h = rect.get_height()
    center = rect.get_x() + rect.get_width()/2
    ax.text(center, h + 0.5, int(h),
            ha='center', va='bottom', fontweight='bold', fontsize=12)

plt.tight_layout()
plt.show()


# Pricing Strategy: Average price per brand (Bar)
plt.figure(figsize=(12,6))
order_price = brand_by_price.index
colors = ["#33c4f0" if b == 'Puma' else "#98da5f" for b in order_price]
bars = plt.bar(order_price, brand_by_price['avg_price'], color=colors)
ax = plt.gca(); ax.grid(False)

plt.title("Average Price per Brand (USD)")
plt.ylabel("Average Price (USD)")
plt.xlabel("Brand")
plt.xticks(rotation=45, ha='right')

# Value label on top of each bar. The label is rounded rather than truncated
# with int(): an average of 100.9 must read 101, not 100.
for rect in bars:
    h = rect.get_height()
    plt.text(
        rect.get_x() + rect.get_width()/2,
        h,
        f'{h:,.0f}',
        ha='center', va='bottom',
        fontweight='bold', fontsize=12
    )


plt.tight_layout()
plt.show()


# Revenue estimation: count of listings × avg price, i.e. the sum of listed
# prices per brand. A plain sum is exact and avoids the deprecated
# DataFrameGroupBy.apply-on-grouping-columns path.
revenue_by_brand = df.groupby('Brand')['Price (USD)'].sum().sort_values(ascending=False)

# Visual
top_revenue = revenue_by_brand.head(10)
fig, ax = plt.subplots(figsize=(12,6))
top_revenue.plot(kind='bar', ax=ax,
                 color=['red' if b=='Puma' else 'steelblue' for b in top_revenue.index])
ax.set_title("Estimated Revenue Contribution by Brand — Highlighting Puma")
ax.set_ylabel("Revenue (Units × Avg Price)")
ax.set_xlabel("Brand")
plt.xticks(rotation=45, ha='right')

# Annotate the bars of THIS chart. The old loop iterated `bars`, which still
# pointed at the rectangles of the previous figure, so the labels showed
# average prices at the wrong heights.
for rect in ax.patches:
    h = rect.get_height()
    ax.annotate(
        f'{int(h)}',                                # Text to display
        xy=(rect.get_x() + rect.get_width()/2, h),  # X=center of bar, Y=top of bar
        xytext=(0, 3),                              # Offset of 3 points above the bar
        textcoords="offset points",
        ha='center', va='bottom', fontsize=12, fontweight='bold'
    )
plt.tight_layout()
plt.show()


# Puma Positioning
# Cut points of the overall price distribution: q1 (33%) and q3 (66%) bound
# the bands, q2 is the median and is only reported.
q1, q2, q3 = (df['Price (USD)'].quantile(level) for level in (0.33, 0.5, 0.66))
print("\nPrice quantiles (33%, 50%, 66%):", *(round(cut, 2) for cut in (q1, q2, q3)))

if not has_puma:
    print("\nPuma not found in dataset; cannot compute Puma positioning.")
else:
    puma_stats = brand_metrics.loc['Puma']
    puma_avg = puma_stats['avg_price']
    # Band by Puma's average price: up to q1 Budget, up to q3 Mid-range, else Premium
    puma_band = 'Budget' if puma_avg <= q1 else ('Mid-range' if puma_avg <= q3 else 'Premium')
    print("\nPuma summary:")
    display(puma_stats)
    print("Puma is positioned as:", puma_band)


# Brand summary: metrics table plus a dense rank on average price (1 = most
# expensive), ordered by that rank and then by portfolio size.
summary_columns = ['Brand','num_models','listings','min_price','median_price',
                   'avg_price','max_price','avg_price_rank']
brand_summary = (
    brand_metrics.reset_index()
    .assign(avg_price_rank=lambda t: t['avg_price'].rank(method='min', ascending=False).astype(int))
    .sort_values(['avg_price_rank','num_models'], ascending=[True, False])
)

print("\n=== Brand Summary (top rows) ===")
display(brand_summary[summary_columns].head(20))
# Persist the full table next to the notebook.
brand_summary.to_csv("brand_summary.csv", index=False)

print()


# Puma vs others (avg & median diffs): positive = brand is pricier than Puma
comparison_columns = ['avg_price','median_price','min_price','max_price',
                      'listings','diff_avg_vs_puma','diff_median_vs_puma']
if not has_puma:
    print("\nPuma not in data — skipping Puma-difference table.")
else:
    puma_row = brand_metrics.loc['Puma']
    comp = brand_metrics.assign(
        diff_avg_vs_puma=brand_metrics['avg_price'] - puma_row['avg_price'],
        diff_median_vs_puma=brand_metrics['median_price'] - puma_row['median_price'],
    ).sort_values('diff_avg_vs_puma', ascending=False)
    print("\n=== Comparison vs Puma (top 10 rows) ===")
    display(comp[comparison_columns].head(10))

print()

# Top 3 product types per brand
# Listings per (Brand, Type), then the three largest counts within each brand.
type_counts = df.groupby(['Brand','Type']).size()
per_brand = type_counts.groupby(level=0, group_keys=False)
top_types = per_brand.nlargest(3).reset_index(name='count')
print("\n=== Top 3 product Types per Brand (by count) ===")
display(top_types)

C:\Users\lakshita rawat\AppData\Local\Temp\ipykernel_16792\773373617.py:98: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  revenue_by_brand = df.groupby('Brand').apply(

Price quantiles (33%, 50%, 66%): 75.0 90.0 120.0

Puma summary:

num_models       28.0
listings        100.0
avg_price       100.9
median_price    100.0
min_price        65.0
max_price       140.0
Name: Puma, dtype: float64

Puma is positioned as: Mid-range

=== Brand Summary (top rows) ===


=== Comparison vs Puma (top 10 rows) ===


=== Top 3 product Types per Brand (by count) ===

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

# -------------------------
# 0. Data prep (standardize genders)
# -------------------------
# Spelling variants folded onto 'men' / 'women' (matched after trim + lower-case)
gender_aliases = {
    'male':'men','m':'men','man':'men',
    'female':'women','f':'women','woman':'women'
}
# Canonical display names; anything else falls back to the title-cased raw value
canonical_gender = {'men':'Men','women':'Women','unisex':'Unisex'}

raw_gender = df['Gender'].astype(str)
folded_gender = raw_gender.str.strip().str.lower().replace(gender_aliases)
df['Gender_std'] = folded_gender.map(canonical_gender).fillna(raw_gender.str.title())


# Brand and Gender: Product Counts

counts = df.groupby(['Brand','Gender_std']).size().unstack(fill_value=0)
brands = counts.index.tolist()
genders = counts.columns.tolist()
# Ensure consistent gender order
preferred_order = ['Men','Women','Unisex']
genders = [g for g in preferred_order if g in genders] + [g for g in genders if g not in preferred_order]

# Colors: defaults vs Puma
default_colors = {'Men':'#1f77b4',     # blue
                  'Women':'#90ee90'}   # light green
puma_colors    = {'Men':'#ffd700',     # yellow
                  'Women':'#ff69b4'}   # pink
other_gender_color = '#cccccc'         # grey, for any gender besides Men/Women

n_brands = len(brands)
n_genders = len(genders)
bar_width = 0.8 / n_genders
indices = np.arange(n_brands)

fig, ax = plt.subplots(figsize=(14,6))
total_width = n_genders * bar_width
# x of the FIRST bar's centre in every brand group. ax.bar() centres each bar
# on the x it is given, so the half-bar offset is required; without it the
# whole group sits half a bar to the left of its tick.
start = indices - total_width/2 + bar_width/2

for i, gender in enumerate(genders):
    positions = start + i*bar_width
    # Puma gets special colors
    colors = [ puma_colors.get(gender, other_gender_color) if (b.strip().lower() == 'puma')
               else default_colors.get(gender, other_gender_color)
               for b in brands ]
    vals = counts[gender].reindex(brands).values
    bars = ax.bar(positions, vals, width=bar_width, color=colors, label=gender)

    # Annotations (empty combinations stay unlabeled)
    for rect in bars:
        h = rect.get_height()
        if h > 0:
            ax.annotate(
                f'{int(h)}',
                xy=(rect.get_x() + rect.get_width()/2, h),
                xytext=(0, 3),
                textcoords='offset points',
                ha='center', va='bottom', fontsize=11, fontweight='bold'
            )

ax.set_xticks(indices)
ax.set_xticklabels(brands, rotation=45, ha='right')
ax.set_ylabel('Number of Products')
ax.set_title('Product Counts by Brand and Gender (Men / Women)')
ax.grid(False)

# Legend: built by hand because bar colour encodes gender AND Puma-vs-rest
legend_elems = [
    Patch(facecolor=default_colors['Men'], label='Men (other brands)'),
    Patch(facecolor=default_colors['Women'], label='Women (other brands)')
]
if any(b.strip().lower() == 'puma' for b in brands):
    legend_elems += [
        Patch(facecolor=puma_colors['Men'], label='Men (Puma)'),
        Patch(facecolor=puma_colors['Women'], label='Women (Puma)')
    ]
# Grey bars (e.g. Unisex) are plotted too, so they need a legend entry as well.
other_genders = [g for g in genders if g not in default_colors]
if other_genders:
    legend_elems.append(Patch(facecolor=other_gender_color, label=' / '.join(other_genders)))
ax.legend(handles=legend_elems, bbox_to_anchor=(1.02, 1), loc='upper left')
plt.tight_layout()
plt.show()



# Gender based Market Share Data
# Share = a brand's listings as a percentage of all listings for that gender.
listings_by_brand_gender = df.groupby(['Brand','Gender']).size()
gender_market_share = (
    listings_by_brand_gender
    .groupby(level=1, group_keys=False)
    .apply(lambda x: 100 * x / x.sum())
    .reset_index(name='share_%')
)

pivot = gender_market_share.pivot(index='Brand', columns='Gender', values='share_%').fillna(0)

# Order brands by their summed share across all gender columns, largest first
brand_order = pivot.sum(axis=1).sort_values(ascending=False).index
pivot = pivot.loc[brand_order]

brands = pivot.index
men_values = pivot['Men'].values
women_values = pivot['Women'].values

default_colors = {'Men':'#1f77b4', 'Women':'#90ee90'}    # blue & light green
puma_colors    = {'Men':"#e3321e", 'Women':"#7819d1"}    # red & purple


# Plot Dumbbell Chart: one row per brand, a dot per gender joined by a line
fig, ax = plt.subplots(figsize=(12,6))

for row, (brand, men_val, women_val) in enumerate(zip(brands, men_values, women_values)):
    # Puma gets special colors
    dot_colors = puma_colors if brand.strip().lower()=="puma" else default_colors

    # Connector between the two gender dots
    ax.plot([men_val, women_val], [row, row], color="gray", linewidth=2, zorder=1)

    # Men: dot with label above
    ax.scatter(men_val, row, color=dot_colors['Men'], s=120, zorder=2)
    ax.text(men_val, row+0.15, f"{men_val:.1f}%", ha='center', va='bottom',
            fontsize=9, fontweight="bold")

    # Women: dot with label below
    ax.scatter(women_val, row, color=dot_colors['Women'], s=120, zorder=2)
    ax.text(women_val, row-0.25, f"{women_val:.1f}%", ha='center', va='top',
            fontsize=9, fontweight="bold")

# -------------------------------
# Formatting
# -------------------------------
ax.set_yticks(range(len(brands)))
ax.set_yticklabels(brands)
ax.set_xlabel("Market Share (%)")
ax.set_title("Gender-wise Market Share (%) — Puma vs Competitors",
             fontsize=14, fontweight="bold")

# Custom Legend (dot colours, not plotted artists)
legend_elements = [
    Patch(facecolor=face, label=text)
    for face, text in (
        (default_colors['Men'], 'Other Brands - Men'),
        (default_colors['Women'], 'Other Brands - Women'),
        (puma_colors['Men'], 'Puma - Men'),
        (puma_colors['Women'], 'Puma - Women'),
    )
]
ax.legend(handles=legend_elements, loc='best')

plt.tight_layout()
plt.show()


# Average Price by Brand & Gender

# Colors: defaults vs Puma
default_colors = {'Men':'#1f77b4',     # blue
                  'Women':'#90ee90'}   # light green
puma_colors    = {'Men':"#be87ef",     # purple
                  'Women':"#92ded6"}   # teal
other_gender_color = '#cccccc'         # grey fallback; a missing colour (None) is not a valid bar colour

avg_price = df.groupby(['Brand','Gender_std'])['Price (USD)'].mean().unstack()

# The layout is derived from THIS table. Previously it reused `brands`,
# `genders`, `start`, `indices` and `bar_width` left behind by earlier charts,
# so the plot silently depended on which cells had run before it.
price_brands = avg_price.index.tolist()
gender_order = ['Men','Women','Unisex']
price_genders = ([g for g in gender_order if g in avg_price.columns]
                 + [g for g in avg_price.columns if g not in gender_order])
group_x = np.arange(len(price_brands))
group_bar_width = 0.8 / len(price_genders)

fig, ax = plt.subplots(figsize=(14,6))
for i, gender in enumerate(price_genders):
    # Centre the block of bars on the brand's tick
    centers = group_x + (i - (len(price_genders) - 1) / 2) * group_bar_width
    colors = [ puma_colors.get(gender, other_gender_color) if (b.strip().lower() == 'puma')
               else default_colors.get(gender, other_gender_color)
               for b in price_brands ]
    vals = avg_price[gender].values
    # Missing brand/gender combinations are drawn as zero-height bars ...
    bars = ax.bar(centers, np.nan_to_num(vals, nan=0.0), width=group_bar_width,
                  color=colors, label=gender, alpha=0.95)

    # ... and get no value label
    for rect, h in zip(bars, vals):
        if np.isnan(h):
            continue
        ax.annotate(
            f'{h:,.0f}',
            xy=(rect.get_x() + rect.get_width()/2, h),
            xytext=(0, 3),
            textcoords='offset points',
            ha='center', va='bottom', fontsize=11, fontweight='bold'
        )

# Axis formatting
ax.set_xticks(group_x)
ax.set_xticklabels(price_brands, rotation=45, ha='right')
ax.set_ylabel('Average Price (USD)')
ax.set_title('Average Price by Brand and Gender')
ax.grid(False)

# Custom Legend
legend_elems = [
    Patch(facecolor=default_colors['Men'], label='Other Brands - Men'),
    Patch(facecolor=default_colors['Women'], label='Other Brands - Women'),
    Patch(facecolor=puma_colors['Men'], label='Puma - Men'),
    Patch(facecolor=puma_colors['Women'], label='Puma - Women')
]
ax.legend(handles=legend_elems, bbox_to_anchor=(1.02, 1), loc='upper left', title="Legend")

plt.tight_layout()
plt.show()



# Revenue Contribution by Brand & Gender

default_colors = {'Men':'#1f77b4', 'Women':'#90ee90'}   # other brands
puma_colors    = {'Men':"#e214ca", 'Women':"#dee063"}   # Puma colors


def fmt_currency(v):
    """Format a number as whole dollars with thousands separators."""
    return f'${int(v):,}'


# Revenue by brand and gender = sum of listed prices (listings × avg price).
# A direct sum replaces groupby().apply(mean * len): it avoids the deprecated
# apply-on-grouping-columns path and the float round trip that int() could
# truncate (e.g. 9999.999... -> 9999).
gender_revenue = (
    df.groupby(['Brand','Gender'])['Price (USD)'].sum().reset_index(name='Revenue')
)

pivot = gender_revenue.pivot(index='Brand', columns='Gender', values='Revenue').fillna(0)

# Sort by total revenue
pivot = pivot.loc[pivot.sum(axis=1).sort_values(ascending=False).index]

brands = pivot.index
men_values = pivot['Men'].values
women_values = pivot['Women'].values

fig, ax = plt.subplots(figsize=(12,6))

# Dumbbell lines: one row per brand, a dot per gender joined by a line
for i, brand in enumerate(brands):
    men_val = men_values[i]
    women_val = women_values[i]
    # Puma gets special colors
    is_puma = brand.strip().lower() == "puma"
    men_color = puma_colors['Men'] if is_puma else default_colors['Men']
    women_color = puma_colors['Women'] if is_puma else default_colors['Women']

    # Line between Men and Women
    ax.plot([men_val, women_val], [i, i], color="gray", linewidth=2, zorder=1)

    # Men point (label above)
    ax.scatter(men_val, i, color=men_color, s=120, zorder=2)
    ax.text(men_val, i+0.15, f"{int(men_val):,}", ha='center', va='bottom', fontsize=11, fontweight="bold")

    # Women point (label below)
    ax.scatter(women_val, i, color=women_color, s=120, zorder=2)
    ax.text(women_val, i-0.25, f"{int(women_val):,}", ha='center', va='top', fontsize=11, fontweight="bold")

# Formatting
ax.set_yticks(range(len(brands)))
ax.set_yticklabels(brands)
ax.set_xlabel("Estimated Revenue (Units × Price)")
ax.set_title("Gender-wise Revenue Contribution — Puma vs Competitors",
             fontsize=14, fontweight="bold")

# Custom Legend
legend_elements = [
    Patch(facecolor=default_colors['Men'], label='Other Brands - Men'),
    Patch(facecolor=default_colors['Women'], label='Other Brands - Women'),
    Patch(facecolor=puma_colors['Men'], label='Puma - Men'),
    Patch(facecolor=puma_colors['Women'], label='Puma - Women')
]
ax.legend(handles=legend_elements, loc='best')

plt.tight_layout()
plt.show()

C:\Users\lakshita rawat\AppData\Local\Temp\ipykernel_16792\2713903690.py:223: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  gender_revenue = df.groupby(['Brand','Gender']).apply(

# Show every distinct value of the Material column, followed by a blank line.
unique_material = df['Material'].unique()
print("Product Material in the Dataset", unique_material, "", sep="\n")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Ensure Revenue column exists
# Without a units column every listing counts as one unit, so revenue == price.
if 'Revenue' not in df:
    df['Revenue'] = df['Price (USD)']

# Clean columns: trim whitespace on both grouping keys
for key_col in ('Brand', 'Material'):
    df[key_col] = df[key_col].astype(str).str.strip()


# Aggregate: counts, revenue, avg price per (Brand, Material)
material_spec = {
    'count': ('Material', 'size'),
    'Revenue': ('Revenue', 'sum'),
    'avg_price': ('Price (USD)', 'mean'),
}
agg = df.groupby(['Brand', 'Material']).agg(**material_spec).reset_index()

# brand totals for share %: each material's count relative to its brand's listings
brand_totals = agg.groupby('Brand')['count'].sum().rename('brand_total').reset_index()
agg = agg.merge(brand_totals, on='Brand', how='left')
agg['share_%'] = agg['count'].mul(100).div(agg['brand_total'])


# Keep Top 5 materials PER BRAND
# (rows are sorted by share within each brand, so head(5) picks the largest)
agg = agg.sort_values(['Brand', 'share_%'], ascending=[True, False])
top5 = agg.groupby('Brand').head(5).reset_index(drop=True)


# Color map for materials (consistent across plots): one tab20 colour per
# material, in order of first appearance in top5
materials = list(top5['Material'].unique())
palette = sns.color_palette("tab20", n_colors=len(materials))
mat_color = dict(zip(materials, palette))

# Helper: brand order and x positions
brands = list(top5['Brand'].unique())
x = np.arange(len(brands))
bar_width = 0.6   # width of each brand bar (stacked style)


# Material Share(%) by Brand
# One stacked bar per brand; Puma's segments get a black outline.

plt.figure(figsize=(max(12, len(brands) * 0.9), 10))
ax = plt.gca()

for xpos, brand in zip(x, brands):
    # sort for stable stacking
    segments = top5[top5['Brand'] == brand].sort_values('Material')
    if brand.strip().lower() == 'puma':
        outline = {'edgecolor': 'black', 'linewidth': 1.6}
    else:
        outline = {'edgecolor': None, 'linewidth': 0.0}

    bottom = 0.0
    for mat, val in zip(segments['Material'], segments['share_%']):
        ax.bar(xpos, val, bottom=bottom, width=bar_width,
               color=mat_color[mat], **outline)

        # annotate inside segment if large enough
        if val >= 3:
            ax.text(xpos, bottom + val/2, f"{val:.1f}%", ha='center', va='center', fontsize=10, fontweight='bold')

        bottom += val

# Legend (materials): proxy rectangles, one per material colour
handles = [plt.Rectangle((0,0),1,1, color=mat_color[m]) for m in materials]
ax.legend(handles, materials, title='Material', bbox_to_anchor=(1.02,1), loc='upper left')

ax.set_xticks(x)
ax.set_xticklabels(brands, rotation=45, ha='right')
ax.set_xlabel('Brand')
ax.set_ylabel('Share (%)')
ax.set_title('Material Share by Brand (%) — Top 5 per Brand')
plt.grid(False)
plt.tight_layout()
plt.show()



# Revenue by Material — stacked
# One stacked bar per brand; Puma's segments get a black outline.

plt.figure(figsize=(max(12, len(brands) * 0.9), 8))
ax = plt.gca()

# Segments below 3% of the largest single segment stay unlabeled
label_threshold = 0.03 * top5['Revenue'].max()

for xpos, brand in zip(x, brands):
    segments = top5[top5['Brand'] == brand].sort_values('Material')
    if brand.strip().lower() == 'puma':
        outline = {'edgecolor': 'black', 'linewidth': 1.6}
    else:
        outline = {'edgecolor': None, 'linewidth': 0.0}

    bottom = 0.0
    for mat, val in zip(segments['Material'], segments['Revenue']):
        ax.bar(xpos, val, bottom=bottom, width=bar_width,
               color=mat_color[mat], **outline)

        # annotate only larger, non-empty segments
        if val > 0 and val >= label_threshold:
            ax.text(xpos, bottom + val/2, f"{int(val):,}", ha='center', va='center', fontsize=10)

        bottom += val

# legend: proxy rectangles, one per material colour
handles = [plt.Rectangle((0,0),1,1, color=mat_color[m]) for m in materials]
ax.legend(handles, materials, title='Material', bbox_to_anchor=(1.02,1), loc='upper left')

ax.set_xticks(x)
ax.set_xticklabels(brands, rotation=45, ha='right')
ax.set_xlabel('Brands')
ax.set_ylabel('Revenue (USD)')
ax.set_title('Revenue by Material — Top 5 per Brand')
plt.grid(False)
plt.tight_layout()
plt.show()

Product Material in the Dataset
['Leather' 'Primeknit' 'Canvas' 'Mesh' 'Suede/Canvas' 'Suede/Mesh'
 'Leather/Synthetic' 'Suede' 'Synthetic' 'Mesh/Synthetic' 'Suede/Textile'
 'Knit' 'Nylon' 'Flexweave' 'Flyknit' 'Mesh/Leather' 'Suede/Leather'
 'Canvas/Suede' 'Textile/Leather' 'Leather/Suede' 'Synthetic/Mesh'
 'Canvas/Leather' 'Nylon/Suede' 'Flexweave/Knit' 'Suede/Nylon'
 'Primeknit/Synthetic' 'Synthetic/Leather' 'Leather/Mesh' 'Knit/Synthetic'
 'Synthetic/Textile' 'Textile' 'Flexweave/Cushioning'
 'Flexweave/Synthetic' 'Mesh/Suede']

import matplotlib.pyplot as plt
import seaborn as sns

# Price Segment White Space
# Counts products per brand within each price band and draws grouped bars:
# one group per price band, one bar per brand, with Puma's bars in orange.

price_bins = [0,50,100,150,200,500]
df['Price Segment'] = pd.cut(df['Price (USD)'], bins=price_bins)

# 'Price Segment' is categorical; observed=False is passed explicitly so every
# price band is kept (filled with 0 where a brand has no products) and the
# pandas FutureWarning about the changing default is not raised.
price_seg = (
    df.groupby(['Brand', 'Price Segment'], observed=False)
      .size()
      .unstack(fill_value=0)
)

# price_seg has brands as rows and price bands as columns; transpose so the
# x-axis groups are price bands and each plotted series is a brand.
# DataFrame.plot creates its own figure, so no separate plt.figure() call is
# made (it would only leave an empty figure behind).
seg_by_brand = price_seg.T
ax = seg_by_brand.plot(kind='bar', figsize=(14,6), width=1)

# Highlight Puma
# The bar plot adds one bar container per plotted column (brand), in column
# order, so pairing containers with the columns identifies Puma's bars.
for container, brand in zip(ax.containers, seg_by_brand.columns):
    if brand == "Puma":
        for bar in container:
            bar.set_color("orange")

# The legend was drawn before the recolouring; rebuild it so Puma's swatch
# matches its bars.
ax.legend(title="Brand")

plt.title("Price Segment Coverage by Brand", fontsize=14, fontweight="bold")
plt.ylabel("Number of Products")
plt.xlabel("Price Segment")
plt.xticks(rotation=45)
plt.grid(False)

# Annotations: product count above every non-empty bar
for p in ax.patches:
    h = p.get_height()
    if h > 0:
        ax.text(p.get_x() + p.get_width()/2, h+0.2, str(int(h)),
                ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.show()



# Feature/Attribute Gaps (Colors, Materials, Gender)
# For every brand, counts how many distinct colors, materials and gender
# categories appear in its listings, then draws one bar per
# "<brand> - <feature>" pair with Puma's bars in orange.

by_brand = df.groupby('Brand')
feature_dist = {
    "Colors": by_brand['Color'].nunique(),
    "Materials": by_brand['Material'].nunique(),
    # Distinct gender categories per brand. Summing the per-gender group sizes
    # would give the brand's total listing count, which is not a unique count
    # and would not match the "Unique Count" axis.
    "Gender": by_brand['Gender'].nunique(),
}
feature_df = pd.DataFrame(feature_dist)

# Bar colours depend only on the brand index, so build the list once.
colors = ['orange' if idx == 'Puma' else 'skyblue' for idx in feature_df.index]

plt.figure(figsize=(12,6))
for col in feature_df.columns:
    bars = plt.bar(feature_df.index + " - " + col, feature_df[col], color=colors)
    # value label above each bar
    for bar in bars:
        h = bar.get_height()
        plt.text(bar.get_x()+bar.get_width()/2, h+0.5, str(int(h)),
                 ha='center', va='bottom', fontweight='bold')
plt.title("Feature/Attribute Coverage by Brand", fontsize=14, fontweight='bold')
plt.ylabel("Unique Count")
plt.xticks(rotation=90)
plt.grid(False)
plt.show()



#  Future Trend White Space
# Marks a product as "future trend" when its model name contains any of the
# keywords below (case-insensitive), then charts the number of such products
# per brand as horizontal bars, with Puma in orange.

trend_keywords = "eco|sustain|flyknit|recycled|knit|boost|foam|future|quantum"
model_names = df['Model']
df['Future Trend'] = model_names.str.contains(trend_keywords, case=False, na=False)

# Summing the boolean flag gives the count of flagged products per brand.
flag_by_brand = df.groupby('Brand')['Future Trend']
trend_counts = flag_by_brand.sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12,7))

colors = []
for brand_name in trend_counts.index:
    if brand_name == "Puma":
        colors.append("orange")
    else:
        colors.append("skyblue")

bars = ax.barh(trend_counts.index, trend_counts.values, color=colors, height=0.6)

# Count label just to the right of each bar, vertically centred on it.
for rect in bars:
    bar_len = rect.get_width()
    y_mid = rect.get_y() + rect.get_height()/2
    ax.text(bar_len + 0.5, y_mid, str(int(bar_len)),
            va='center', fontsize=11, fontweight='bold')

ax.set_title("Future Trend Products (Eco/Sustainable/Innovative Models)", fontsize=16, fontweight="bold")
ax.set_ylabel("Brand")
ax.set_xlabel("Number of Trendy Models")
ax.grid(False)
plt.tight_layout()
plt.show()

C:\Users\lakshita rawat\AppData\Local\Temp\ipykernel_16792\3171486252.py:9: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  price_seg = df.groupby(['Brand','Price Segment']).size().unstack(fill_value=0)

<Figure size 1200x600 with 0 Axes>

Puma's Competitors Analysis¶

Data Overview¶

Price Benchmarking & Gap Analysis¶

Final Insights¶

Brand Analysis¶

Final Insights¶

Gender Analysis¶

Final Insights¶

Material Analysis¶

Final Insights¶

Final Insights¶

	Brand	Model	Type	Gender	Size	Color	Material	Price (USD)
0	Nike	Air Jordan 1	Basketball	Men	US 10	Red/Black	Leather	170
1	Adidas	Ultra Boost 21	Running	Men	US 9.5	Black	Primeknit	180
2	Reebok	Classic Leather	Casual	Men	US 11	White	Leather	75
3	Converse	Chuck Taylor	Casual	Women	US 8	Navy	Canvas	55
4	Puma	Future Rider	Lifestyle	Women	US 7.5	Pink	Mesh	80

	Brand	num_models	listings	min_price	median_price	avg_price	max_price	avg_price_rank
0	Adidas	21	100	25	140.0	146.55	220	1
1	Asics	26	100	70	120.0	130.80	180	2
5	Nike	29	118	85	120.0	122.54	250	3
4	New Balance	27	100	80	110.0	117.15	175	4
7	Reebok	26	100	60	120.0	109.15	200	5
6	Puma	28	100	65	100.0	100.90	140	6
2	Converse	16	100	55	75.0	77.55	120	7
8	Skechers	36	89	50	65.0	71.00	125	8
3	Fila	23	99	60	70.0	69.95	95	9
9	Vans	7	100	50	65.0	60.00	70	10

	avg_price	median_price	min_price	max_price	listings	diff_avg_vs_puma	diff_median_vs_puma
Brand
Adidas	146.55	140.0	25	220	100	45.65	40.0
Asics	130.80	120.0	70	180	100	29.90	20.0
Nike	122.54	120.0	85	250	118	21.64	20.0
New Balance	117.15	110.0	80	175	100	16.25	10.0
Reebok	109.15	120.0	60	200	100	8.25	20.0
Puma	100.90	100.0	65	140	100	0.00	0.0
Converse	77.55	75.0	55	120	100	-23.35	-25.0
Skechers	71.00	65.0	50	125	89	-29.90	-35.0
Fila	69.95	70.0	60	95	99	-30.95	-30.0
Vans	60.00	65.0	50	70	100	-40.90	-35.0