import pandas as pd
# Load the cleaned UPI dataset and print a quick structural overview of it.
df = pd.read_csv(r"C:\Users\lakshita rawat\Downloads\Jupyter_files\cleaned_procressed_file_2.csv")
df.head()

print(df.shape)  # (rows, columns)
print()

print(df.columns)
print()

print(df.isnull().sum())  # missing values per column
print()

print(df.duplicated().sum())  # number of fully duplicated rows
print()

print(df.describe())
print()

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

(75, 5)

Index(['Months', 'Year', 'No. of Banks live on UPI',
       'Count of UPI transactions (In Mn)',
       'Total Amount of UPI transactions (In Mn)                              '],
      dtype='object')

Months                                                                    0
Year                                                                      0
No. of Banks live on UPI                                                  0
Count of UPI transactions (In Mn)                                         0
Total Amount of UPI transactions (In Mn)                                  0
dtype: int64

0

              Year  No. of Banks live on UPI  \
count    75.000000                 75.000000   
mean   2021.640000                336.946667   
std       1.820603                175.851127   
min    2019.000000                134.000000   
25%    2020.000000                166.000000   
50%    2022.000000                304.000000   
75%    2023.000000                488.000000   
max    2025.000000                661.000000   

       Count of UPI transactions (In Mn)  \
count                          75.000000   
mean                         6449.493333   
std                          5298.089243   
min                           673.000000   
25%                          1558.000000   
50%                          4617.000000   
75%                         10571.000000   
max                         18302.000000   

       Total Amount of UPI transactions (In Mn)                                
count                                       7.500000e+01                       
mean                                        9.996774e+05                       
std                                         7.356006e+05                       
min                                         1.067370e+05                       
25%                                         2.944230e+05                       
50%                                         8.319930e+05                       
75%                                         1.577835e+06                       
max                                         2.477222e+06                       

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 5 columns):
 #   Column                                                                  Non-Null Count  Dtype 
---  ------                                                                  --------------  ----- 
 0   Months                                                                  75 non-null     object
 1   Year                                                                    75 non-null     int64 
 2   No. of Banks live on UPI                                                75 non-null     int64 
 3   Count of UPI transactions (In Mn)                                       75 non-null     int64 
 4   Total Amount of UPI transactions (In Mn)                                75 non-null     int64 
dtypes: int64(4), object(1)
memory usage: 3.1+ KB
None

import matplotlib.pyplot as plt
import pandas as pd

# Bar chart of the total UPI transaction count per calendar year.
count_col = "Count of UPI transactions (In Mn)"

# Anything in "Year" that is not numeric becomes NaN.
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")

# Sum the counts per year, then left-join onto an unbroken range of years so
# that a year with no rows still gets a (zero-height) bar.
per_year = df.groupby("Year")[count_col].sum().reset_index()
first_year = int(per_year["Year"].min())
last_year = int(per_year["Year"].max())
all_years = pd.DataFrame({"Year": range(first_year, last_year + 1)})
yearly_txn = all_years.merge(per_year, on="Year", how="left").fillna(0)
yearly_txn[count_col] = yearly_txn[count_col].astype(int)

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(yearly_txn["Year"], yearly_txn[count_col], color="skyblue")

# One tick per year.
ax.set_xticks(yearly_txn["Year"])

ax.set_title("Total UPI Transactions per Year", fontsize=14, weight="bold")
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("Transactions Count (In Million)", fontsize=12)

# Thousands separators on the y-axis tick labels.
ax.yaxis.set_major_formatter(
    plt.FuncFormatter(lambda tick, _: f"{int(tick):,}")
)

# Value label just above each bar (1% of the bar height plus a fixed gap).
for rect in bars:
    height = rect.get_height()
    ax.text(
        rect.get_x() + rect.get_width() / 2,
        height + (height * 0.01 + 10),
        f"{height:,} M",
        ha="center",
        va="bottom",
        fontsize=10,
        weight="bold",
    )

fig.tight_layout()
plt.show()

# Strip header whitespace BEFORE renaming: the rename key is an exact-match
# lookup, so a header that still carries leading/trailing padding would be
# skipped silently if the rename ran first.
df.columns = df.columns.str.strip()

# Map the shortened header onto the canonical one; a no-op when the column is
# already named correctly.
df.rename(columns={"Count transactions (In Mn)": "Count of UPI transactions (In Mn)"}, inplace=True)
print(df.columns)

# Yearly totals of both the transaction amount and the transaction count.
yearly_txn = df.groupby("Year")[[
    "Total Amount of UPI transactions (In Mn)",
    "Count of UPI transactions (In Mn)"
]].sum().reset_index()

Index(['Months', 'Year', 'No. of Banks live on UPI',
       'Count of UPI transactions (In Mn)',
       'Total Amount of UPI transactions (In Mn)'],
      dtype='object')

import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of the total UPI transaction amount per year, drawn from
# hard-coded yearly totals.
#
# The example frame is kept in its own variable. Assigning it to `df` would
# replace the loaded dataset with a 7-row, 2-column frame, and every later
# cell that reads "Months" or "Count of UPI transactions (In Mn)" from `df`
# would then fail with a KeyError.
amount_col = "Total Amount of UPI transactions (In Mn)"
data = {
    "Year": [2019, 2020, 2021, 2022, 2023, 2024, 2025],
    amount_col: [1836638, 3387746, 7159286, 12595078, 18292795, 24682502, 7021741],
}
yearly_amount = pd.DataFrame(data)

# Bar heights use absolute values; the labels keep the "in millions" figures.
absolute_amount = yearly_amount[amount_col] * 1_000_000

plt.figure(figsize=(10, 6))
bars = plt.bar(yearly_amount["Year"], absolute_amount, color="lightpink", edgecolor="black")

# Label each bar with its value in millions.
for bar, amount_mn in zip(bars, yearly_amount[amount_col]):
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height(),
        f"{float(amount_mn):.0f}M",
        ha="center", va="bottom", fontsize=9, fontweight="bold",
    )

plt.title("Total UPI Transaction Amount Over Years")
plt.xlabel("Year")
plt.ylabel("Total Transaction Amount")
# One tick per year. The original set the ticks twice; only the last call
# took effect, so a single call is kept.
plt.xticks(yearly_amount["Year"])
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

import matplotlib.pyplot as plt

# Total transaction count per year, used here as the measure of UPI adoption.
yearly_adoption = df.groupby("Year")["Count of UPI transactions (In Mn)"].sum()

# Bar chart with horizontal year labels.
fig, ax = plt.subplots(figsize=(8, 5))
yearly_adoption.plot.bar(ax=ax, color="purple", edgecolor="black", rot=0)
ax.set_title("UPI Adoption by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Total Number of Transactions")
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Report the year whose summed transaction count is the largest.
highest_year = yearly_adoption.idxmax()
print(f"Highest UPI adoption occurred in the year: {highest_year}")

Highest UPI adoption occurred in the year: 2024

# Calendar order for the x-axis (groupby alone would not guarantee it).
month_order = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

# Transaction count summed per month name across all years, in calendar order.
# NOTE(review): if the dataset does not end on a December, some months are
# summed over more years than others, which biases the "highest month" below
# - confirm against the date range of the data.
monthly_adoption = (
    df.groupby("Months")["Count of UPI transactions (In Mn)"]
    .sum()
    .reindex(month_order)
)

fig, ax = plt.subplots(figsize=(10, 5))
monthly_adoption.plot.bar(ax=ax, color="lightgreen", edgecolor="black", rot=45)
ax.set_title("UPI Adoption by Month")
ax.set_xlabel("Month")
ax.set_ylabel("Total Number of Transactions")
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Report the month whose summed transaction count is the largest.
highest_month = monthly_adoption.idxmax()
print(f"Highest UPI adoption occurred in the month: {highest_month}")

Highest UPI adoption occurred in the month: March

import matplotlib.pyplot as plt
import pandas as pd

# Seasonality plot: one line per year showing the monthly transaction amount.

# --- Ensure df is loaded ---
try:
    df
except NameError as exc:
    # Same exception type as before; chaining keeps the original NameError
    # visible in the traceback.
    raise ValueError("DataFrame 'df' is not defined. Please load your dataset first.") from exc

# --- Clean column names just in case ---
df.columns = df.columns.str.strip()

# Calendar order used both for the categorical dtype and for the row order.
month_order = ["January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December"]

# --- Validate required columns ---
required_cols = ["Months", "Year", "Total Amount of UPI transactions (In Mn)"]
for col in required_cols:
    if col not in df.columns:
        raise ValueError(f"Missing required column: {col}")

# Convert Months into an ordered categorical. Values that are not one of the
# twelve names in month_order become NaN.
df["Months"] = pd.Categorical(df["Months"], categories=month_order, ordered=True)

# Months as rows, years as columns, summed transaction amount as values.
# observed=False is passed explicitly: the index is categorical, and relying
# on the implicit default raises a FutureWarning because that default is
# changing. Passing it keeps a row for every month, including months with
# no data.
pivot_value = df.pivot_table(
    index="Months",
    columns="Year",
    values="Total Amount of UPI transactions (In Mn)",
    aggfunc="sum",
    observed=False,
).reindex(month_order)

# --- Plot ---
plt.figure(figsize=(12, 6))
for year in pivot_value.columns:
    plt.plot(pivot_value.index, pivot_value[year], marker='o', label=year)

plt.title("Seasonality of UPI Transactions")
plt.xlabel("Months")
plt.ylabel("Total Amount of UPI Transactions (In Mn)")
plt.xticks(rotation=45)
plt.legend(title="Year")
plt.grid(True)
plt.show()

C:\Users\lakshita rawat\AppData\Local\Temp\ipykernel_15748\558070324.py:27: FutureWarning: The default value of observed=False is deprecated and will change to observed=True in a future version of pandas. Specify observed=False to silence this warning and retain the current behavior
  pivot_value = df.pivot_table(

import pandas as pd
# Reload the dataset from disk so the following analysis starts from the raw
# file contents rather than from a frame modified by earlier cells.
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\lakshita rawat\Downloads\Jupyter_files\cleaned_procressed_file_2.csv",
)
df.head()

# Show the raw header names as read from the file.
df.columns

Index(['Months', 'Year', 'No. of Banks live on UPI',
       'Count of UPI transactions (In Mn)',
       'Total Amount of UPI transactions (In Mn)                              '],
      dtype='object')

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker

# Line chart comparing the yearly transaction count with the yearly
# transaction amount on a shared logarithmic y-axis.


def _format_scaled_tick(value, _pos=None):
    """Return a compact tick label with a K / M / B suffix.

    Values below 1,000 are shown as-is; without that branch they would be
    rendered as "0K" or "1K". Values from one billion upwards get a "B"
    suffix instead of growing into labels such as "1000M".
    """
    if value < 1e3:
        return f"{value:g}"
    if value < 1e6:
        return f"{value / 1e3:.0f}K"
    if value < 1e9:
        return f"{value / 1e6:.0f}M"
    return f"{value / 1e9:.0f}B"


# Clean up column names (removes leading/trailing spaces)
df.columns = df.columns.str.strip()

count_col = "Count of UPI transactions (In Mn)"
amount_col = "Total Amount of UPI transactions (In Mn)"

# Create a complete range of years
years = np.arange(2019, 2026)

# Sum both measures per year. A year without rows is left as NaN rather than
# filled with 0: zero cannot be drawn on a log axis, so a NaN gap in the line
# is the honest way to show missing data.
grouped = df.groupby("Year").agg({
    count_col: "sum",
    amount_col: "sum",
}).reindex(years)

plt.figure(figsize=(10, 5))

# Plot transactions count
plt.plot(grouped.index, grouped[count_col],
         label="Transactions Count (Mn)", marker='o')

# Plot transaction value
plt.plot(grouped.index, grouped[amount_col],
         label="Transaction Value (Mn)", marker='o')

# Annotate only the transaction counts (the amount values are much larger),
# skipping years that have no data.
for x, y in grouped[count_col].dropna().items():
    plt.text(x, y, f"{y:.1f}", ha='center', va='bottom', fontsize=7, clip_on=True)

# Title & labels
plt.title("Transactions Count vs Value by Year")
plt.xlabel("Year")
plt.ylabel("Scale (Mn)")

# Show all years
plt.xticks(years)

# Log scale with compact tick labels (e.g. 10K, 1M, 1B)
plt.yscale('log')
plt.gca().yaxis.set_major_formatter(ticker.FuncFormatter(_format_scaled_tick))

plt.legend()
plt.grid(True, which="both", linestyle="--", linewidth=0.5)
plt.tight_layout()
plt.show()

# Scatter plot of the number of live banks against the transaction amount,
# one point per row of the dataset.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(
    df["No. of Banks live on UPI"],
    df["Total Amount of UPI transactions (In Mn)"],
    alpha=0.7,
)
# Logarithmic y-axis so small and large amounts are both readable.
ax.set_yscale("log")
ax.set_title("Effect of Banks on Transaction Value (Log Scale)")
ax.set_xlabel("Number of Banks Live")
ax.set_ylabel("Total Amount (Mn, Log Scale)")
ax.grid(True)
plt.show()

import matplotlib.pyplot as plt

# Dual-axis chart: transaction count (bars) and transaction amount (line),
# both summed per month name across all years.
count_col = "Count of UPI transactions (In Mn)"
amount_col = "Total Amount of UPI transactions (In Mn)"

month_order = ["January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December"]

# observed=True keeps only months that actually occur and avoids the
# FutureWarning raised when "Months" is categorical and `observed` is left
# to its changing default.
monthly = df.groupby("Months", observed=True).agg({
    count_col: "sum",
    amount_col: "sum",
})

# Put the rows in calendar order explicitly. Without this the order depends
# on the dtype of "Months": alphabetical for plain strings, calendar order
# only if an earlier cell happened to convert the column to an ordered
# categorical. Labels that are not recognised month names are kept at the end
# rather than dropped.
known_months = [m for m in month_order if m in monthly.index]
other_labels = [m for m in monthly.index if m not in month_order]
monthly = monthly.reindex(known_months + other_labels)
month_labels = [str(m) for m in monthly.index]

fig, ax1 = plt.subplots(figsize=(12, 6))

# Bar plot for transaction count
bars = ax1.bar(month_labels, monthly[count_col],
               color="skyblue", label="Transaction Count")
ax1.set_ylabel("Transaction Count (Mn)", color="blue")
ax1.tick_params(axis="y", labelcolor="blue")

# Annotate bars with values
for bar in bars:
    yval = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width() / 2, yval, f"{yval:.0f}",
             ha='center', va='bottom', fontsize=9, color="blue")

# Line plot for transaction value on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(month_labels, monthly[amount_col],
         color="green", marker="o", label="Transaction Value")
ax2.set_ylabel("Transaction Value (Mn)", color="green")
ax2.tick_params(axis="y", labelcolor="green")

# Annotate line points with values
for x, y in zip(month_labels, monthly[amount_col]):
    ax2.text(x, y, f"{y:.0f}", ha='center', va='bottom', fontsize=9, color="green")

plt.title("Do Months Influence UPI Transactions?")
fig.tight_layout()
plt.show()

C:\Users\lakshita rawat\AppData\Local\Temp\ipykernel_4268\4087693079.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  monthly = df.groupby("Months").agg({

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Line chart of the average value per transaction for each year.
amount_col = "Total Amount of UPI transactions (In Mn)"
count_col = "Count of UPI transactions (In Mn)"

# Ensure numeric columns; anything unparseable becomes NaN.
for col in (amount_col, count_col, "Year"):
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Per-row average value of a transaction. This adds a new column to df.
df["Avg_Value_per_Transaction"] = df[amount_col] / df[count_col]

# Mean of the per-row averages for each year, reindexed onto a continuous
# range of years so that a year without data shows up as a gap.
years = pd.Index(range(2019, 2026), name="Year")
yearly_avg = df.groupby("Year")["Avg_Value_per_Transaction"].mean().reindex(years)

plt.figure(figsize=(11, 6))
plt.plot(yearly_avg.index, yearly_avg.values, marker="o", color="purple")
plt.title("Average Value per Transaction Over Years")
plt.xlabel("Year")
# Both input columns are labelled "In Mn", so the scale factor cancels in the
# ratio: the result is a per-transaction value, not a value in millions.
plt.ylabel("Avg Value per Transaction")
plt.xticks(years)
plt.grid(True, alpha=0.3)

# Label offset relative to the tallest point, with a fixed fallback when the
# maximum is not finite (e.g. every year is NaN).
ymax = np.nanmax(yearly_avg.values)
offset = 0.02 * ymax if np.isfinite(ymax) else 1.0

# Annotate only the years that have a value.
for x, y in yearly_avg.dropna().items():   # (x=year, y=value)
    y = float(y)
    plt.text(x, y + offset, f"{y:,.0f}", ha="center", va="bottom", fontsize=9)

plt.tight_layout()
plt.show()

import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap over the int64 / float64 columns of the dataset.
numeric_df = df.select_dtypes(include=["int64", "float64"])
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
# annot + fmt print each coefficient in its cell with two decimals.
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap of UPI Dataset")
plt.show()

Data Overview¶

Univariate Analysis¶

Examining the data to find out the highest year of UPI Adoption¶

Reviewing the data to highlight the month where UPI adoption is maximum¶

From this seasonality graph we can draw the following insights:

Bivariate Analysis¶

Comparing growth in Transaction Count with the Transaction Value (Overall trend of UPI transactions from 2019 to 2025)

Insights from the graph-¶

Impact of Live Banks on Transaction Growth¶

Insights from the graph-¶

Interpreting the influence of months on transaction counts or values¶

Insights from the graph¶

Over the years, is transaction growth linked more to usage (count) or to value per transaction?

Insights from the graph¶

Correlation Analysis of UPI Dataset¶

Insights from the graph¶

	Months	Year	No. of Banks live on UPI	Count of UPI transactions (In Mn)	Total Amount of UPI transactions (In Mn)
0	January	2019	134	673	109932
1	February	2019	139	674	106737
2	March	2019	142	800	133461
3	April	2019	144	782	142034
4	May	2019	143	734	152449