Altyn Baigaliyeva

Part I: Data Exploration and Initial Analysis

Step 1: We use pd.read_csv() and head() to load and preview the data because it allows us to check the structure of the dataset and make sure it was loaded correctly.

Step 2: We use info() and isnull().sum() to analyze data types and missing values because it helps us determine if the data needs to be cleaned before analysis.

Step 3: We use select_dtypes() to separate variables into numeric and categorical because different data types require different analysis and visualization methods.

A. Data Exploration

1. Load the dataset into Python.

2. Call the head() function to display the first few rows of the data

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

file_path = "lobster_loyalty.csv"
df = pd.read_csv(file_path)

print(df.head())

df.info()
print(df.isnull().sum())

categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("Categorical columns:", categorical_columns)
print("Numerical columns:", numerical_columns)
   Customer_ID Membership_Tier  Spending_Per_Visit  Visit_Count
0            1          Silver               38.42           17
1            2          Bronze               35.01            1
2            3          Bronze               27.60            6
3            4          Silver               29.23           23
4            5            Gold               36.52           25
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer_ID         500 non-null    int64  
 1   Membership_Tier     500 non-null    object 
 2   Spending_Per_Visit  500 non-null    float64
 3   Visit_Count         500 non-null    int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 15.8+ KB
Customer_ID           0
Membership_Tier       0
Spending_Per_Visit    0
Visit_Count           0
dtype: int64
Categorical columns: ['Membership_Tier']
Numerical columns: ['Customer_ID', 'Spending_Per_Visit', 'Visit_Count']

3. Which of the variables in this dataset are categorical, and which are numeric?

Categorical columns: 'Membership_Tier'

Numerical columns: 'Customer_ID', 'Spending_Per_Visit', 'Visit_Count'

4. Check for missing values. Are there any? If so, make a decision regarding removal and/or imputation. If not, drive on to the next step.

There are no missing values, so no data cleaning is required. Categorical variable: Membership_Tier (membership level). Numeric variables: Spending_Per_Visit (average spend per visit), Visit_Count (number of visits), and Customer_ID (an identifier, which does not need to be analyzed).

B. Customer Behavior by Membership Tier

1. Create a bar plot to compare the average amount spent per visit by membership tier. Sort the bars either from tallest to shortest, or shortest to tallest. Fill them with any (non-default) color of your choice. Be sure to include a title, along with axis labels

To complete the next step, we need to:

  • Plot a bar chart of the average spend (Spending_Per_Visit) for each membership tier using barplot(), because it allows us to visually compare the differences between groups.
  • Sort the bars in ascending or descending order to make trends easier to see.
  • Add a title and axis labels to make the graph informative.

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate average spending per visit for each membership tier
avg_spending = df.groupby("Membership_Tier")["Spending_Per_Visit"].mean().sort_values()

# Define custom colors: fuchsia, peach, and amber
custom_colors = ["#FF00FF", "#FFDAB9", "#FFBF00"]  # Fuchsia, Peach, Amber

# Create the bar plot
plt.figure(figsize=(8, 5))
sns.barplot(x=avg_spending.index, y=avg_spending.values, hue=avg_spending.index, 
            palette=custom_colors, legend=False)  # Explicit hue assignment

# Add labels and title
plt.xlabel("Membership Tier")
plt.ylabel("Average Spending Per Visit ($)")
plt.title("Average Spending Per Visit by Membership Tier")

# Show the plot
plt.show()
[Figure: bar plot of average spending per visit by membership tier]

To visually compare the average spending per visit across membership tiers, we created a bar plot using seaborn.

  • We grouped the data by Membership_Tier and calculated the mean spending per visit.
  • Instead of the default colors, we used custom colors:
    • Fuchsia (#FF00FF) for one tier
    • Peach (#FFDAB9) for another
    • Amber (#FFBF00) for the third
  • To comply with newer seaborn versions (which deprecate passing palette without hue), we assigned hue=avg_spending.index and disabled the legend (legend=False).

The resulting plot provides a clear and visually appealing comparison of spending habits across different membership levels.

2. What does this bar plot suggest about customer spending habits?

Graph analysis (average spending per visit by membership tier): the higher the membership tier, the higher the average spend per visit.

Gold customers spend the most per visit, Silver customers are at the middle level, and Bronze customers spend the least. This suggests the loyalty program is effective.

Higher membership tiers are associated with higher spending, which may indicate that higher status motivates customers to spend more. Recommendations:

Analyze which bonuses and privileges motivate customers to move up a tier. Consider strategies to raise the average spend of Bronze customers (for example, additional discounts or bonus programs).

Part II: Hypothesis Testing – Spending by Membership Tier

A. Formulating Hypotheses

Hypotheses for the one-way ANOVA test:

Null hypothesis (H₀): Average spending per visit does not differ between membership tiers (Bronze, Silver, Gold).

Alternative hypothesis (H₁): Average spending per visit differs for at least one pair of membership tiers.

B. Running the ANOVA Test

1. Perform a one-way ANOVA test comparing spending per visit across the three membership tiers.

2. Report the F-statistic and p-value.

We used a one-way ANOVA (Analysis of Variance) test because we wanted to compare the average spending per visit between three independent membership level groups (Bronze, Silver, Gold). ANOVA is used when there are more than two groups and we want to determine if there are statistically significant differences between them.

In [6]:
from scipy.stats import f_oneway

# Extract spending per visit for each membership tier
bronze_spending = df.loc[df["Membership_Tier"] == "Bronze", "Spending_Per_Visit"]
silver_spending = df.loc[df["Membership_Tier"] == "Silver", "Spending_Per_Visit"]
gold_spending = df.loc[df["Membership_Tier"] == "Gold", "Spending_Per_Visit"]

# Perform one-way ANOVA test
anova_result = f_oneway(bronze_spending, silver_spending, gold_spending)

# Output F-statistic and p-value
print(f"ANOVA Test Results: F-statistic = {anova_result.statistic:.4f}, p-value = {anova_result.pvalue:.4f}")
ANOVA Test Results: F-statistic = 231.9584, p-value = 0.0000

3. Does membership tier significantly impact spending?

ANOVA test results analysis: F-statistic = 231.9584 (a high value indicates large differences between group means); p-value = 0.0000 (p < 0.05).

Conclusion: membership tier significantly affects average spending per visit. Since the p-value is far below 0.05, we reject the null hypothesis and conclude that there are statistically significant differences in spending between membership tiers.
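As a quick sanity check (not part of the original output), the rejection threshold for this F-statistic can be computed from the F distribution with 2 and 497 degrees of freedom:

```python
from scipy.stats import f

# Degrees of freedom: k - 1 = 2 between groups, N - k = 500 - 3 = 497 within
f_crit = f.ppf(0.95, dfn=2, dfd=497)  # critical value at alpha = 0.05
print(f"Critical F value: {f_crit:.4f}")
```

The observed F = 231.9584 far exceeds this critical value (roughly 3), which is consistent with rejecting H₀.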

C. Additional Analysis

If the ANOVA test is significant, conduct pairwise t-tests between membership groups (Gold vs. Silver, Gold vs. Bronze, Silver vs. Bronze) using a Bonferroni correction to adjust for multiple comparisons.

1. What is the new adjusted alpha threshold? What is the purpose of using a Bonferroni correction?

We perform 3 pairwise t-tests, which increases the probability of a false positive (Type I error). The Bonferroni correction reduces this risk by dividing the standard alpha level (0.05) by the number of tests (3), giving an adjusted threshold of 0.05 / 3 ≈ 0.0167.

2. Run the three tests.

In [7]:
from scipy.stats import ttest_ind

# Perform pairwise t-tests with unequal variance (Welch’s t-test)
t_stat1, p_val1 = ttest_ind(gold_spending, silver_spending, equal_var=False)
t_stat2, p_val2 = ttest_ind(gold_spending, bronze_spending, equal_var=False)
t_stat3, p_val3 = ttest_ind(silver_spending, bronze_spending, equal_var=False)

# Apply Bonferroni correction
alpha_corrected = 0.05 / 3

# Print results
print(f"T-test (Gold vs Silver): p-value = {p_val1:.4f} {'(Significant)' if p_val1 < alpha_corrected else '(Not Significant)'}")
print(f"T-test (Gold vs Bronze): p-value = {p_val2:.4f} {'(Significant)' if p_val2 < alpha_corrected else '(Not Significant)'}")
print(f"T-test (Silver vs Bronze): p-value = {p_val3:.4f} {'(Significant)' if p_val3 < alpha_corrected else '(Not Significant)'}")
T-test (Gold vs Silver): p-value = 0.0000 (Significant)
T-test (Gold vs Bronze): p-value = 0.0000 (Significant)
T-test (Silver vs Bronze): p-value = 0.0000 (Significant)

3. What do the pairwise comparisons reveal? Based on the results obtained here, what can you share with Lobster Land management about the three groups’ spending habits?

Pairwise t-test results: Gold vs Silver → p-value = 0.0000 → significant; Gold vs Bronze → p-value = 0.0000 → significant; Silver vs Bronze → p-value = 0.0000 → significant. Since all p-values are below 0.0167 (the Bonferroni-corrected threshold), the differences in average spending between all membership tiers are statistically significant.

Conclusion for Lobster Land management: the higher the membership tier, the higher the average spend per visit, and the differences between all tiers are significant, confirming the loyalty program's impact on spending. Recommendations:

  • Encourage Bronze and Silver customers to upgrade (additional bonuses, discounts, exclusive offers).
  • Research which factors keep customers at Bronze/Silver and work on motivating them to upgrade.

Part III: Chi-Square Goodness of Fit – Visit Frequency by Membership Tier

A. Data Engineering

1. Starting with the visit_count variable, create a new binned variable called “frequency_group.” Designate all visitors as either: frequent visitors, occasional visitors, or rare visitors. Frequent visitors should be those who visited the park 10 or more times in the season. Rare visitors should be those who visited 3 or fewer times, and occasional visitors should be everyone in between.

To perform the next step, we need to:

Create a new categorical variable frequency_group that divides customers by visit frequency. Use the apply() method to classify customers:

  • "Frequent": ≥ 10 visits
  • "Occasional": 4–9 visits
  • "Rare": ≤ 3 visits

In [8]:
# Define a function to categorize visit frequency
def categorize_visits(visits):
    if visits >= 10:
        return "Frequent"
    elif 4 <= visits <= 9:
        return "Occasional"
    else:
        return "Rare"

# Apply function to create a new column
df["frequency_group"] = df["Visit_Count"].apply(categorize_visits)

# Display the first few rows to verify
print(df[["Visit_Count", "frequency_group"]].head())
   Visit_Count frequency_group
0           17        Frequent
1            1            Rare
2            6      Occasional
3           23        Frequent
4           25        Frequent
In [9]:
# Count the number of customers in each frequency group
print(df["frequency_group"].value_counts())
frequency_group
Frequent      371
Occasional     97
Rare           32
Name: count, dtype: int64

Distribution of customers by frequency of visits groups:

Frequent customers: 371 (the largest group). Occasional customers: 97. Rare customers: 32 (the smallest group).

Conclusion: most customers visit the park 10 times or more, which indicates high loyalty among the core user base. However, 32 customers visited the park no more than 3 times, which may indicate weak engagement in this segment.

B. Next, let’s get ready to run a statistical test to explore the relationship between membership tier and frequency_group.

1. Before digging into the data, what is your null hypothesis regarding membership tier and frequency group?

Null hypothesis (H₀): Visit frequency (frequency_group) does not depend on membership tier (Membership_Tier); the proportions of frequent, occasional, and rare visitors are the same across all tiers.

2. What is the alternative hypothesis?

Alternative hypothesis (H₁): Visit frequency (frequency_group) depends on membership tier (Membership_Tier); the proportions of frequent, occasional, and rare visitors differ between tiers.

3. Under the null hypothesis, what are the expected numbers of Frequent, Occasional, and Rare visitors across each membership tier? (For this answer, you should be showing 9 total values). Do not just show the numbers here, but also show how you got them – what calculation generated these expected values?

Method for calculating expected frequencies:

Build a contingency table of observed counts. Calculate expected frequencies using the formula: E_ij = (Row Total × Column Total) / Grand Total

  • E_ij — expected number of customers in group (i, j)
  • Row Total — total number of customers in a given membership level
  • Column Total — total number of customers in a given frequency group
  • Grand Total — total number of customers in the dataset
In [10]:
import numpy as np

# Create a contingency table (actual counts)
contingency_table = pd.crosstab(df["Membership_Tier"], df["frequency_group"])

# Calculate expected frequencies
row_totals = contingency_table.sum(axis=1).values.reshape(-1, 1)  # Row totals
col_totals = contingency_table.sum(axis=0).values  # Column totals
grand_total = contingency_table.values.sum()  # Total number of observations

expected_frequencies = (row_totals @ col_totals.reshape(1, -1)) / grand_total

# Convert to DataFrame for better readability
expected_df = pd.DataFrame(expected_frequencies, index=contingency_table.index, columns=contingency_table.columns)

# Print results
print("Actual Frequencies:\n", contingency_table)
print("\nExpected Frequencies:\n", expected_df)
Actual Frequencies:
 frequency_group  Frequent  Occasional  Rare
Membership_Tier                            
Bronze                 67          55    32
Gold                  154           0     0
Silver                150          42     0

Expected Frequencies:
 frequency_group  Frequent  Occasional    Rare
Membership_Tier                              
Bronze            114.268      29.876   9.856
Gold              114.268      29.876   9.856
Silver            142.464      37.248  12.288

Preliminary findings: Gold and Silver members visit the park more often than expected, while Bronze members visit less often than expected. The differences between observed and expected values are large, suggesting that visit frequency may depend on membership tier.
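The gap between observed and expected counts can be verified directly from the tables above; a minimal sketch using the counts printed earlier:

```python
import numpy as np

# Observed counts from the contingency table above (rows: Bronze, Gold, Silver)
observed = np.array([[67, 55, 32],
                     [154, 0, 0],
                     [150, 42, 0]], dtype=float)

# Expected counts under independence: (row total * column total) / grand total
row_totals = observed.sum(axis=1, keepdims=True)
col_totals = observed.sum(axis=0, keepdims=True)
expected = row_totals * col_totals / observed.sum()

print(np.round(expected, 3))             # matches the expected table above
print(np.round(observed - expected, 3))  # positive = more visits than expected
```

The sign of each (O − E) entry shows the direction of the deviation: positive for Gold/Silver in the Frequent column, negative for Bronze.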

C. Running the Chi-Square Test

1. With your expected values, along with the observed values from the actual data, run a chi-square goodness of fit test in Python. What is the chi-square statistic, and what is the p-value, for this test?

To perform the next step, we must:

Use the observed (contingency_table) and expected (expected_df) frequencies and calculate the χ² statistic using chi2_contingency(), because this test determines whether there is a statistically significant relationship between membership tier (Membership_Tier) and visit frequency (frequency_group).

In [11]:
from scipy.stats import chi2_contingency

# Perform Chi-Square test
chi2_stat, p_value, dof, expected_values = chi2_contingency(contingency_table)

# Print results
print(f"Chi-Square Statistic: {chi2_stat:.4f}")
print(f"p-value: {p_value:.4f}")
Chi-Square Statistic: 157.2728
p-value: 0.0000

Chi-square test results analysis: χ² statistic = 157.2728 (a high value indicates large deviations from the expected counts); p-value = 0.0000 (p < 0.05).

Conclusion: since the p-value is below 0.05, we reject the null hypothesis. Membership tier (Membership_Tier) has a statistically significant association with visit frequency (frequency_group).

Gold and Silver customers visit the park more often than expected. Bronze customers visit the park less often than expected.

D. Interpreting the Results

1. Does membership level have a significant effect on visit frequency?

Yes, membership tier has a significant effect on visit frequency (p-value = 0.0000). We reject the null hypothesis, which means that the distribution of frequent, occasional, and rare visitors differs significantly by membership tier.

2. If there is a significant relationship, which membership tier shows the highest proportion of frequent visitors? To understand the overall impact of the membership tiers to Lobster Land’s profitability, what else would you need to know?

Which membership level has the highest proportion of frequent visitors?

Gold members – 100% Frequent (154 out of 154), far more than expected. Silver members – mostly Frequent (150 out of 192). Bronze members – the highest proportion of Rare visitors (32 out of 154) and fewer Frequent visitors than expected (67 instead of the expected ~114). Gold has the highest proportion of frequent visitors.
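These shares can be computed directly from the observed counts above; a minimal sketch:

```python
import pandas as pd

# Observed counts from the contingency table above
counts = pd.DataFrame(
    {"Frequent": [67, 154, 150], "Occasional": [55, 0, 42], "Rare": [32, 0, 0]},
    index=["Bronze", "Gold", "Silver"],
)

# Share of each frequency group within each tier (rows sum to 1)
shares = counts.div(counts.sum(axis=1), axis=0)
print(shares.round(3))
```

The same table can be produced straight from df with pd.crosstab(df["Membership_Tier"], df["frequency_group"], normalize="index").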

What additional data is needed to evaluate the impact of membership level on park profitability?

To more accurately assess the impact of membership on revenue, we need:

  • Average spend per visit (Spending_Per_Visit) broken down by visit frequency.
  • Membership duration data (how long customers remain at each tier).
  • Loyalty program costs (which bonuses are provided to customers, and whether they pay off).
  • Revenue from additional services (food, attractions, VIP areas).

Conclusion: Lobster Land management should focus on engaging Bronze customers and motivating them to upgrade to Silver/Gold.

E. Demonstrate where the chi-square number from your test came from.

Formula for the χ² criterion:

χ² = Σ [(O_i - E_i)² / E_i]

where:

  • O_i — observed value (Observed)

  • E_i — expected value (Expected)

  • Σ — sum over all cells of the contingency table

To perform this step we must:

Calculate the difference (O − E) for each cell. Square the difference and divide by the expected value (E). Sum all the resulting values to obtain the overall χ² statistic.

In [1]:
# Observed frequencies (from contingency table)
observed = [
    [67, 55, 32],  # Bronze (Frequent, Occasional, Rare)
    [154, 0, 0],   # Gold (Frequent, Occasional, Rare)
    [150, 42, 0]   # Silver (Frequent, Occasional, Rare)
]

# Expected frequencies (calculated earlier)
expected = [
    [114.268, 29.876, 9.856],   # Bronze
    [114.268, 29.876, 9.856],   # Gold
    [142.464, 37.248, 12.288]   # Silver
]

# Manually calculating chi-square statistic
chi_square = 0

# Loop through observed and expected values to compute chi-square
for i in range(len(observed)):  # Membership tiers
    for j in range(len(observed[i])):  # Frequency groups
        O = observed[i][j]  # Observed value
        E = expected[i][j]  # Expected value
        chi_square += ((O - E) ** 2) / E  # Apply chi-square formula

# Display result
print("Manually Calculated Chi-Square Statistic:", round(chi_square, 4))
Manually Calculated Chi-Square Statistic: 157.2728

The manually calculated χ² = 157.2728 matches the result from chi2_contingency(), which confirms the correctness of the calculations.

We manually input the observed and expected frequency values as lists.

We initialize chi_square = 0 to store the sum.

We iterate through each observed and expected value using nested loops.

We apply the chi-square formula to each pair of values.

Finally, we sum up all the values to get the final chi-square statistic.