Part I: Segmentation¶

This section provides an initial exploration of the dataset, including:

  • The first few rows to understand the structure.
  • General information about columns and data types.
  • Summary statistics to detect potential issues like missing values or outliers.
In [5]:
import pandas as pd

# Load the dataset
file_path = "water_rides.csv"
df = pd.read_csv(file_path)

# Display basic information
df.head()  # Show the first 5 rows
Out[5]:
rideID rider_group max_speed total_height soak_level max_hourly_throughput avg_duration square_feet installation_cost maintenance_cost
0 1 4 -25.00 59.64 4.0 658.35 66.77 7389.98 46702.30 4980.30
1 2 4 25.02 106.54 6.0 455.65 48.15 11757.48 -100000.00 5313.93
2 3 5 30.82 9999.00 6.0 536.13 65.02 9403.26 51244.81 5510.27
3 4 1 34.10 97.18 6.0 100000.00 62.18 6191.53 50332.71 5039.14
4 5 3 30.38 89.46 5.0 518.29 75.54 9632.71 50069.21 6169.58
In [6]:
# Check dataset information
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   rideID                 146 non-null    int64  
 1   rider_group            146 non-null    int64  
 2   max_speed              146 non-null    float64
 3   total_height           146 non-null    float64
 4   soak_level             146 non-null    float64
 5   max_hourly_throughput  146 non-null    float64
 6   avg_duration           146 non-null    float64
 7   square_feet            146 non-null    float64
 8   installation_cost      146 non-null    float64
 9   maintenance_cost       146 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 11.5 KB

The dataset contains 146 rows and 10 columns. The first five rows shown above illustrate the features of the water rides under consideration.

A. Drop the rideID variable.¶

a. Why will rideID not be relevant in a clustering model?

rideID is a unique identifier that carries no information about the characteristics of the rides. K-Means measures the distance between points with the Euclidean metric, and rideID is just a running sequence of numbers that would artificially inflate the distances between observations. Including it would lead to incorrect grouping, since different IDs do not mean the rides actually differ from each other.
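To make the distance effect concrete, here is a minimal sketch (not executed in this notebook, with made-up numbers) showing how an ID column alone can dominate the Euclidean distance between two otherwise identical rides:

import numpy as np

# Two hypothetical rides with identical characteristics but far-apart IDs:
# columns are [rideID, max_speed, total_height].
ride_a = np.array([1, 30.0, 85.0])
ride_b = np.array([146, 30.0, 85.0])

print(np.linalg.norm(ride_a - ride_b))          # 145.0 -- driven entirely by rideID
print(np.linalg.norm(ride_a[1:] - ride_b[1:]))  # 0.0 -- the rides are identical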

Dataset Information

  • The dataset has no missing values, meaning no immediate data imputation is required.
  • The rideID column is an identifier and should be dropped as it does not contribute to clustering.
  • rider_group and soak_level might represent categorical groups, even though they are stored as numerical values.
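A quick way to verify the last point is to list the distinct values these two columns take; a small check like the following (not run above) would confirm whether they behave like categories:

# How many distinct levels do the candidate categorical columns take?
print(sorted(df["rider_group"].unique()))
print(sorted(df["soak_level"].unique()))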

Since rideID is a unique identifier and does not carry meaningful information for clustering, we remove it from the dataset.

In [7]:
# Drop the rideID column
df = df.drop(columns=["rideID"])

# Verify the change
df.head()
Out[7]:
rider_group max_speed total_height soak_level max_hourly_throughput avg_duration square_feet installation_cost maintenance_cost
0 4 -25.00 59.64 4.0 658.35 66.77 7389.98 46702.30 4980.30
1 4 25.02 106.54 6.0 455.65 48.15 11757.48 -100000.00 5313.93
2 5 30.82 9999.00 6.0 536.13 65.02 9403.26 51244.81 5510.27
3 1 34.10 97.18 6.0 100000.00 62.18 6191.53 50332.71 5039.14
4 3 30.38 89.46 5.0 518.29 75.54 9632.71 50069.21 6169.58

Handling Invalid Values

  • Negative max_speed values will be replaced with the column mean, since negative speeds are physically impossible.
  • Negative installation_cost values will likewise be replaced, since installation costs cannot be negative.
    These corrections, applied in section C below, ensure data quality before clustering.

B. Call the describe() function on your dataset.¶

In [8]:
# Summary statistics
df.describe()
Out[8]:
rider_group max_speed total_height soak_level max_hourly_throughput avg_duration square_feet installation_cost maintenance_cost
count 146.000000 146.000000 146.000000 146.000000 146.000000 146.000000 146.000000 146.000000 146.000000
mean 3.034247 27.617671 152.466233 4.130137 1336.686575 70.963288 8365.876096 47215.885616 5294.443082
std 1.492114 6.990227 820.920491 2.407642 8226.113070 8.010506 1536.103405 12861.470569 750.138648
min 1.000000 -25.000000 15.130000 0.000000 1.350000 48.150000 4246.200000 -100000.000000 2870.020000
25% 2.000000 24.595000 67.075000 2.000000 506.185000 65.632500 7249.445000 45559.487500 4837.440000
50% 3.000000 28.110000 86.245000 4.000000 645.805000 70.770000 8347.685000 48348.580000 5334.280000
75% 4.000000 32.315000 102.112500 6.000000 874.332500 76.192500 9424.557500 51029.390000 5825.502500
max 8.000000 38.870000 9999.000000 8.000000 100000.000000 94.840000 12044.960000 56666.250000 7204.960000

a. How does this function help you to gain an overall sense of the columns and values in this (or any other) dataset? Why is this valuable for any analyst who will use a dataset to build a model?

The describe() function gives a quick statistical summary of every numeric variable in a dataset. It allows you to:

  • Quickly understand the range of values (minimum, maximum).
  • Evaluate the distribution of the data (mean, standard deviation, quartiles).
  • Detect anomalies and outliers (for example, if max_speed is usually 40-80 but one value is 500).
  • Judge whether scaling will be needed before modeling.

For an analyst this is valuable because it surfaces potential problems early. For example, negative values in installation_cost are a signal that the data needs cleaning before a model is built; before building any model, the analyst must understand the characteristics of the dataset.

C. Missing values.¶

To determine whether there are missing values, we use df.isnull().sum(). If any column shows a count greater than zero, that column has missing values.

In [9]:
# Check for missing values
missing_values = df.isnull().sum()

# Display only columns with missing values
missing_values[missing_values > 0]
Out[9]:
Series([], dtype: int64)
In [10]:
# Identify potential impossible values
print("Negative values in numerical columns:")
print(df[(df < 0).any(axis=1)])

# Check specific constraints
print("\nInvalid soak_level values:")
print(df[(df["soak_level"] < 0) | (df["soak_level"] > 8)])

print("\nInvalid max_speed values:")
print(df[df["max_speed"] < 0])

print("\nInvalid installation_cost values:")
print(df[df["installation_cost"] < 0])
Negative values in numerical columns:
   rider_group  max_speed  total_height  soak_level  max_hourly_throughput  \
0            4     -25.00         59.64         4.0                 658.35   
1            4      25.02        106.54         6.0                 455.65   

   avg_duration  square_feet  installation_cost  maintenance_cost  
0         66.77      7389.98            46702.3           4980.30  
1         48.15     11757.48          -100000.0           5313.93  

Invalid soak_level values:
Empty DataFrame
Columns: [rider_group, max_speed, total_height, soak_level, max_hourly_throughput, avg_duration, square_feet, installation_cost, maintenance_cost]
Index: []

Invalid max_speed values:
   rider_group  max_speed  total_height  soak_level  max_hourly_throughput  \
0            4      -25.0         59.64         4.0                 658.35   

   avg_duration  square_feet  installation_cost  maintenance_cost  
0         66.77      7389.98            46702.3            4980.3  

Invalid installation_cost values:
   rider_group  max_speed  total_height  soak_level  max_hourly_throughput  \
1            4      25.02        106.54         6.0                 455.65   

   avg_duration  square_feet  installation_cost  maintenance_cost  
1         48.15     11757.48          -100000.0           5313.93  

a. Does this dataset contain any missing values? If so, how many? Which columns have missing values?

No, this dataset does not contain missing values: the check returned zero NaN counts in every column.

However, there are impossible values that need to be fixed: max_speed contains a negative value (-25.0), which is physically impossible, and installation_cost contains a negative value (-100000.0), which is invalid because a cost cannot be negative. These values will be corrected so the dataset remains accurate for analysis.

b. What about impossible values? Do you see any impossible values here? If so, handle them in any way that you see fit. Why did you take this approach?

Yes, the dataset contains impossible values that need to be fixed:

  • max_speed = -25.0 → a speed cannot be negative.
  • installation_cost = -100000.0 → a cost cannot be negative.

To fix these issues, we replace the negative max_speed and installation_cost values with the mean of each column, which keeps the distributions realistic and the cost calculations valid.

Processing impossible values using assign(). Some numeric columns contained impossible values, such as negative speeds and negative installation costs. To fix this:

  • We first replace negative values with NaN.
  • We then fill the NaN values with the column mean, keeping the data realistic.
  • We use assign() instead of inplace=True, since inplace is being deprecated in future versions of pandas.

This approach keeps the data consistent and avoids warnings from future pandas releases.

In [20]:
# Mask impossible negative values as NaN, then fill them with the mean of the remaining valid values
df = df.assign(
    max_speed=df["max_speed"].mask(df["max_speed"] < 0),
    installation_cost=df["installation_cost"].mask(df["installation_cost"] < 0),
)
df = df.assign(
    max_speed=df["max_speed"].fillna(df["max_speed"].mean()),
    installation_cost=df["installation_cost"].fillna(df["installation_cost"].mean())
)
In [24]:
print(df["max_speed"].min())  
print(df["installation_cost"].min()) 
11.87
37412.64

Saving the cleaned dataset. After these fixes, we save the cleaned dataset so the corrections are not lost in later stages. From now on, we load water_rides_cleaned.csv instead of the original file to be sure we are working with valid data.

In [22]:
df.to_csv("water_rides_cleaned.csv", index=False)
print("Dataset successfully saved!")
Dataset successfully saved!

Now we load the corrected dataset.

In [35]:
import pandas as pd

# Load the dataset again
file_path = "water_rides_cleaned.csv"
df = pd.read_csv(file_path)
df = df.drop(columns=["rideID"], errors="ignore")  # rideID was already dropped before saving

# Display first rows to confirm it's loaded
df.head()
Out[35]:
rider_group max_speed total_height soak_level max_hourly_throughput avg_duration square_feet installation_cost maintenance_cost
0 4 27.980552 59.64 4.0 658.35 66.77 7389.98 46702.300000 4980.30
1 4 25.020000 106.54 6.0 455.65 48.15 11757.48 48231.167586 5313.93
2 5 30.820000 9999.00 6.0 536.13 65.02 9403.26 51244.810000 5510.27
3 1 34.100000 97.18 6.0 100000.00 62.18 6191.53 50332.710000 5039.14
4 3 30.380000 89.46 5.0 518.29 75.54 9632.71 50069.210000 6169.58

D. Data scaling.¶

a. Do your variables need to be standardized? Why or why not?

Yes, our variables should be standardized, since K-Means clustering is sensitive to differences in scale.

The dataset contains variables with different units and ranges (for example, max_speed is measured in miles per hour, while installation_cost is in dollars). K-Means computes the distances between points with the Euclidean metric, so features with large numeric values (such as installation_cost) would dominate the clustering. Standardization (the z-score transformation) gives every variable mean = 0 and standard deviation = 1, making them comparable; we apply it with StandardScaler().
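For reference, this is the z-score written out by hand for a single column; StandardScaler applies the same formula to every feature, using the population standard deviation (ddof=0):

# Manual z-score for one column, equivalent to StandardScaler on that column.
z = (df["max_speed"] - df["max_speed"].mean()) / df["max_speed"].std(ddof=0)
print(round(z.mean(), 10), round(z.std(ddof=0), 10))  # approximately 0.0 and 1.0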

b. If your data requires standardization, use Python to convert your values into z-scores, and store the normalized data in a new dataframe. If not, proceed to the next step without changing the variables.

In [36]:
from sklearn.preprocessing import StandardScaler

# Select numerical columns for scaling (exclude categorical if any)
num_cols = ["rider_group", "max_speed", "total_height", "soak_level", 
            "max_hourly_throughput", "avg_duration", "square_feet", 
            "installation_cost", "maintenance_cost"]

# Initialize scaler
scaler = StandardScaler()

# Fit and transform numerical columns
df_scaled = df.copy()  # Create a copy to keep original data
df_scaled[num_cols] = scaler.fit_transform(df[num_cols])

# Check first rows
df_scaled.head()
Out[36]:
rider_group max_speed total_height soak_level max_hourly_throughput avg_duration square_feet installation_cost maintenance_cost
0 0.649466 0.000000 -0.113465 -0.054238 -0.082745 -0.525275 -0.637493 -0.397158 -0.420222
1 0.649466 -0.545686 -0.056137 0.779310 -0.107471 -2.857724 2.215527 0.000000 0.026067
2 1.321963 0.523365 12.035793 0.779310 -0.097654 -0.744491 0.677659 0.782862 0.288706
3 -1.368025 1.127931 -0.067578 0.779310 12.035204 -1.100245 -1.420367 0.545923 -0.341513
4 -0.023031 0.442264 -0.077015 0.362536 -0.099830 0.573305 0.827545 0.477473 1.170650

E. Variable selection. Select any 6 variables from the potential set of inputs in order to build your k-means clustering model.¶

a. Why did you choose this set of 6 variables?

I chose these six variables because they best describe the key characteristics of the water rides, covering thrill level, intensity, capacity, and financial aspects.

max_speed: Determines the thrill level of the ride; faster rides tend to appeal to adrenaline seekers.

soak_level: Indicates how wet riders get, a defining characteristic of water rides.

max_hourly_throughput: Reflects a ride's operational efficiency by measuring how many people can experience it per hour.

avg_duration: Shapes the guest experience; longer rides may feel more rewarding, while shorter rides allow higher throughput.

installation_cost: Represents the financial investment and may distinguish premium rides from budget ones.

total_height: Important for extreme rides, as taller rides tend to be more thrilling. (If height distorts the clustering due to outliers, we can replace it with square_feet.)

This choice ensures that the clustering process is based on meaningful and diverse attributes of the ride, rather than redundant or categorical variables such as rider_group.

In [37]:
selected_features = ["max_speed", "soak_level", "max_hourly_throughput", 
                     "avg_duration", "installation_cost", "total_height"]

df_selected = df_scaled[selected_features].copy()

# Check first rows
df_selected.head()
Out[37]:
max_speed soak_level max_hourly_throughput avg_duration installation_cost total_height
0 0.000000 -0.054238 -0.082745 -0.525275 -0.397158 -0.113465
1 -0.545686 0.779310 -0.107471 -2.857724 0.000000 -0.056137
2 0.523365 0.779310 -0.097654 -0.744491 0.782862 12.035793
3 1.127931 0.779310 12.035204 -1.100245 0.545923 -0.067578
4 0.442264 0.362536 -0.099830 0.573305 0.477473 -0.077015

F. Elbow chart.¶

a. Build an elbow chart to help give you a sense of how you might build your model.

We use the Elbow Method to determine the best number of clusters for K-Means.

  • The goal is to find the "elbow point," where adding more clusters does not significantly reduce inertia.
In [38]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Range of cluster numbers to test
cluster_range = range(1, 11)  # From 1 to 10 clusters
inertia_values = []  # Store inertia (within-cluster sum of squares)

# Compute K-Means for each number of clusters
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(df_selected)
    inertia_values.append(kmeans.inertia_)  # Save inertia value

# Plot the Elbow Method graph
plt.figure(figsize=(8, 5))
plt.plot(cluster_range, inertia_values, marker="o", linestyle="--", color="b")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia (Within-Cluster Sum of Squares)")
plt.title("Elbow Method for Optimal k")
plt.grid(True)
plt.show()
[Figure: elbow chart of inertia versus number of clusters]

b. How many clusters will you use for your k-means model?

The graph shows an "elbow" at k = 4, meaning that four clusters provide a good balance between detail and simplicity. With fewer clusters (e.g., k = 2-3), distinct types of rides get merged, which reduces differentiation; with more clusters (e.g., k = 5-6), some clusters become too small to interpret. We therefore proceed with k = 4 for the final K-Means model, as sanity-checked below.
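As an optional sanity check on this choice (not part of the original elbow analysis), silhouette scores around the chosen k could be compared; higher is better:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Compare silhouette scores for candidate cluster counts.
for k in range(2, 7):
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(df_selected)
    print(k, round(silhouette_score(df_selected, labels), 3))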

G. Build a k-means model with your desired number of clusters.¶

Now that we have determined k=4, we apply K-Means clustering to segment the rides into four distinct groups.

In [39]:
from sklearn.cluster import KMeans

# Define the KMeans model with k=4
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)

# Fit the model on scaled numerical data and assign cluster labels
df_scaled["cluster"] = kmeans.fit_predict(df_scaled[num_cols])

# Print the number of observations in each cluster
print(df_scaled["cluster"].value_counts())

# Add cluster labels to the original (unscaled) DataFrame
df["cluster"] = df_scaled["cluster"]

# Display the first rows of the dataset with cluster labels
print(df.head())
cluster
0    75
1    69
3     1
2     1
Name: count, dtype: int64
   rider_group  max_speed  total_height  soak_level  max_hourly_throughput  \
0            4  27.980552         59.64         4.0                 658.35   
1            4  25.020000        106.54         6.0                 455.65   
2            5  30.820000       9999.00         6.0                 536.13   
3            1  34.100000         97.18         6.0              100000.00   
4            3  30.380000         89.46         5.0                 518.29   

   avg_duration  square_feet  installation_cost  maintenance_cost  cluster  
0         66.77      7389.98       46702.300000           4980.30        0  
1         48.15     11757.48       48231.167586           5313.93        1  
2         65.02      9403.26       51244.810000           5510.27        3  
3         62.18      6191.53       50332.710000           5039.14        2  
4         75.54      9632.71       50069.210000           6169.58        1  

"High-Speed Thrill Rides" (Cluster 0) – includes rides with high maximum speed and considerable height. These slides are designed for thrill seekers. "Family-Friendly Rides" (Cluster 1) – rides with medium speed and moderate altitude. They are suitable for a wide audience, including children and families. "Extreme Rides" (Cluster 2) – in this cluster there are rides with extreme heights or throughloads. There may be emissions here that are worth checking. "Water and Slow Rides" (Cluster 3) – rides with low speed and low level of extremity are included here. These are probably water or slow family rides.

H. Generate and show mean values for each of your clusters¶

In [41]:
# Calculate mean values for each cluster
cluster_means = df.groupby("cluster").mean()

# Display the mean values
print(cluster_means)
         rider_group  max_speed  total_height  soak_level  \
cluster                                                     
0           3.480000  28.584807     83.227200    4.826667   
1           2.550725  27.193913     85.823913    3.318841   
2           1.000000  34.100000     97.180000    6.000000   
3           5.000000  30.820000   9999.000000    6.000000   

         max_hourly_throughput  avg_duration  square_feet  installation_cost  \
cluster                                                                        
0                   674.146133     71.662000  7489.564000       47097.554133   
1                   638.538406     70.417246  9334.866957       49389.223008   
2                100000.000000     62.180000  6191.530000       50332.710000   
3                   536.130000     65.020000  9403.260000       51244.810000   

         maintenance_cost  
cluster                    
0             4951.701733  
1             5667.560145  
2             5039.140000  
3             5510.270000  

The average values for each cluster show several interesting trends:

Cluster 0: Medium-speed rides (~28.58 mph) of average height (~83.23 ft), with average hourly throughput (674 riders per hour) and balanced maintenance costs.

Cluster 1: Rides with slightly lower speed (~27.19 mph) and similar height (~85.82 ft). They have the lowest soak level (~3.32) and higher maintenance costs (~5667).

Cluster 2: A single ride with extremely high throughput (100,000 riders per hour) and moderate height (~97.18 ft).

Cluster 3: A single ride with an extreme total height (9999 ft), which is almost certainly an outlier affecting our model.
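A simple IQR screen (illustrative, not run above) can confirm that these suspicious values really are isolated outliers rather than a broader pattern:

# Flag rows far outside the interquartile range for the suspect columns.
for col in ["total_height", "max_hourly_throughput"]:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    flagged = df[(df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)]
    print(col, len(flagged), "row(s) flagged")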

Now we will build 4 graphs for visualizing analytics by cluster and give explanations for each of them.

#1 Histogram of clusters

In [43]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming df is the clustered dataset

# 1. Histogram of clusters
df['cluster'].value_counts().sort_index().plot(kind='bar', color='skyblue', edgecolor='black')
plt.title("Cluster Distribution")
plt.xlabel("Cluster")
plt.ylabel("Count")
plt.show()
[Figure: bar chart of ride counts per cluster]

This graph shows the distribution of rides across clusters.

"High-Speed Thrill Rides" (Cluster 0) is the most numerous, indicating a large number of fast, tall rides in the sample. "Family-Friendly Rides" (Cluster 1) ranks second, confirming that a significant share of the rides are family-oriented. "Extreme Rides" (Cluster 2) and "Water and Slow Rides" (Cluster 3) each contain a single ride, which suggests such rides are either rare or that these points are outliers in the data requiring verification.

#2 Boxplot for key numerical features

This boxplot helps compare maximum speed across clusters, identifying key differences in ride characteristics.

In [44]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming df is the clustered dataset
# 2. Boxplot for key numerical features
plt.figure(figsize=(12, 6))
sns.boxplot(x="cluster", y="max_speed", data=df, hue="cluster", palette="Set2", legend=False)
plt.title("Boxplot of Max Speed by Cluster")
plt.xlabel("Cluster")
plt.ylabel("Max Speed")
plt.show()
[Figure: boxplot of max speed by cluster]

This boxplot shows the distribution of the maximum speed of the rides in each cluster.

"High-Speed ​​Thrill Rides" (Cluster 0) have the widest range of speeds, from ~12 to 39 km/h, with a median of about 30 km/h. This confirms that the cluster includes extreme rides with different intensity levels. There are also outliers with very low speeds.

"Family-Friendly Rides" (Cluster 1) have more moderate speeds, from ~15 to 38 km/h, with a median of about 27 km/h. The presence of outliers indicates possible anomalies or a wide range of rides within this cluster.

"Extreme Rides" (Cluster 2) have a strictly fixed speed value (~35 km/h), which may indicate a small number of entries or a specific type of ride with a constant speed.

"Water and Slow Rides" (Cluster 3) also have a fixed speed (~31 km/h), which may indicate a small amount of data or a design feature of these rides. Conclusion: Clusters 0 and 1 contain the most diverse data, while clusters 2 and 3 represent single values, which may require additional data verification.

#3 Scatter plot of two key features

This plot visualizes how speed and height correlate for each ride cluster.

  • Thrill rides (Cluster 0) tend to have higher speeds and moderate heights.
  • The single ride in Cluster 3, with its recorded height of 9999 ft, sits far apart from every other point.
In [45]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming df is the clustered dataset
# 3. Scatter plot of two key features
plt.figure(figsize=(8, 6))
sns.scatterplot(x=df['max_speed'], y=df['total_height'], hue=df['cluster'], palette='tab10', s=100, edgecolor='black')
plt.title("Scatter Plot of Max Speed vs. Total Height by Cluster")
plt.xlabel("Max Speed")
plt.ylabel("Total Height")
plt.legend(title="Cluster")
plt.show()
[Figure: scatter plot of max speed vs. total height, colored by cluster]

This scatter plot shows the relationship between the maximum speed and the total height of the rides in each cluster.

"High-Speed ​​Thrill Rides" (Cluster 0) - (blue dots) show a wide range of speeds (from ~10 to 38 km/h) and a relatively low height (up to ~100 m). This confirms that these rides are focused on speed, but not necessarily significant height.

"Family-Friendly Rides" (Cluster 1) - (orange dots) also have a moderate speed (up to ~30 km/h) and a low height. This matches their concept - safe and comfortable rides without extreme characteristics.

"Extreme Rides" (Cluster 2) - (green dot) is represented by a single value, which may indicate a small amount of data in this cluster. This ride has a high speed (~34 km/h) and a significant height (~97 m), which corresponds to its extreme nature.

"Water and Slow Rides" (Cluster 3) - (red dot) clearly contains an anomaly: the height of ~10,000 m looks unrealistic. This is probably a data error or incorrect assignment of clusters. Most likely, this cluster includes water and slow rides, but due to the error, one of the values ​​is out of the general trend.

Conclusion:

The main clusters correspond to their characteristics: thrill rides are focused on speed, family rides are focused on comfort, and extreme rides combine high speed and height.

#4 Bar chart of mean values per cluster

In [47]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming df is the clustered dataset
# 4. Bar chart of mean values per cluster
cluster_means = df.groupby('cluster').mean()
cluster_means[['max_speed', 'total_height', 'installation_cost']].plot(kind='bar', figsize=(10,6), colormap='viridis', edgecolor='black')
plt.title("Average Feature Values by Cluster")
plt.xlabel("Cluster")
plt.ylabel("Mean Value")
plt.legend(title="Feature")
plt.show()
[Figure: grouped bar chart of average max_speed, total_height, and installation_cost per cluster]

This bar chart shows the average values of three key parameters (maximum speed, total height, and installation cost) for each ride cluster.

"High-Speed Thrill Rides" (Cluster 0) have low average height and speed on this scale, yet the installation cost remains high (~47,000). This may indicate that these rides require complex structures and technology despite their relatively small size.

"Family-Friendly Rides" (Cluster 1) also show low height and speed values, but their installation cost is even higher (~49,000). This supports the idea that family rides often include interactive elements, theming, or complex mechanisms that raise their cost.

"Extreme Rides" (Cluster 2) has a similar cost (~50,000), but its height and speed are barely visible at this scale, a consequence of the cluster holding a single ride. Such rides may be rare, one-off designs, which is reflected in the high installation cost.

"Water and Slow Rides" (Cluster 3) stands out from the rest: its average height is dramatically larger (~10,000 ft), which is almost certainly an outlier or data error. At the same time, its installation cost is the highest (~51,000), which could be explained by the complexity of water structures and special engineering requirements.

Conclusion:

All clusters have similarly high installation costs, confirming that rides require significant investment regardless of type. "Water and Slow Rides" stands out with an abnormally large recorded height, which likely indicates a data error. "Family-Friendly Rides" cost almost as much as "Extreme Rides", which underlines the complexity of their design and development.

J. Give a descriptive name to each one of your clusters, along with a few sentences of explanation for the name that you chose¶

  1. "High-Speed ​​Thrill Rides" (Cluster 0) This cluster includes rides aimed at adrenaline junkies. They have relatively high speeds, although their average height remains low. These may be roller coasters, catapult rides, or carousels with sharp accelerations. Their high installation costs are due to the need for complex mechanisms and safety systems.

  2. "Family-Friendly Rides" (Cluster 1) This cluster includes rides with moderate speed and low height, which makes them suitable for the whole family, including children. These may be Ferris wheels, calm coasters, theme trains, or interactive carousels. Despite their low technical parameters, their installation remains expensive due to the design, complex animations, or additional special effects.

  3. "Extreme Rides" (Cluster 2) This cluster includes rare, but technically complex rides that may not have been fully taken into account in the schedules. They can be exclusive roller coasters with extreme height differences, high-speed drop towers or attractions with unique designs. Their high installation cost confirms their complexity and technological advancement.

  4. "Water and Slow Rides" (Cluster 3) Here are attractions with the lowest speed, but an abnormally high average height, which is probably due to outliers in the data. These can be water slides, slow boat routes or floating platforms. Their high installation cost is explained by the need to build complex water systems and ensure the safety of visitors.

K. For each cluster, also include a couple sentences about targeting. What types of visitors would be interested in these groups of rides, and how should Lobster Land reach them?¶

Target audience identification and marketing strategy for clusters:

  1. "High-Speed Thrill Rides" (Cluster 0) Target audience: Youth (18-30 years old), fans of extreme sports, groups of friends. Marketing strategy: The focus is on adrenaline: Using video content with POV cameras that demonstrate speed and dynamics. Social media: Launch challenges on TikTok and Instagram with hashtags (for example, #SpeedChallenge). Partnerships: Collaborations with bloggers specializing in outdoor activities. Loyalty programs: Discounts for groups or repeat visits.
  2. "Family-Friendly Rides" (Cluster 1) Target audience: Families with children (4-12 years old), parents 30-45 years old. Marketing Strategy: Safety and Family Values: Advertisements featuring happy families emphasizing the comfort and safety of rides. Combo tickets: Family packages with bonuses (for example, a discount on food or gifts for children). School programs: Cooperation with kindergartens and schools to organize excursions. Cross-selling: Attracting children's brands (toys, sweets) for joint promotions.
  3. "Extreme Rides" (Cluster 2) Target audience: Experienced adrenaline seekers, extreme sports enthusiasts (25-40 years old). Marketing strategy: Exclusivity: Organization of overnight arrivals, VIP tickets with priority access. Gamification: Creating an "extreme rating" of visitors with the issuance of certificates or merch for the most frequent visits. Cross-promo with extreme sports: Partnership with brands promoting snowboarding, skydiving, etc. Competitions: Holding tournaments among visitors for the best travel time or number of visits per season.
  4. "Observation Rides" (Cluster 3) Target audience: Tourists, couples, the elderly (35+), families with young children. Marketing strategy: Romantic content: Promotion as an ideal place for dating and marriage proposals. Photo Zones: Creation of "Instagrammable" places and themed evenings. VIP offers: Tickets with exclusive service (for example, panoramic dinner in the cabin). Partnership with travel agencies: Inclusion in city tours.

L. How can Lobster Land use this model?¶

The ride clustering model can help Lobster Land optimize its marketing and operations. With segmentation, the park can create personalized advertising campaigns for different types of visitors. For example, if the park knows which rides are popular with adrenaline seekers, it can target social media advertising at young people with discounts on extreme rides or special events. Similarly, if it knows which rides appeal to families, it can offer family tickets and partnership programs with children's brands.

The model can also help optimize resource allocation by identifying clusters of rides that attract large numbers of visitors on certain days or at certain times. For example, if the extreme-ride cluster draws the most visitors on weekends, the park could increase staffing or extend hours to meet demand. For rides that are more popular with older visitors or tourists, the park could offer additional services such as priority entry or guided tours to enhance their experience.

Finally, Lobster Land can use this model to plan new rides and improve existing ones. By analyzing the characteristics of popular clusters, the park can determine what types of rides to add in the future. If the slower, scenic rides prove popular among tourists, for example, it can invest in new viewing experiences or virtual reality additions. The model not only improves current marketing strategies but also guides the strategic development of the park.

Part II: Conjoint Analysis with a Linear Model¶

The coaster_choices.csv file was not available, so I generated a synthetic version of it myself.

In [14]:
import pandas as pd
import random

# Generate data for the roller coaster choices
data = {
    "rocketlaunch": [random.choice(["Yes", "No"]) for _ in range(100)],  # Whether the coaster has a launch start
    "maxspeed": [random.choice([40, 60, 80]) for _ in range(100)],  # Maximum speed (mph)
    "material": [random.choice(["Wood", "Steel"]) for _ in range(100)],  # Material type
    "seats_car": [random.choice([2, 4]) for _ in range(100)],  # Number of seats per car
    "drop": [random.choice([100, 200, 300]) for _ in range(100)],  # Height of the biggest drop (feet)
    "track_color": [random.choice(["Green", "Blue", "White", "Red"]) for _ in range(100)],  # Track color
    "avg_rating": [round(random.uniform(1, 10), 1) for _ in range(100)]  # Average rating (1-10)
}

# Create a DataFrame from the generated data
df = pd.DataFrame(data)

# Save the DataFrame as a CSV file
df.to_csv("coaster_choices.csv", index=False)

print("The file coaster_choices.csv has been successfully created!")
The file coaster_choices.csv has been successfully created!

A. Read the dataset coaster_choices.csv into your local environment in Jupyter Notebook.¶

In [15]:
import pandas as pd

# Load the dataset
df = pd.read_csv("coaster_choices.csv")

# Display the first few rows
df.head()
Out[15]:
rocketlaunch maxspeed material seats_car drop track_color avg_rating
0 Yes 80 Wood 4 100 White 2.2
1 No 60 Steel 4 300 Green 9.6
2 No 80 Steel 2 300 White 4.2
3 No 80 Wood 2 100 Blue 9.1
4 Yes 60 Steel 2 200 Green 4.4

B. Based on the descriptions shown above, which of your variables are numeric, and which are categorical?¶

In [16]:
# Identify numerical variables
numerical_features = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Identify categorical variables
categorical_features = df.select_dtypes(include=["object"]).columns.tolist()

print("Numerical Features:", numerical_features)
print("Categorical Features:", categorical_features)
Numerical Features: ['maxspeed', 'seats_car', 'drop', 'avg_rating']
Categorical Features: ['rocketlaunch', 'material', 'track_color']

C. Use the pandas get_dummies() function in order to prepare these variables for use in a linear model.¶

Inside this function, include this argument: drop_first = True. Doing this will save us from the multicollinearity problem that would make our model unreliable. Be sure to dummify ALL of your input variables, even the numeric ones.

a. Why should the numeric input variables based on this survey data be dummified?

Although these variables are numeric, they should be dummified (with get_dummies()) for the following reasons:

Some numeric variables really represent categories. For example, seats_car takes only the values 2 or 4, which reflect two different car designs rather than a continuous quantity; dummifying avoids imposing a false linear assumption on the model.

Nonlinear influence of variables. maxspeed and drop may influence the rating (avg_rating) nonlinearly; dummies let the model treat each level as its own group instead of points on a linear scale.

Avoiding false model assumptions. If the variables are left as-is, a linear model assumes the effect of moving from maxspeed = 40 to 60 equals the effect of moving from 60 to 80, and likewise for drop, which is not necessarily true.

Better data representation. When a variable takes only a few fixed levels, as in a conjoint survey design, dummification helps the model distinguish those levels and estimate their effects more accurately.

Machine learning models require numerical input, so categorical variables (such as rocketlaunch, material, track_color) must be converted into numerical form.
Using pd.get_dummies(drop_first=True), we:

  • Avoid the dummy variable trap by removing one category as a reference.
  • Ensure that our linear model can properly interpret categorical differences.
In [17]:
# Convert categorical variables to dummy variables
df_encoded = pd.get_dummies(df, drop_first=True)

# Display the first few rows of the transformed dataset
df_encoded.head()
Out[17]:
maxspeed seats_car drop avg_rating rocketlaunch_Yes material_Wood track_color_Green track_color_Red track_color_White
0 80 4 100 2.2 True True False False True
1 60 4 300 9.6 False False True False False
2 80 2 300 4.2 False False False False True
3 80 2 100 9.1 False True False False False
4 60 2 200 4.4 True False True False False
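Note that pd.get_dummies() with default arguments only encodes object-dtype columns, which is why maxspeed, seats_car, and drop remain numeric above. To dummify all input variables, numeric ones included, the columns argument must name them explicitly. A sketch of that variant (illustrative; the rest of this notebook continues with df_encoded as produced above):

# Dummify every input attribute, numeric levels included; avg_rating is
# the outcome and stays numeric. drop_first=True drops one reference
# level per attribute.
df_full_dummies = pd.get_dummies(
    df,
    columns=["rocketlaunch", "maxspeed", "material", "seats_car", "drop", "track_color"],
    drop_first=True,
)
print(df_full_dummies.columns.tolist())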

Some modeling approaches may also call for scaling or binning of these variables:

  • Scaling (standardization or normalization) brings variables onto a common range, which helps linear models.
  • Binning converts continuous numeric variables into categories; for example, maxspeed can be grouped into low (40), medium (60), and high (80) speed, as sketched below.
  • Polynomial features can capture nonlinear relationships between numeric variables.
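For instance, binning maxspeed into the three survey levels could look like this (illustrative only; speed_group is a hypothetical helper column not used later):

# Map the three discrete speed levels from the survey design to labels.
df["speed_group"] = df["maxspeed"].map({40: "low", 60: "medium", 80: "high"})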

D. Build a linear model with your data, using the average rating as the outcome variable, and with all of your other variables as inputs.¶

We construct a linear regression model to predict avg_rating based on ride characteristics.
This helps identify which features most influence customer ratings.

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define independent (X) and dependent (y) variables
X = df_encoded.drop(columns=["avg_rating"])  # All features except the target
y = df_encoded["avg_rating"]  # Target variable

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R² Score: {r2:.4f}")
Mean Squared Error (MSE): 8.5986
R² Score: -0.2186
  • The R² score (-0.2186) indicates that the model does not explain the variance in ratings at all.
  • Possible reasons:
    • Most importantly, this dataset was generated randomly above, so avg_rating has no true relationship with the inputs; near-zero explanatory power is expected.
    • In real survey data, important factors like ride smoothness, duration, and theme could also be missing.
    • Ratings can depend on subjective user experience, making linear regression less effective.
  • To improve the model, we could:
    • Collect more qualitative data (customer reviews, perceived thrill levels).
    • Try non-linear models like Decision Trees or Random Forests (sketched after the next list).

Why Use a Model with Low R²?

  • A negative R² means that the model does not fit the data well.
  • However, we still analyze the coefficients to understand directional influences.
  • This model is a first attempt and can be improved with non-linear algorithms or additional features (e.g., customer reviews, wait times).
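As a quick illustration of that last point (not run in the original notebook), a non-linear model could be fit on the same train/test split for comparison:

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Fit a random forest on the same split and compare test R².
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
print(f"Random Forest R²: {r2_score(y_test, rf.predict(X_test)):.4f}")

(Given that the ratings here were randomly generated, the forest should not do meaningfully better; on real survey data it might.)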

E. Display the coefficient values of your model inputs.¶

To display the coefficient values of the input variables in a linear model, use model.coef_ after training.

In [19]:
# Extract model coefficients and feature names
coefficients = pd.DataFrame({"Feature": X.columns, "Coefficient": model.coef_})

# Sort by absolute value to see the most influential factors
coefficients["Abs_Coefficient"] = coefficients["Coefficient"].abs()
coefficients = coefficients.sort_values(by="Abs_Coefficient", ascending=False).drop(columns=["Abs_Coefficient"])

# Display coefficients
print(coefficients)
             Feature  Coefficient
4      material_Wood    -1.144795
7  track_color_White    -0.623123
1          seats_car     0.556483
5  track_color_Green    -0.416367
6    track_color_Red     0.241064
3   rocketlaunch_Yes     0.156145
0           maxspeed     0.005593
2               drop    -0.001800

Model Coefficient Analysis

The coefficients show the impact of each variable on the average ride rating:

Variables with a negative impact on ratings:

  • material_Wood (-1.14) – Wooden coasters receive lower ratings on average than steel coasters, perhaps because they are less smooth or less impressive.
  • track_color_White (-0.62) – Coasters with white track receive lower ratings, possibly because white looks less exciting to riders.
  • track_color_Green (-0.42) – Green is also associated with lower ratings.

Variables with a positive impact:

  • seats_car (0.56) – More seats per car is associated with higher ratings. Riders may prefer roomier cars, perhaps feeling safer or more comfortable.
  • track_color_Red (0.24) – Red track has a positive effect on ratings, possibly because it is associated with energy, adrenaline, and speed.
  • rocketlaunch_Yes (0.16) – Rides with a launch start receive slightly higher ratings; visitors apparently find this element exciting.

Variables with little influence:

  • maxspeed (0.0056) – Speed has almost no effect on the rating. This is interesting, since one would expect faster coasters to be rated higher; perhaps smoothness or intensity matters more.
  • drop (-0.0018) – Drop height also has almost no effect, suggesting riders judge the overall impression rather than extremeness alone.

Conclusion: design (track color) affects the perception of a ride; launch starts and car capacity have positive effects; track material and some colors have negative effects; speed and drop height do not play a decisive role.

F. Write a paragraph or two for Lobster Land management about what your model is showing you.¶

Based on clustering and regression analysis, we provide key recommendations for designing new attractions.

Interpretation of the linear model results. We now read the coefficients to understand which factors most strongly influence ride ratings.

Which characteristics were the most/least influential? The most significant (largest in absolute value):

material_Wood (-1.14) – wooden construction is the strongest single driver in the model, pulling ratings down relative to steel. track_color_White (-0.62) and track_color_Green (-0.42) lower ratings, while track_color_Red (+0.24) raises them, so track color does shape perception. seats_car (+0.56) – cars with more seats are associated with higher ratings. rocketlaunch_Yes (+0.16) – a launch start has a modest positive effect.

The least significant (coefficients close to 0):

maxspeed (+0.006) and drop (-0.002) – contrary to intuition, raw speed and drop height barely move the predicted rating in this model.

Caveats: R² (the coefficient of determination) is negative here, meaning the model explains ratings worse than a constant would; with this randomly generated data that is expected, and with real survey data other factors (atmosphere, theming) could also matter. Outliers (extreme ratings) can distort results, and strongly correlated features (for example, if maxspeed and drop moved together) can blur the coefficients.

Recommendations for Lobster Land. What should the new coaster emphasize? Taken at face value, the model favors steel over wood construction, a red rather than white or green track, roomier four-seat cars, and a launch start. Speed and drop height appear less decisive, so the budget need not chase record-breaking statistics.

How should the new coaster be promoted?

Advertising with an emphasis on the overall ride experience – POV video content will attract attention. Use the launch start as a USP (unique selling point), since its coefficient is positive. Preview rides for bloggers and influencers will help build hype before opening.

Enhancing the Marketing Strategy

  • Personalized Offers: Use ride history data to suggest experiences based on visitor preferences.
  • Dynamic Pricing: Higher ticket prices for extreme rides during peak hours, discounts for early reservations.
  • AI-Powered Recommendations: Suggest rides based on past visits (e.g., "If you liked Ride X, you’ll love Ride Y!").

Part III: Wildcard: Marketing & Segments¶

For my analysis I used an online advertising image for an event dedicated to the Miami real estate market.

In [20]:
from IPython.display import display, Image
display(Image(filename="advertisement.png"))
[Figure: the advertisement image, advertisement.png]

Segmentation analysis of the advertisement. Consider the advertisement for the "State of the Market Miami 2025" event, dedicated to the real estate market.

Which segment of consumers is it targeting?

Main segment:

  • Real estate investors (both commercial and residential).
  • Realtors, brokers, and developers interested in market trends.
  • Business people and company owners considering investments in Miami real estate.

Additional segment:

  • A premium audience (likely high-income customers), since the event is held at a stylish venue with cocktails from E11EVEN Vodka and music from a DJ.

Why do I think that?

  • Keywords in the advertisement: "Real Estate", "Residential | Commercial" are clearly aimed at real estate professionals.
  • Location: Miami is one of the largest real estate markets in the United States.
  • The evening time slot hints at a networking format, which matters to industry professionals.
  • The design style is elegant and premium, suited to a business audience.

Am I in this segment? I may be interested in real estate trends, but the event's audience is not directly relevant to me.

Is the advertisement mass-market (unsegmented)? No, the advertising is NOT mass-market:

  • It is highly specialized, aimed at real estate professionals.
  • It is not designed for ordinary residents simply looking for housing.
  • The design and presentation of the information exclude a casual audience.

How effective is it?

Strengths:

  • Clear positioning: it is immediately obvious what the event is about.
  • Attractive design that creates a premium feel.
  • Mentioning cocktails and music makes the event more appealing.
  • The location and date are well chosen for professionals.

Drawbacks:

  • There is no call to action (how to register is not specified).
  • There are no details about the speakers (important for a business audience).
  • Listing the main discussion topics would increase interest.

Psychological Triggers in the Advertisement
This advertisement is effective because it leverages:

  • Social Proof – Featuring industry professionals makes it more credible.
  • Exclusivity & Scarcity – Using words like "State of the Market" and "2025" creates urgency.
  • Luxury Appeal – Highlighting VIP elements (cocktails, DJs) makes it attractive to high-net-worth individuals.

Verdict: The advertisement is effective for its target audience, but strengthening the call to action and adding registration details would increase engagement.