Leads Classification Project¶
Author: Robert Long
This is a quick project I completed as part of the MIT Professional Certificate in Data Science and Machine Learning. The task involved classifying marketing leads using a dataset of user demographics and engagement behaviors.
The goal was to explore the data, perform minimal preprocessing, and train a simple classification model to predict lead conversion outcomes. This project was completed in approximately one hour and demonstrates a fast, effective baseline analysis using Python and pandas.
As part of the project requirements, we focused on tree-based models—specifically Decision Trees and Random Forests—for their interpretability and strong baseline performance on structured data.
While the modeling is basic, the project illustrates how even a rapid analysis can yield actionable insights—and set the foundation for more robust machine learning workflows.
TL;DR¶
This is a quick classification project completed as part of the MIT Professional Certificate in Data Science and Machine Learning. Using decision trees and random forests, I explored the predictive patterns behind lead conversion for a marketing dataset.
Key takeaways:
- Time spent on website and first interaction via mobile app were the strongest predictors of conversion.
- A mild class imbalance (30% conversion) informs both modeling choices and metric selection.
- The random forest performed slightly better, but the decision tree offers faster, explainable results.
- Business recommendations include prioritizing mobile UX, encouraging profile completion, and using behavioral signals for smarter follow-up.
This project took ~1 hour and demonstrates fast, structured analysis, model selection with trade-off reasoning, and actionable business insight generation.
Load the Dataset¶
We begin by loading the dataset into a pandas DataFrame. Previewing the first few observations helps us get an initial sense of the structure and content.
The info() method reports each column's non-null count, which tells us how many values pandas treats as missing. Note, however, that this check only captures true missing values (e.g., NaN); it does not detect placeholder strings like 'missing', 'NA', or empty strings that may represent missing data in a non-standard way. As such, it can be misleading if such values exist. In this case, every value is recognized as valid by pandas.
import pandas as pd
# Load the dataset
file_path = 'ExtraaLearn.csv'
df = pd.read_csv(file_path)
# Display basic info
df.info(), df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4612 entries, 0 to 4611
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   ID                     4612 non-null   object
 1   age                    4612 non-null   int64
 2   current_occupation     4612 non-null   object
 3   first_interaction      4612 non-null   object
 4   profile_completed      4612 non-null   object
 5   website_visits         4612 non-null   int64
 6   time_spent_on_website  4612 non-null   int64
 7   page_views_per_visit   4612 non-null   float64
 8   last_activity          4612 non-null   object
 9   print_media_type1      4612 non-null   object
 10  print_media_type2      4612 non-null   object
 11  digital_media          4612 non-null   object
 12  educational_channels   4612 non-null   object
 13  referral               4612 non-null   object
 14  status                 4612 non-null   int64
dtypes: float64(1), int64(4), object(10)
memory usage: 540.6+ KB
(None,
ID age current_occupation first_interaction profile_completed \
0 EXT001 57 Unemployed Website High
1 EXT002 56 Professional Mobile App Medium
2 EXT003 52 Professional Website Medium
3 EXT004 53 Unemployed Website High
4 EXT005 23 Student Website High
website_visits time_spent_on_website page_views_per_visit \
0 7 1639 1.861
1 2 83 0.320
2 3 330 0.074
3 4 464 2.057
4 4 600 16.914
last_activity print_media_type1 print_media_type2 digital_media \
0 Website Activity Yes No Yes
1 Website Activity No No No
2 Website Activity No No Yes
3 Website Activity No No No
4 Email Activity No No No
educational_channels referral status
0 No No 1
1 Yes No 0
2 No No 0
3 No No 1
4 No No 0 )
Check for Hidden Missing Values¶
Since I don’t fully trust the non-null counts reported by info(), it’s good practice to manually inspect the data for hidden missing values—especially within string columns.
After checking for common placeholders like 'missing', 'NA', and empty strings, I didn’t find any hidden missing values. It appears that all values are properly formatted and recognized by pandas.
# Select string (object) columns
string_cols = df.select_dtypes(include='object').columns
# Check unique values per string column
unique_values = {col: df[col].unique() for col in string_cols}
# Display counts of unique values (including possible hidden missing indicators)
unique_counts = {col: df[col].value_counts(dropna=False) for col in string_cols}
unique_values
{'ID': array(['EXT001', 'EXT002', 'EXT003', ..., 'EXT4610', 'EXT4611', 'EXT4612'],
dtype=object),
'current_occupation': array(['Unemployed', 'Professional', 'Student'], dtype=object),
'first_interaction': array(['Website', 'Mobile App'], dtype=object),
'profile_completed': array(['High', 'Medium', 'Low'], dtype=object),
'last_activity': array(['Website Activity', 'Email Activity', 'Phone Activity'],
dtype=object),
'print_media_type1': array(['Yes', 'No'], dtype=object),
'print_media_type2': array(['No', 'Yes'], dtype=object),
'digital_media': array(['Yes', 'No'], dtype=object),
'educational_channels': array(['No', 'Yes'], dtype=object),
'referral': array(['No', 'Yes'], dtype=object)}
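For a more direct check than eyeballing the unique values, the short snippet below (a minimal sketch, assuming the placeholder tokens mentioned above) counts any suspicious strings that pandas would not flag as missing.
# Placeholder tokens that pandas does not treat as missing values
placeholders = {'missing', 'na', 'n/a', 'none', 'null', ''}
# Count occurrences of those tokens in each string column (case-insensitive)
suspect_counts = {
    col: df[col].astype(str).str.strip().str.lower().isin(placeholders).sum()
    for col in string_cols
}
print({col: n for col, n in suspect_counts.items() if n > 0})  # empty dict => no hidden missing values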
Train-Test Split¶
Some practitioners perform exploratory data analysis (EDA) on the full dataset before splitting, but I’m cautious about information leakage: insights drawn from data that later ends up in the test set can quietly shape feature engineering or model selection.
To mitigate that risk, we’ll first split the data—reserving 20% for testing—and perform EDA strictly on the training set. This ensures that our insights and decisions are based only on data the model will be allowed to learn from.
from sklearn.model_selection import train_test_split
# Split the data
train_df, test_df = train_test_split(df, test_size=0.20, stratify=df['status'],
random_state=42)
Exploratory Data Analysis (EDA)¶
Target Feature Balance¶
We start by examining the balance of the target variable, status. Understanding class distribution is crucial—it influences model selection and may signal the need for techniques like class weighting or resampling (e.g., SMOTE).
In this dataset, approximately 30% of observations have status = 1, indicating a mild class imbalance. While not extreme, imbalance at this level can still affect models such as logistic regression and may bias predictions toward the majority class. As such, the imbalance is worth noting and may require correction during modeling (a lightweight class-weighting option is sketched after the output below).
train_df.shape, train_df['status'].value_counts(normalize=True)
((3689, 15),
 status
 0    0.701545
 1    0.298455
 Name: proportion, dtype: float64)
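If this imbalance ever needed correcting, the lightest-touch option (a sketch shown here before reaching for resampling methods such as SMOTE) is to let the tree-based models reweight the classes.
from sklearn.tree import DecisionTreeClassifier
# class_weight='balanced' reweights classes inversely to their frequencies,
# so the minority class (status = 1) is not under-penalized during training
weighted_tree = DecisionTreeClassifier(class_weight='balanced', random_state=42)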
Pearson Correlation of Numeric Features¶
I calculated Pearson correlations among the numeric features. Notably, the target variable status is included in this analysis. Using Pearson correlation with a binary variable isn’t strictly incorrect (it reduces to the point-biserial correlation), but the usual linear interpretation doesn’t carry over cleanly, so the values should be read as a rough guide. Since status is the only binary variable in this set, we’ll keep it in for a quick overview.
The overall correlations between features are low, suggesting limited multicollinearity. The one exception is a noticeable correlation between status and time_spent_on_website, indicating that time spent may be a strong predictor of conversion. This is a valuable insight.
I briefly considered creating a new feature, time_spent_on_website / website_visits (i.e., time per visit), but decided against it. While this could capture deeper behavioral patterns, it felt like feature engineering for its own sake rather than something clearly motivated by the data. So, for now, I’ve chosen to skip it.
import matplotlib.pyplot as plt
import seaborn as sns
# Select numeric columns
numeric_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
# Calculate correlation matrix
correlation_matrix = train_df[numeric_cols].corr()
# Set up the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", center=0)
plt.title("Correlation Heatmap of Numeric Features")
plt.tight_layout()
plt.show()
Distribution of Numeric Features¶
I examined the distributions of the numeric features. None of them appear to follow a normal distribution. This may influence model choice and preprocessing steps, particularly for models sensitive to distributional assumptions.
# Numeric columns (the categorical features have not been transformed yet)
numeric_cols_updated = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Seaborn pairplot on a 500-row sample to avoid overplotting (optional, adjustable)
sample_df = train_df[numeric_cols_updated].sample(n=500, random_state=42)
# Plot
sns.pairplot(sample_df)
plt.suptitle("Pairplot of Numeric Features (Sample of 500)", y=1.02)
plt.show()
Box Plots and Outliers¶
The box plots reveal numerous outliers in features like website_visits and page_views_per_visit, both of which show strong right skew. However, none of the outlier values appear implausible or clearly erroneous.
I tend to avoid removing outliers unless they represent obvious data entry errors or reflect a population I don't intend to model. In this case, the outliers may capture real user behavior—perhaps rare, but meaningful.
While removing them might improve model performance metrics on paper, it would artificially simplify the data and could hurt generalization to real users. I’ve chosen to retain them to preserve the integrity and generalizability of the model.
# Create box-whisker plots for numeric features
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
axes = axes.flatten()
# removed status from numeric_cols
for idx, col in enumerate([col for col in numeric_cols if col != 'status']):
sns.boxplot(data=train_df, y=col, ax=axes[idx])
axes[idx].set_title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()
Data Pre-processing¶
Feature Engineering and Mapping¶
Several string-based categorical variables need to be mapped to numeric format for modeling. Most of these are binary in nature, making them straightforward to encode.
- current_occupation can be converted to a binary feature by grouping Student and Unemployed together (non-professional = 0).
- last_activity can be reclassified to reflect digital vs. phone-based engagement.
- profile_completed has an inherent ordinal structure and can be mapped accordingly.
Binary Mapping (Yes/No ➝ 1/0):¶
- first_interaction: Mobile App = 1, Website = 0
- print_media_type1: Yes = 1, No = 0
- print_media_type2: Yes = 1, No = 0
- digital_media: Yes = 1, No = 0
- educational_channels: Yes = 1, No = 0
- referral: Yes = 1, No = 0
Custom Binary Mapping:¶
- current_occupation: Professional = 1, all others = 0
- last_activity: Phone Activity = 1, all others = 0
Ordinal Mapping:¶
- profile_completed: High = 3, Medium = 2, Low = 1
These mappings simplify the categorical features and ensure compatibility with most scikit-learn models.
# Mapping functions
occupation_map = lambda x: 1 if x == 'Professional' else 0
interaction_map = lambda x: 1 if x == 'Mobile App' else 0
yes_no_map = lambda x: 1 if x == 'Yes' else 0
profile_map = {'Low': 1, 'Medium': 2, 'High': 3}
activity_map = lambda x: 1 if x == 'Phone Activity' else 0
# Apply transformations to both train and test sets
def transform(df):
df = df.copy()
df['current_occupation'] = df['current_occupation'].map(occupation_map)
df['first_interaction'] = df['first_interaction'].map(interaction_map)
df['print_media_type1'] = df['print_media_type1'].map(yes_no_map)
df['print_media_type2'] = df['print_media_type2'].map(yes_no_map)
df['digital_media'] = df['digital_media'].map(yes_no_map)
df['educational_channels'] = df['educational_channels'].map(yes_no_map)
df['referral'] = df['referral'].map(yes_no_map)
df['profile_completed'] = df['profile_completed'].map(profile_map)
df['last_activity'] = df['last_activity'].map(activity_map)
return df
train_df_transformed = transform(train_df)
test_df_transformed = transform(test_df)
train_df_transformed.head()
| | ID | age | current_occupation | first_interaction | profile_completed | website_visits | time_spent_on_website | page_views_per_visit | last_activity | print_media_type1 | print_media_type2 | digital_media | educational_channels | referral | status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 647 | EXT648 | 45 | 1 | 0 | 2 | 5 | 77 | 8.676 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2201 | EXT2202 | 63 | 0 | 0 | 2 | 1 | 65 | 4.031 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3362 | EXT3363 | 54 | 0 | 0 | 3 | 2 | 90 | 3.816 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 617 | EXT618 | 56 | 1 | 1 | 3 | 4 | 1857 | 1.360 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1213 | EXT1214 | 42 | 1 | 0 | 3 | 5 | 1193 | 2.113 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
Binary Feature Distributions¶
Next, we examine the distributions of the binary features. Some are fairly balanced, while others show significant skew. These imbalances can influence model performance, especially in cases where the model implicitly assumes equal class probabilities.
Depending on the model used, techniques like class weighting or feature scaling may help mitigate any downstream effects.
# Select binary columns
binary_cols = [
'current_occupation', 'first_interaction', 'print_media_type1',
'print_media_type2', 'digital_media', 'educational_channels',
'referral', 'last_activity'
]
# Calculate proportions of 0s and 1s
binary_proportions = {
col: train_df_transformed[col].value_counts(normalize=True).sort_index()
for col in binary_cols
}
# Convert to DataFrame for display
binary_proportions_df = pd.DataFrame(binary_proportions).T
binary_proportions_df.columns = ['Proportion_0', 'Proportion_1']
binary_proportions_df
| | Proportion_0 | Proportion_1 |
|---|---|---|
| current_occupation | 0.430740 | 0.569260 |
| first_interaction | 0.552182 | 0.447818 |
| print_media_type1 | 0.892383 | 0.107617 |
| print_media_type2 | 0.947682 | 0.052318 |
| digital_media | 0.888046 | 0.111954 |
| educational_channels | 0.849553 | 0.150447 |
| referral | 0.979127 | 0.020873 |
| last_activity | 0.731092 | 0.268908 |
Model: Decision Tree¶
We trained a decision tree classifier and tuned several hyperparameters via grid search (details below). Model selection used accuracy, which remains reasonable here because the class imbalance is only mild. Even so, the choice of metric should be context-dependent.
Accuracy provides a general sense of performance across the confusion matrix, but it does not differentiate between the costs of false positives and false negatives. In a real-world setting, this trade-off depends heavily on the organization’s lead strategy:
- If pursuing leads is expensive, we may want to minimize false positives to avoid wasting resources on low-quality leads.
- If following up is inexpensive and high-converting leads are high value, it may be better to accept more false negatives, knowing that catching even a subset of qualified leads is still profitable.
Metric selection should align with the business objectives—not just model performance.
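To make that trade-off concrete, accuracy could be swapped for a cost-weighted scorer in the grid search below; this is a sketch with illustrative, assumed unit costs rather than business-validated figures.
from sklearn.metrics import confusion_matrix, make_scorer
def negative_lead_cost(y_true, y_pred, fp_cost=1.0, fn_cost=3.0):
    # Negated total cost so that "greater is better" for GridSearchCV;
    # fp_cost and fn_cost are placeholder values, not real business costs
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return -(fp * fp_cost + fn * fn_cost)
cost_scorer = make_scorer(negative_lead_cost)
# e.g. GridSearchCV(dtree, param_grid, cv=5, scoring=cost_scorer, n_jobs=-1)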
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
# Define features and target
X_train = train_df_transformed.drop(columns=['ID', 'status'])
y_train = train_df_transformed['status']
# Set up the Decision Tree and hyperparameter grid
dtree = DecisionTreeClassifier(random_state=42)
param_grid = {
'max_depth': [3, 5, 10, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'criterion': ['gini', 'entropy']
}
# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(dtree, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
# Best model and classification report (computed on the training set)
best_tree = grid_search.best_estimator_
y_pred = best_tree.predict(X_train)
report = classification_report(y_train, y_pred, output_dict=True)
pd.DataFrame(report).T
| | precision | recall | f1-score | support |
|---|---|---|---|---|
| 0 | 0.884418 | 0.934312 | 0.908681 | 2588.000000 |
| 1 | 0.821990 | 0.712988 | 0.763619 | 1101.000000 |
| accuracy | 0.868257 | 0.868257 | 0.868257 | 0.868257 |
| macro avg | 0.853204 | 0.823650 | 0.836150 | 3689.000000 |
| weighted avg | 0.865786 | 0.868257 | 0.865386 | 3689.000000 |
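For a leakage-free performance estimate, the 20% hold-out split created earlier could be scored the same way once model selection is finished; a minimal sketch follows (no test-set numbers are reported in this write-up).
# Score the tuned tree on the held-out test split
X_test = test_df_transformed.drop(columns=['ID', 'status'])
y_test = test_df_transformed['status']
print(classification_report(y_test, best_tree.predict(X_test)))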
Visualization of the Decision Tree¶
The decision tree visualization shows how the model splits across various features to classify leads. However, due to the tree’s depth and number of branches, the bottom layer (leaf nodes) becomes difficult to read.
While deeper trees can risk overfitting by capturing too much noise from the training data, we applied regularization techniques (such as limiting max depth and minimum samples per split) to help mitigate that risk. The tree remains interpretable at higher levels and reflects meaningful structure in the data.
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
plot_tree(
best_tree,
feature_names=X_train.columns,
class_names=['Not Converted', 'Converted'],
filled=True,
rounded=True,
fontsize=10
)
plt.title("Best Decision Tree")
plt.show()
Feature Importance¶
The feature importance scores provide insight into which variables the decision tree relied on most when making predictions. These values reflect the relative contribution of each feature to reducing impurity across all splits.
In this case, time spent on the website stands out as a particularly influential predictor, highlighting that engagement behavior is a key signal for lead conversion.
import pandas as pd
import matplotlib.pyplot as plt
# Get feature importances
importances = best_tree.feature_importances_
features = X_train.columns
# Create a DataFrame for easy sorting and visualization
feat_imp_df = pd.DataFrame({
'Feature': features,
'Importance': importances
}).sort_values(by='Importance', ascending=False)
# Plot
plt.figure(figsize=(10, 6))
plt.barh(feat_imp_df['Feature'], feat_imp_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importances from Decision Tree')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
Hyperparameters¶
Below are the optimal hyperparameters identified through Grid Search:
- Criterion: Gini index for measuring node purity
- Max Depth: 5 levels
- Min Samples per Split: 2 (the minimum value tried, allowing even very small splits)
This combination suggests that the model benefited from some depth to capture meaningful patterns, but didn’t require aggressive regularization at the split level. The data likely contained reasonably clear structure, allowing the tree to perform well without overfitting—even when splitting on smaller subsets.
# The final params
print("Best Hyperparameters:")
print(grid_search.best_params_)
Best Hyperparameters:
{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Model: Random Forest¶
The random forest model produced results very similar to the decision tree, with a slight improvement in recall for the status = 1 class. While this marginal gain is valuable, it comes at the cost of interpretability and increased computational complexity.
In many cases, I would favor the decision tree due to its explainability and faster processing—especially when working with stakeholders who need to understand and trust the model's behavior.
However, if the company prioritizes predictive performance over transparency, and processing time is not a concern, the random forest may be the better choice. This trade-off is something that should be discussed with stakeholders before deploying the model.
# Re-import so this cell can run standalone after a kernel restart
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Define features and target
X_train = train_df_transformed.drop(columns=['ID', 'status'])
y_train = train_df_transformed['status']
# Define Random Forest model and grid
rf = RandomForestClassifier(random_state=42)
rf_param_grid = {
'n_estimators': [100, 200],
'max_depth': [5, 10, None],
'min_samples_split': [2, 5],
'min_samples_leaf': [1, 2],
'criterion': ['gini', 'entropy']
}
# Perform Grid Search
rf_grid_search = GridSearchCV(rf, rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
# Evaluate (again on the training set, for comparability with the decision tree report)
best_rf = rf_grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_train)
rf_report = classification_report(y_train, y_pred_rf, output_dict=True)
pd.DataFrame(rf_report).T
| | precision | recall | f1-score | support |
|---|---|---|---|---|
| 0 | 0.904534 | 0.948223 | 0.925863 | 2588.000000 |
| 1 | 0.862705 | 0.764759 | 0.810785 | 1101.000000 |
| accuracy | 0.893467 | 0.893467 | 0.893467 | 0.893467 |
| macro avg | 0.883619 | 0.856491 | 0.868324 | 3689.000000 |
| weighted avg | 0.892050 | 0.893467 | 0.891517 | 3689.000000 |
Feature Importance¶
The feature importance rankings from the random forest closely mirror those of the decision tree. This consistency reinforces the idea that certain features—such as user engagement behaviors—are strong and reliable predictors of lead conversion, regardless of model complexity.
import pandas as pd
import matplotlib.pyplot as plt
# Extract and organize
rf_importances = best_rf.feature_importances_
rf_features = X_train.columns
# Create a DataFrame for sorting and display
rf_feat_imp_df = pd.DataFrame({
'Feature': rf_features,
'Importance': rf_importances
}).sort_values(by='Importance', ascending=False)
# Plot
plt.figure(figsize=(10, 6))
plt.barh(rf_feat_imp_df['Feature'], rf_feat_imp_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importances from Random Forest')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()
Do We Need to Prune?¶
This question came up as part of the course: should the decision tree be pruned?
Pruning is a form of regularization used to prevent overfitting by trimming back branches that capture noise instead of signal. In this case, explicit pruning after training isn’t necessary because we already applied regularization during training:
- We limited the maximum depth of the tree.
- We constrained the minimum number of samples required to split a node.
- We used 5-fold cross-validation to validate performance across different data splits.
These techniques effectively control the model’s complexity. So while pruning can be useful in more complex or overfitted trees, the current model is already well-regularized and no further pruning is needed.
Including this analysis reinforces a key takeaway: regularization doesn’t have to be post-hoc—it can (and should) be part of model design.
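For completeness, if post-hoc pruning were ever warranted, scikit-learn supports minimal cost-complexity pruning; the sketch below (reusing the existing X_train and y_train) shows one way a ccp_alpha value could be chosen.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
# Candidate pruning strengths from the cost-complexity pruning path
path = DecisionTreeClassifier(random_state=42).cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas[:-1]  # drop the alpha that collapses the tree to a single node
# Pick the alpha with the best cross-validated accuracy (illustrative selection criterion)
scores = [
    cross_val_score(DecisionTreeClassifier(random_state=42, ccp_alpha=a),
                    X_train, y_train, cv=5).mean()
    for a in ccp_alphas
]
best_alpha = ccp_alphas[scores.index(max(scores))]
print(f"Selected ccp_alpha: {best_alpha:.5f}")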
Actionable Insights & Recommendations¶
Key Takeaways from Feature Importance¶
Both the Decision Tree and Random Forest consistently identified the following as the most predictive features of successful lead conversion (status = 1):
- Time spent on website – Higher engagement time is strongly associated with increased conversion likelihood.
- First interaction channel – Leads who first engage via the mobile app tend to convert at higher rates.
- Profile completion level – More complete profiles are predictive of higher conversion potential.
- Current occupation – Professionals convert more frequently than students or the unemployed.
- Last activity type – Phone interactions are more predictive of conversion than web or email engagement.
Business Recommendations¶
- Prioritize mobile app optimization: Since mobile-first leads convert more often, investing in mobile UX and onboarding could significantly boost performance.
- Encourage profile completion: Implement nudges, tooltips, or incentives to improve profile completeness during onboarding.
- Segment by occupation: Tailor messaging and outreach for professionals, who exhibit stronger conversion behavior.
- Act on high-intent signals: Use behavioral indicators like high time-on-site or phone interaction to trigger accelerated follow-up or priority routing to sales.
- Reallocate budget from low-impact channels: Print media and some digital touchpoints showed minimal predictive power; consider redirecting those resources.
Model Choice Guidance¶
- Random Forest delivered slightly better accuracy and recall, particularly for converted leads—making it the stronger model in terms of performance.
- Decision Tree, while slightly less performant, provides greater transparency and faster computation. It’s ideal for teams that value explainability.
Additional Modeling Considerations¶
While tree-based models were effective and provided useful interpretability, I’d be interested in exploring alternative approaches as well. Methods such as logistic regression, gradient boosting (e.g., XGBoost), or even neural networks (for richer feature interaction modeling) could offer different performance profiles or insights.
Trying a wider range of models would also help test the stability of the feature importance rankings and confirm whether simpler linear methods could perform competitively in this context.
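As a concrete starting point for that comparison, a drop-in gradient boosting baseline could be cross-validated with the same folds and metric used above; this is a sketch assuming the existing X_train and y_train.
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score
# Cross-validated accuracy for a gradient boosting baseline, comparable to the tree and forest above
gb = HistGradientBoostingClassifier(random_state=42)
gb_scores = cross_val_score(gb, X_train, y_train, cv=5, scoring='accuracy')
print(f"Gradient boosting CV accuracy: {gb_scores.mean():.3f} (+/- {gb_scores.std():.3f})")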
Suggested Approach:
Use the Random Forest in production for its stronger performance, but leverage the Decision Tree to explain model behavior, guide strategic decisions, and support internal buy-in.
Future Work & Limitations¶
Limitations¶
- Model Scope: This analysis focused exclusively on decision trees and random forests. While these models performed well, they may not be optimal depending on future business objectives or new data.
- Feature Engineering: Minimal feature engineering was performed to keep the project lightweight. More nuanced features—such as interaction time per visit or temporal trends—could improve performance.
- Class Imbalance: While only mildly imbalanced (~30% positive class), the project did not explore techniques like SMOTE or class weighting, which could further refine recall or precision.
- Business Context: The dataset lacks explicit cost or revenue metrics. Without knowing the true cost of false positives/negatives, model evaluation relied on generalized assumptions rather than business-specific ROI.
Future Work¶
- Explore Alternative Models: Try logistic regression, gradient boosting (e.g., XGBoost), or neural networks to compare interpretability, scalability, and performance.
- Feature Enrichment: Create derived features such as time per visit or activity recency. Apply one-hot encoding or embedding approaches where needed.
- Cost-Sensitive Evaluation: Incorporate cost-weighted metrics or custom scoring to align with business goals—e.g., prioritize precision if follow-up is expensive.
- Deployment Readiness: Wrap the model into a pipeline, assess inference speed, and prepare for A/B testing or pilot deployment (a minimal pipeline sketch follows this list).
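As a first step toward that deployment-readiness item, the notebook's transform() function and the tuned random forest could be packaged into a single scikit-learn pipeline; this is a minimal sketch assuming the raw ExtraaLearn schema and the objects defined earlier.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
# Reuse the notebook's transform() to encode the categorical columns, then drop the
# identifier and target columns before the classifier sees the data
prepare = FunctionTransformer(
    lambda raw: transform(raw).drop(columns=['ID', 'status'], errors='ignore')
)
lead_scoring_pipeline = Pipeline(steps=[
    ('prepare', prepare),
    ('model', RandomForestClassifier(random_state=42, **rf_grid_search.best_params_)),
])
# Fit on the raw training frame; new raw leads can then be scored with .predict() / .predict_proba()
lead_scoring_pipeline.fit(train_df, train_df['status'])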
This project served as a fast baseline and could be expanded into a more robust lead-scoring system with additional time and context.