-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVendor Boosting Regression - Metric Analysis (Random Forest).py
90 lines (75 loc) · 3.29 KB
/
Vendor Boosting Regression - Metric Analysis (Random Forest).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Vendor Boosting Regression - Metric Analysis (Random Forest)
# Importing libraries
import numpy as np
import pandas as pd
import pyodbc
import psycopg2
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot appearance for every figure drawn further down.
# One sns.set call is sufficient: each call re-applies the complete seaborn
# theme, so a preceding style="white" call would be overwritten immediately
# by this "whitegrid" one and leave no lasting effect.
sns.set(style="whitegrid", color_codes=True)
# Set the base font size after the seaborn theme, which installs its own
# font sizes and would otherwise replace this value.
plt.rc("font", size=14)
# Database connection.
# The password is requested interactively at run time, so it is never
# written into this script.
import getpass
database_password = getpass.getpass('Enter your database password:')
import sqlalchemy as sa
from sqlalchemy.engine.url import URL
# Assemble the connection URL. Host, database name and username are
# placeholder values that must be replaced with real ones.
url = URL.create(
    'postgresql+psycopg2',               # PostgreSQL dialect via the psycopg2 driver
    username='your_database_username',
    password=database_password,
    host='your_database_host',
    port=5432,
    database='your_database_name',
)
# The engine is the handle later statements use to run queries.
engine = sa.create_engine(url)
# Prepare the model input: one row per vendor with its detail-page traffic
# over the last six months, loaded into a DataFrame.
#
# website_traffic counts distinct (session, product) pairs. The two ids are
# joined with a '|' separator before DISTINCT is applied, because plain
# concatenation is ambiguous: the pairs ('ab', 'c') and ('a', 'bc') would
# both become 'abc' and be counted only once. As before, a pair with a NULL
# id concatenates to NULL and is not counted.
# NOTE(review): assumes '|' does not occur inside session or product ids —
# pick another separator if it can.
vendor_query = """
with vendor_data as (
    select
        vendor_id,
        count(distinct session_id || '|' || product_id)::numeric as website_traffic
    from your_table
    where request_date::date >= current_date - interval '6 months'
      and vendor_id <> ''
      and page_type_group in ('detail', 'details')
    group by vendor_id
)
select * from vendor_data;
"""
vendor_data = pd.read_sql_query(vendor_query, engine)
# Encode yes/no flags numerically: any cell equal to 'Y' becomes 1 and any
# cell equal to 'N' becomes 0; all other values are left as they are.
flag_encoding = {'Y': 1, 'N': 0}
vendor_data_f = vendor_data.replace(to_replace=flag_encoding)
# Quick sanity check of the result: first rows, then summary statistics.
for preview in (vendor_data_f.head, vendor_data_f.describe):
    print(preview())
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
# Scaling the data.
# The vendor identifier is removed first: it is a text key (the query compares
# it against ''), so it either cannot be standardised at all or would enter
# the model as a meaningless numeric feature. errors='ignore' keeps this step
# working when the identifier column is not part of the frame.
id_columns = ['vendor_id']
model_data = vendor_data_f.drop(columns=id_columns, errors='ignore')
scaler = StandardScaler()
vendor_data_f_scale = pd.DataFrame(
    scaler.fit_transform(model_data),
    columns=model_data.columns,
)
# Select columns for X and Y.
# y_column is a placeholder and must be set to a column of the query result.
y_column = 'your_target_column'
if y_column not in vendor_data_f_scale.columns:
    # Fail here with an actionable message instead of a bare KeyError at fit time.
    raise KeyError(
        f"Target column {y_column!r} not found; "
        f"available columns: {list(vendor_data_f_scale.columns)}"
    )
# Features are taken from the scaled frame so that they always match the
# columns that were actually standardised (identifier excluded).
x_columns = [col for col in vendor_data_f_scale.columns if col != y_column]
# Random Forest Regressor: 15 trees, "log2" feature sub-sampling per split,
# all CPU cores (n_jobs=-1), progress output enabled, fixed seed for
# repeatable results.
rf = RandomForestRegressor(
    n_estimators=15,
    max_features="log2",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
# Fit on the standardised frame: x_columns as inputs, y_column as the target.
features = vendor_data_f_scale[x_columns]
target = vendor_data_f_scale[y_column]
rf.fit(features, target)
# Feature importance.
# Pair each feature name with its importance score by position; the scores
# come back in the order of the columns the model was fitted on.
importance = rf.feature_importances_
important_features_dict = dict(zip(x_columns, importance))
# Rank from most to least important and print one (name, score) pair per line.
sorted_features = sorted(
    important_features_dict.items(),
    key=lambda item: item[1],
    reverse=True,
)
for feature in sorted_features:
    print(feature)
# Bar chart of the importances: one bar per feature, in the insertion order
# of important_features_dict.
feature_names = list(important_features_dict)
feature_scores = list(important_features_dict.values())
plt.figure(figsize=(10, 5))
plt.bar(feature_names, feature_scores)
plt.title('Random Forest Regressor Feature Importance')
plt.ylabel('importance')
plt.xticks(rotation=90)  # vertical tick labels
plt.show()
## Notes
## 1. Database Connection: The database connection details have been replaced with placeholders (`your_database_host`, `your_database_username`, etc.). You need to replace these with actual values.
## 2. SQL Query: The SQL query has been simplified and placeholders (`your_table`, `your_target_column`) are used. Modify the query according to your actual database schema.
## 3. Feature Selection: Adjust the feature selection (`x_columns`, `y_column`) based on your dataset.
## 4. Random Forest Regressor: I've set random_state to 42 for reproducibility, and you might want to adjust the parameters of `RandomForestRegressor` based on your specific needs.