-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
266 lines (207 loc) · 8.5 KB
/
main.py
File metadata and controls
266 lines (207 loc) · 8.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
# Core data stack
import pandas as pd
import numpy as np
import re
import warnings

# scikit-learn: model selection, preprocessing, estimators, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

# Suppress all warnings globally (presumably to hide sklearn/pandas
# deprecation noise in batch runs) — NOTE(review): this also hides real
# warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')
def extract_features(df):
    """Engineer scalar features from the 'catalog_content' text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'catalog_content' column of free-form product text.

    Returns
    -------
    pandas.DataFrame
        A row-aligned copy of *df* (index reset) with the engineered
        feature columns appended, including a deterministic integer
        'brand_encoded' column.

    Notes
    -----
    'brand_encoded' is a CRC32 hash of the brand token.  Unlike the
    previous per-call ``LabelEncoder`` fit, the same brand now maps to the
    same code across separate calls (train vs. test), which fixes a
    train/test feature mismatch.
    """
    import zlib  # local stdlib import: stable brand hashing

    def process_text(text):
        """Turn one catalog string into a flat dict of scalar features."""
        if pd.isna(text):
            text = ""
        text = str(text).lower()
        features = {}

        # Basic text statistics
        features['text_length'] = len(text)
        features['word_count'] = len(text.split())

        # Numeric tokens (sizes, quantities, model numbers, ...)
        numbers = re.findall(r'\d+\.?\d*', text)
        if numbers:
            numbers = [float(n) for n in numbers]
            features['max_number'] = max(numbers)
            features['avg_number'] = np.mean(numbers)
            features['num_count'] = len(numbers)
            features['has_large_number'] = max(numbers) > 100
        else:
            features['max_number'] = 0
            features['avg_number'] = 0
            features['num_count'] = 0
            features['has_large_number'] = False

        # Brand proxy: first word of the description
        words = text.split()
        features['brand'] = words[0] if words else 'unknown'

        # Items-per-quantity (pack size): first matching pattern wins
        ipq_patterns = [
            r'ipq[:\s]+(\d+)',
            r'pack of (\d+)',
            r'(\d+)\s*pack',
            r'(\d+)\s*count',
            r'(\d+)\s*piece'
        ]
        ipq = 1
        for pattern in ipq_patterns:
            # text is already lowercased, so no IGNORECASE flag is needed
            match = re.search(pattern, text)
            if match:
                ipq = int(match.group(1))
                break
        features['ipq'] = ipq

        # Premium indicators
        premium_keywords = ['premium', 'luxury', 'professional', 'pro', 'ultra', 'advanced']
        features['is_premium'] = any(kw in text for kw in premium_keywords)

        # Category indicators (keyword presence per category)
        categories = {
            'electronics': ['electronic', 'digital', 'wireless', 'bluetooth', 'usb', 'battery'],
            'fashion': ['shirt', 'dress', 'clothing', 'wear', 'cotton', 'fabric'],
            'home': ['kitchen', 'home', 'furniture', 'decor', 'household'],
            'beauty': ['beauty', 'cosmetic', 'skincare', 'makeup', 'cream'],
            'sports': ['sport', 'fitness', 'exercise', 'gym', 'athletic'],
            'food': ['food', 'snack', 'drink', 'beverage', 'organic']
        }
        for cat, keywords in categories.items():
            features[f'is_{cat}'] = any(kw in text for kw in keywords)

        # Material indicators: count of distinct material words mentioned
        materials = ['wood', 'metal', 'plastic', 'glass', 'leather', 'ceramic']
        features['material_count'] = sum(1 for mat in materials if mat in text)

        # Size indicators
        size_keywords = ['small', 'medium', 'large', 'xl', 'size', 'inch', 'cm']
        features['has_size_info'] = any(kw in text for kw in size_keywords)

        # Special formatting markers
        features['has_bullet_points'] = 'bullet point' in text or '•' in text
        # Any non-ASCII character is treated as "emoji-like" content
        features['has_emoji'] = bool(re.search(r'[^\x00-\x7F]', text))
        return features

    # Build one feature dict per row, then a feature frame
    feature_dicts = df['catalog_content'].apply(process_text)
    feature_df = pd.DataFrame(feature_dicts.tolist())

    # Deterministic brand encoding: CRC32 of the brand token.  Identical
    # brands always get identical codes, independent of which rows are in
    # this particular DataFrame.
    feature_df['brand_encoded'] = feature_df['brand'].map(
        lambda b: zlib.crc32(b.encode('utf-8')))

    # Reset the index so the column-wise concat aligns row-by-row even if
    # *df* arrived with a non-default (e.g. filtered) index.
    result_df = pd.concat([df.reset_index(drop=True), feature_df], axis=1)
    return result_df
def train_models(X_train, y_train, X_val, y_val):
    """Fit the three base regressors and return them keyed by name.

    The validation arrays are accepted for interface compatibility but are
    not consumed here; evaluation happens in the caller.

    Returns a dict whose insertion order (random_forest, gradient_boosting,
    ridge) matches the ensemble weights used by ``predict_ensemble``.
    """
    specs = [
        ('random_forest', 'Random Forest',
         RandomForestRegressor(
             n_estimators=500,
             max_depth=20,
             min_samples_split=5,
             min_samples_leaf=2,
             random_state=42,
             n_jobs=-1,
         )),
        ('gradient_boosting', 'Gradient Boosting',
         GradientBoostingRegressor(
             n_estimators=500,
             max_depth=8,
             learning_rate=0.05,
             subsample=0.8,
             random_state=42,
         )),
        ('ridge', 'Ridge Regression',
         Ridge(alpha=1.0, random_state=42)),
    ]

    fitted = {}
    for key, label, estimator in specs:
        print(f"Training {label}")
        estimator.fit(X_train, y_train)
        fitted[key] = estimator
    return fitted
def predict_ensemble(models, X, weights=None):
    """Weighted-average the predictions of every model in *models*.

    Parameters
    ----------
    models : dict[str, estimator]
        Fitted estimators; iteration order determines weight assignment.
    X : array-like
        Feature matrix passed to each model's ``predict``.
    weights : sequence of float, optional
        One weight per model.  Defaults to (0.5, 0.3, 0.2) — RF, GB,
        Ridge, matching ``train_models``'s insertion order — when exactly
        three models are given, otherwise to equal weighting.  (The old
        code hard-coded the triple and crashed for any other model count.)

    Returns
    -------
    numpy.ndarray
        Ensemble predictions, floored at 0.01 so values stay positive.
    """
    predictions = [model.predict(X) for model in models.values()]
    if weights is None:
        # Historical default favours the random forest; fall back to a
        # uniform blend (np.average's behaviour for weights=None) if the
        # model count ever changes.
        weights = (0.5, 0.3, 0.2) if len(predictions) == 3 else None
    ensemble_pred = np.average(predictions, axis=0, weights=weights)
    # Floor the output: downstream treats these as prices, which must be
    # strictly positive.
    return np.maximum(ensemble_pred, 0.01)
def calculate_smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, in percent.

    A term's denominator is (|y_true| + |y_pred|) / 2; when both values
    are 0 that denominator is 0 and the naive formula yields NaN.  This
    implementation uses the standard convention that such terms contribute
    0 error instead.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    diff = np.abs(y_true - y_pred)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2
    # where denom == 0 both values are 0 -> perfect prediction -> 0 term
    terms = np.divide(diff, denom, out=np.zeros_like(diff), where=denom > 0)
    return np.mean(terms) * 100
def main():
    """End-to-end pipeline: load CSVs, engineer features, train the
    ensemble, report validation metrics, and write test predictions."""
    banner = "=" * 60
    print(banner)
    print("Solution")
    print(banner)

    # ---- Load data ----
    print("\nLoading data")
    train_df = pd.read_csv('dataset/train.csv')
    test_df = pd.read_csv('dataset/test.csv')
    print(f"Training samples: {len(train_df)}")
    print(f"Test samples: {len(test_df)}")

    # ---- Target summary ----
    print("\nPrice statistics:")
    price = train_df['price']
    print(f" Mean: ${price.mean():.2f}")
    print(f" Median: ${price.median():.2f}")
    print(f" Min: ${price.min():.2f}")
    print(f" Max: ${price.max():.2f}")

    # ---- Feature engineering ----
    print("\nFeature engineering")
    train_feat = extract_features(train_df)
    test_feat = extract_features(test_df)

    feature_cols = [
        'ipq', 'text_length', 'word_count',
        'max_number', 'avg_number', 'num_count', 'has_large_number',
        'is_premium', 'material_count', 'has_size_info',
        'has_bullet_points', 'has_emoji', 'brand_encoded',
        'is_electronics', 'is_fashion', 'is_home', 'is_beauty', 'is_sports', 'is_food'
    ]
    X_train = train_feat[feature_cols].fillna(0).values
    y_train = train_feat['price'].values
    print(f"Features used: {len(feature_cols)}")
    print(f"Feature names: {feature_cols}")

    # ---- Hold-out split and scaling ----
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_va_scaled = scaler.transform(X_va)

    # ---- Train ----
    print("\nTraining ensemble models")
    models = train_models(X_tr_scaled, y_tr, X_va_scaled, y_va)

    # ---- Validate ----
    print("\nValidation")
    val_pred = predict_ensemble(models, X_va_scaled)
    val_smape = calculate_smape(y_va, val_pred)
    val_mae = mean_absolute_error(y_va, val_pred)
    print(f"Validation SMAPE: {val_smape:.2f}%")
    print(f"Validation MAE: ${val_mae:.2f}")

    # ---- Predict on test ----
    print("\nGenerating test predictions")
    X_test_scaled = scaler.transform(test_feat[feature_cols].fillna(0).values)
    test_pred = predict_ensemble(models, X_test_scaled)

    # ---- Save submission ----
    output_df = pd.DataFrame({
        'sample_id': test_df['sample_id'],
        'price': test_pred
    })
    output_df.to_csv('test_out.csv', index=False)
    print("\nResults saved to test_out.csv")
    print(f"Total predictions: {len(output_df)}")
    print(f"Price range: ${output_df['price'].min():.2f} - ${output_df['price'].max():.2f}")
    print(f"Mean predicted price: ${output_df['price'].mean():.2f}")

    # ---- Feature importance (random forest only) ----
    print("\nTop 10 most important features:")
    importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': models['random_forest'].feature_importances_
    }).sort_values('importance', ascending=False)
    for _, row in importance.head(10).iterrows():
        print(f" {row['feature']}: {row['importance']:.3f}")


if __name__ == "__main__":
    main()