-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
266 lines (207 loc) · 8.5 KB
/
main.py
File metadata and controls
266 lines (207 loc) · 8.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
# Core data stack
import pandas as pd
import numpy as np
import re
import warnings

# scikit-learn: model selection, preprocessing, estimators, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

# Suppress all warnings globally (presumably to hide sklearn/pandas
# deprecation noise in batch runs) — NOTE(review): this also hides real
# warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')
def extract_features(df):
    """Engineer scalar features from the 'catalog_content' text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'catalog_content' column of free-form product text.

    Returns
    -------
    pandas.DataFrame
        A row-aligned copy of *df* (index reset) with the engineered
        feature columns appended, including a deterministic integer
        'brand_encoded' column.

    Notes
    -----
    'brand_encoded' is a CRC32 hash of the brand token.  Unlike the
    previous per-call ``LabelEncoder`` fit, the same brand now maps to the
    same code across separate calls (train vs. test), which fixes a
    train/test feature mismatch.
    """
    import zlib  # local stdlib import: stable brand hashing

    def process_text(text):
        """Turn one catalog string into a flat dict of scalar features."""
        if pd.isna(text):
            text = ""
        text = str(text).lower()
        features = {}

        # Basic text statistics
        features['text_length'] = len(text)
        features['word_count'] = len(text.split())

        # Numeric tokens (sizes, quantities, model numbers, ...)
        numbers = re.findall(r'\d+\.?\d*', text)
        if numbers:
            numbers = [float(n) for n in numbers]
            features['max_number'] = max(numbers)
            features['avg_number'] = np.mean(numbers)
            features['num_count'] = len(numbers)
            features['has_large_number'] = max(numbers) > 100
        else:
            features['max_number'] = 0
            features['avg_number'] = 0
            features['num_count'] = 0
            features['has_large_number'] = False

        # Brand proxy: first word of the description
        words = text.split()
        features['brand'] = words[0] if words else 'unknown'

        # Items-per-quantity (pack size): first matching pattern wins
        ipq_patterns = [
            r'ipq[:\s]+(\d+)',
            r'pack of (\d+)',
            r'(\d+)\s*pack',
            r'(\d+)\s*count',
            r'(\d+)\s*piece'
        ]
        ipq = 1
        for pattern in ipq_patterns:
            # text is already lowercased, so no IGNORECASE flag is needed
            match = re.search(pattern, text)
            if match:
                ipq = int(match.group(1))
                break
        features['ipq'] = ipq

        # Premium indicators
        premium_keywords = ['premium', 'luxury', 'professional', 'pro', 'ultra', 'advanced']
        features['is_premium'] = any(kw in text for kw in premium_keywords)

        # Category indicators (keyword presence per category)
        categories = {
            'electronics': ['electronic', 'digital', 'wireless', 'bluetooth', 'usb', 'battery'],
            'fashion': ['shirt', 'dress', 'clothing', 'wear', 'cotton', 'fabric'],
            'home': ['kitchen', 'home', 'furniture', 'decor', 'household'],
            'beauty': ['beauty', 'cosmetic', 'skincare', 'makeup', 'cream'],
            'sports': ['sport', 'fitness', 'exercise', 'gym', 'athletic'],
            'food': ['food', 'snack', 'drink', 'beverage', 'organic']
        }
        for cat, keywords in categories.items():
            features[f'is_{cat}'] = any(kw in text for kw in keywords)

        # Material indicators: count of distinct material words mentioned
        materials = ['wood', 'metal', 'plastic', 'glass', 'leather', 'ceramic']
        features['material_count'] = sum(1 for mat in materials if mat in text)

        # Size indicators
        size_keywords = ['small', 'medium', 'large', 'xl', 'size', 'inch', 'cm']
        features['has_size_info'] = any(kw in text for kw in size_keywords)

        # Special formatting markers
        features['has_bullet_points'] = 'bullet point' in text or '•' in text
        # Any non-ASCII character is treated as "emoji-like" content
        features['has_emoji'] = bool(re.search(r'[^\x00-\x7F]', text))
        return features

    # Build one feature dict per row, then a feature frame
    feature_dicts = df['catalog_content'].apply(process_text)
    feature_df = pd.DataFrame(feature_dicts.tolist())

    # Deterministic brand encoding: CRC32 of the brand token.  Identical
    # brands always get identical codes, independent of which rows are in
    # this particular DataFrame.
    feature_df['brand_encoded'] = feature_df['brand'].map(
        lambda b: zlib.crc32(b.encode('utf-8')))

    # Reset the index so the column-wise concat aligns row-by-row even if
    # *df* arrived with a non-default (e.g. filtered) index.
    result_df = pd.concat([df.reset_index(drop=True), feature_df], axis=1)
    return result_df
def train_models(X_train, y_train, X_val, y_val):
    """Fit the three base regressors and return them keyed by name.

    The validation arrays are accepted for interface compatibility but are
    not consumed here; evaluation happens in the caller.

    Returns a dict whose insertion order (random_forest, gradient_boosting,
    ridge) matches the ensemble weights used by ``predict_ensemble``.
    """
    specs = [
        ('random_forest', 'Random Forest',
         RandomForestRegressor(
             n_estimators=500,
             max_depth=20,
             min_samples_split=5,
             min_samples_leaf=2,
             random_state=42,
             n_jobs=-1,
         )),
        ('gradient_boosting', 'Gradient Boosting',
         GradientBoostingRegressor(
             n_estimators=500,
             max_depth=8,
             learning_rate=0.05,
             subsample=0.8,
             random_state=42,
         )),
        ('ridge', 'Ridge Regression',
         Ridge(alpha=1.0, random_state=42)),
    ]

    fitted = {}
    for key, label, estimator in specs:
        print(f"Training {label}")
        estimator.fit(X_train, y_train)
        fitted[key] = estimator
    return fitted
def predict_ensemble(models, X, weights=None):
    """Weighted-average the predictions of every model in *models*.

    Parameters
    ----------
    models : dict[str, estimator]
        Fitted estimators; iteration order determines weight assignment.
    X : array-like
        Feature matrix passed to each model's ``predict``.
    weights : sequence of float, optional
        One weight per model.  Defaults to (0.5, 0.3, 0.2) — RF, GB,
        Ridge, matching ``train_models``'s insertion order — when exactly
        three models are given, otherwise to equal weighting.  (The old
        code hard-coded the triple and crashed for any other model count.)

    Returns
    -------
    numpy.ndarray
        Ensemble predictions, floored at 0.01 so values stay positive.
    """
    predictions = [model.predict(X) for model in models.values()]
    if weights is None:
        # Historical default favours the random forest; fall back to a
        # uniform blend (np.average's behaviour for weights=None) if the
        # model count ever changes.
        weights = (0.5, 0.3, 0.2) if len(predictions) == 3 else None
    ensemble_pred = np.average(predictions, axis=0, weights=weights)
    # Floor the output: downstream treats these as prices, which must be
    # strictly positive.
    return np.maximum(ensemble_pred, 0.01)
def calculate_smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, in percent.

    A term's denominator is (|y_true| + |y_pred|) / 2; when both values
    are 0 that denominator is 0 and the naive formula yields NaN.  This
    implementation uses the standard convention that such terms contribute
    0 error instead.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    diff = np.abs(y_true - y_pred)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2
    # where denom == 0 both values are 0 -> perfect prediction -> 0 term
    terms = np.divide(diff, denom, out=np.zeros_like(diff), where=denom > 0)
    return np.mean(terms) * 100
def main():
    """End-to-end pipeline: load CSVs, engineer features, train the
    ensemble, report validation metrics, and write test predictions."""
    banner = "=" * 60
    print(banner)
    print("Solution")
    print(banner)

    # ---- Load data ----
    print("\nLoading data")
    train_df = pd.read_csv('dataset/train.csv')
    test_df = pd.read_csv('dataset/test.csv')
    print(f"Training samples: {len(train_df)}")
    print(f"Test samples: {len(test_df)}")

    # ---- Target summary ----
    print("\nPrice statistics:")
    price = train_df['price']
    print(f" Mean: ${price.mean():.2f}")
    print(f" Median: ${price.median():.2f}")
    print(f" Min: ${price.min():.2f}")
    print(f" Max: ${price.max():.2f}")

    # ---- Feature engineering ----
    print("\nFeature engineering")
    train_feat = extract_features(train_df)
    test_feat = extract_features(test_df)

    feature_cols = [
        'ipq', 'text_length', 'word_count',
        'max_number', 'avg_number', 'num_count', 'has_large_number',
        'is_premium', 'material_count', 'has_size_info',
        'has_bullet_points', 'has_emoji', 'brand_encoded',
        'is_electronics', 'is_fashion', 'is_home', 'is_beauty', 'is_sports', 'is_food'
    ]
    X_train = train_feat[feature_cols].fillna(0).values
    y_train = train_feat['price'].values
    print(f"Features used: {len(feature_cols)}")
    print(f"Feature names: {feature_cols}")

    # ---- Hold-out split and scaling ----
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_va_scaled = scaler.transform(X_va)

    # ---- Train ----
    print("\nTraining ensemble models")
    models = train_models(X_tr_scaled, y_tr, X_va_scaled, y_va)

    # ---- Validate ----
    print("\nValidation")
    val_pred = predict_ensemble(models, X_va_scaled)
    val_smape = calculate_smape(y_va, val_pred)
    val_mae = mean_absolute_error(y_va, val_pred)
    print(f"Validation SMAPE: {val_smape:.2f}%")
    print(f"Validation MAE: ${val_mae:.2f}")

    # ---- Predict on test ----
    print("\nGenerating test predictions")
    X_test_scaled = scaler.transform(test_feat[feature_cols].fillna(0).values)
    test_pred = predict_ensemble(models, X_test_scaled)

    # ---- Save submission ----
    output_df = pd.DataFrame({
        'sample_id': test_df['sample_id'],
        'price': test_pred
    })
    output_df.to_csv('test_out.csv', index=False)
    print("\nResults saved to test_out.csv")
    print(f"Total predictions: {len(output_df)}")
    print(f"Price range: ${output_df['price'].min():.2f} - ${output_df['price'].max():.2f}")
    print(f"Mean predicted price: ${output_df['price'].mean():.2f}")

    # ---- Feature importance (random forest only) ----
    print("\nTop 10 most important features:")
    importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': models['random_forest'].feature_importances_
    }).sort_values('importance', ascending=False)
    for _, row in importance.head(10).iterrows():
        print(f" {row['feature']}: {row['importance']:.3f}")


if __name__ == "__main__":
    main()