analysis.py

Sure! Below is a detailed Python implementation for exploring and analyzing your dataset. I’ll break the implementation into steps, following the outlined analysis plan.

1. Data Preparation

Load the data and prepare for analysis.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('your_dataset.csv')

# Convert '_time' column to datetime for time-based analysis
df['_time'] = pd.to_datetime(df['_time'])

# Display the first few rows of the dataset
print(df.head())

# Check data types and null values
print(df.info())
print(df.isnull().sum())

2. Descriptive Analysis

Analyze proportions and feature distributions.

a. Proportion of Fraud vs. Non-Fraud

# Proportion of fraud (target = 1) vs. non-fraud (target = 0)
fraud_counts = df['target'].value_counts(normalize=True)
print(fraud_counts)

# Plot proportions
fraud_counts.plot(kind='bar')
plt.title('Proportion of Fraud vs Non-Fraud')
plt.xlabel('Target')
plt.ylabel('Proportion')
plt.show()

b. Feature Distributions

# Compare distributions for IP, device, and action
for col in ['IP', 'device', 'action']:
    plt.figure(figsize=(10, 5))
    sns.countplot(data=df, x=col, hue='target', order=df[col].value_counts().index[:10])
    plt.title(f'Distribution of {col} by Fraud Status')
    plt.xticks(rotation=45)
    plt.show()

3. Temporal Analysis

a. Fraud Over Time

# Aggregate fraud cases over time
time_fraud = df[df['target'] == 1].set_index('_time').resample('D').size()
time_non_fraud = df[df['target'] == 0].set_index('_time').resample('D').size()

# Plot fraud vs. non-fraud over time
plt.figure(figsize=(15, 5))
plt.plot(time_fraud, label='Fraud')
plt.plot(time_non_fraud, label='Non-Fraud')
plt.title('Fraud vs Non-Fraud Over Time')
plt.xlabel('Date')
plt.ylabel('Frequency')
plt.legend()
plt.show()

b. Time of Day Analysis

# Extract hour from '_time'
df['hour'] = df['_time'].dt.hour

# Plot fraud occurrences by hour
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='hour', hue='target')
plt.title('Fraud Occurrences by Hour')
plt.xlabel('Hour of the Day')
plt.ylabel('Count')
plt.show()

4. Interaction Analysis

a. Customer-Device-IP Mapping

# Unique devices and IPs per customer
customer_device_ip = df.groupby('customer').agg({'device': 'nunique', 'IP': 'nunique'}).reset_index()
customer_device_ip.columns = ['customer', 'unique_devices', 'unique_IPs']

# Compare fraud vs. non-fraud
fraud_device_ip = df[df['target'] == 1].groupby('customer').agg({'device': 'nunique', 'IP': 'nunique'})
non_fraud_device_ip = df[df['target'] == 0].groupby('customer').agg({'device': 'nunique', 'IP': 'nunique'})

# Plot comparisons
customer_device_ip.plot(kind='scatter', x='unique_devices', y='unique_IPs', alpha=0.5)
plt.title('Customer Unique Devices vs IPs')
plt.xlabel('Unique Devices')
plt.ylabel('Unique IPs')
plt.show()

b. Action Sequences

# Analyze action sequences
sequences = df.groupby('customer')['action'].apply(list)
print(sequences.head())

# Identify common sequences for fraud cases
fraud_sequences = df[df['target'] == 1].groupby('customer')['action'].apply(list)
print(fraud_sequences.value_counts().head())

5. Machine Learning Approach

a. Feature Engineering

# Generate features
df['unique_actions'] = df.groupby('customer')['action'].transform('nunique')
df['unique_IPs'] = df.groupby('customer')['IP'].transform('nunique')
df['session_length'] = df.groupby('customer')['_time'].transform(lambda x: (x.max() - x.min()).total_seconds())

# Check new features
print(df[['unique_actions', 'unique_IPs', 'session_length']].head())

b. Train a Model

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Prepare features and labels
X = df[['unique_actions', 'unique_IPs', 'session_length']]
y = df['target']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))

c. Explainability

import shap

# Explain the model predictions using SHAP
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Plot SHAP summary
shap.summary_plot(shap_values[1], X_test)

6. Anomaly Detection

from sklearn.ensemble import IsolationForest

# Train an Isolation Forest
iso = IsolationForest(random_state=42, contamination=0.1)
iso.fit(X)

# Identify anomalies
df['anomaly_score'] = iso.decision_function(X)
df['is_anomaly'] = iso.predict(X)  # -1 for anomaly, 1 for normal

# Compare fraud detection with anomalies
print(df.groupby(['target', 'is_anomaly']).size())

7. Visualization

a. Correlation Matrix

# Correlation matrix
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

b. Heatmap for Relationships

# Heatmap of customer-IP-device relationships
pivot = df.pivot_table(index='customer', columns='IP', values='device', aggfunc='count', fill_value=0)
sns.heatmap(pivot, cmap='viridis')
plt.title('Customer-IP-Device Relationships')
plt.show()

This implementation provides you with an extensive toolbox to identify and visualize patterns differentiating fraud and non-fraud cases. Let me know if you’d like further refinement or if you encounter specific challenges!