Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions channel_messages_stats.csv

Large diffs are not rendered by default.

238 changes: 238 additions & 0 deletions scripts_python/generate_channel_messages_reactions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
#!/usr/bin/env python3
"""
Generate mock reactions data for channel_messages CSV files.

This script takes an input CSV containing channel_messages data (without reactions)
and outputs a new CSV with a generated 'reactions' column containing realistic mock data.
"""

import argparse
import json
import random
from pathlib import Path

import pandas as pd


# Names of the reactions a mock message may receive. generate_reactions()
# samples from this list without replacement, so each name appears at most
# once per message.
REACTION_NAMES: list[str] = [
    "approved",
    "merged",
    "raised_hands",
    "pray",
    "laugh",
    "ty",
    "this",
    "point_up",
    "white_check_mark",
]

# Pool of made-up user IDs that generate_reactions() draws reacting users
# from (sampled without replacement per reaction).
MOCK_SLACK_USER_IDS: list[str] = [
    "U1FM88J9J",
    "UNC2V4ZD7",
    "U7W96XW2A",
    "US3QTX4Z1",
    "UQY0NPYWN",
    "UR257U86O",
    "UG6UG9470",
    "U0KG42Q0P",
    "UEA8H90WN",
    "UNNH8IW78",
    "U3GY5K9EQ",
    "UL9VWQ9S2",
    "UORS33I12",
    "U8S28EIM6",
    "UWL5JX23M",
    "UV77JEL7Q",
    "U2M260029",
    "UFF1HYHJ2",
    "UDC0ZA535",
    "UB81I8027",
    "U3HY0BS3W",
    "UL92954KA",
    "UZ5125Y28",
    "U168N7Y9G",
    "UE83A3954",
    "UP9T7VJBF",
    "UK839UQ5R",
    "UCL2RJ89X",
    "UFUV7GPHM",
    "UNVWFC5Q1",
    "UY640917T",
    "UQSF747QR",
    "U34AX16M0",
    "UWBXBIUB7",
    "U2QXC4ZNG",
    "UN979RN92",
    "UH749DC04",
    "UL2989703",
    "U4095YWYD",
    "U24SJD0U3",
    "U7ZP08017",
    "U0GBC0N90",
    "UU6K92DUR",
    "UNY6WDI60",
    "UJ65P6F44",
    "U3TE14ET2",
    "U96N45CL2",
    "UK4V3ON2O",
    "UKA77QAYY",
    "U81OT695K",
    "UPT27H737",
    "UOBK5OFZ0",
    "U0EIW7DOX",
    "UZ271WU40",
    "UYVM94HE1",
    "UEZMYV74R",
    "UYEP6YY91",
    "U9E255NN9",
    "ULRS0HVJ3",
    "U378U76S4",
    "UK756E555",
    "U6P582A60",
    "UL96240J4",
    "UU95TZPYP",
    "U306GEBY8",
    "U549234TU",
    "UIX8TSI0Y",
    "UE0WA7608",
    "U9YFOYM78",
    "UZ90K3JDW",
    "U5V38TNV1",
    "U0381889F",
    "UF39WLXAT",
    "UHGL04640",
    "UA7M10C0O",
    "UB51ZBPGJ",
    "UNS22LGIY",
    "UN0J0I7F0",
    "UH1039P7I",
    "UO513ARUP",
    "U3O3NUM10",
    "UKVN1ZI1Z",
    "ULAN263E4",
    "U7PX1XVTE",
    "UIDQ8QXI5",
    "UV4M04Y17",
    "U0D6182AR",
    "UC3Z15JC3",
    "U84A3Q05B",
    "UWW10VL06",
    "USI1BH3K3",
    "U2XQ3JJAN",
    "UYJ494VWS",
    "ULWL32B29",
    "U91Q0A72O",
    "U85PB3J63",
    "UA3IV9419",
    "UB9LLF16J",
    "U3U77Y17K",
    "U08ITMEA6",
]


def _count_weights(size: int) -> list[float]:
    """Return `size` selection weights that favour small counts, padding with the smallest weight."""
    base = [40, 25, 15, 10, 5, 3, 1, 0.5, 0.3, 0.2]
    return base[:size] + [base[-1]] * (size - len(base))


def generate_reactions() -> list[dict]:
    """
    Generate a random list of reactions for a message.

    Draws from the module-level `random` generator, so results are
    reproducible after `random.seed(...)`.

    Returns a list of reaction dicts (possibly empty), each containing:
    - name: reaction name from REACTION_NAMES (each name used at most once)
    - users: list of distinct user IDs from MOCK_SLACK_USER_IDS who reacted
    - count: number of users (equals len(users))
    """
    # Randomly decide how many reaction types (0 to all available), weighted
    # towards fewer reactions (more realistic). The weights are sized from the
    # population so that editing REACTION_NAMES cannot make random.choices
    # fail on a population/weights length mismatch.
    type_counts = range(len(REACTION_NAMES) + 1)
    num_reaction_types = random.choices(
        type_counts,
        weights=_count_weights(len(type_counts)),
        k=1
    )[0]

    if num_reaction_types == 0:
        return []

    # Select which reaction types to include
    selected_reactions = random.sample(REACTION_NAMES, num_reaction_types)

    # 1 to 10 users per reaction, weighted towards fewer users; never ask for
    # more users than the pool holds, which random.sample would reject.
    user_counts = range(1, min(10, len(MOCK_SLACK_USER_IDS)) + 1)
    user_weights = _count_weights(len(user_counts))

    reactions = []
    for reaction_name in selected_reactions:
        num_users = random.choices(user_counts, weights=user_weights, k=1)[0]

        # Select random users
        users = random.sample(MOCK_SLACK_USER_IDS, num_users)

        reactions.append({
            'name': reaction_name,
            'users': users,
            'count': len(users)
        })

    return reactions


def process_csv(input_path: Path, output_path: Path) -> None:
    """
    Read input CSV, generate reactions for each row, and write output CSV.

    Args:
        input_path: CSV file to read.
        output_path: Destination CSV; receives every input column plus a
            'reactions' column holding one JSON-encoded list per row. An
            existing 'reactions' column in the input is replaced.
    """
    # Read every column as text and switch off NA inference so the existing
    # data is written back exactly as it was read. With pandas' default type
    # inference, integer columns containing blanks come back as floats
    # ("3" -> "3.0"), decimal strings can lose trailing zeros, and values such
    # as "NA" are turned into empty cells.
    df = pd.read_csv(input_path, dtype=str, keep_default_na=False)
    df['reactions'] = [json.dumps(generate_reactions()) for _ in range(len(df))]
    df.to_csv(output_path, index=False)

    print(f"Processed {len(df)} rows")
    print(f"Output written to: {output_path}")


def main(argv=None) -> int:
    """
    Command-line entry point: parse arguments and add mock reactions to a CSV.

    Args:
        argv: Argument strings to parse instead of the process arguments.
            The default, None, makes argparse read sys.argv[1:] as before;
            passing a list lets other code drive the CLI directly.

    Returns:
        Exit status: 0 on success, 1 when the input file does not exist.
    """
    import sys  # local import: only needed to report errors on stderr

    parser = argparse.ArgumentParser(
        description="Generate mock reactions data for channel_messages CSV files."
    )
    parser.add_argument(
        "input_file",
        type=Path,
        help="Path to input CSV file containing channel_messages data"
    )
    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=None,
        help="Path to output CSV file (default: input_file with '_with_reactions' suffix)"
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Random seed for reproducible results"
    )

    args = parser.parse_args(argv)

    # Validate input file exists; diagnostics go to stderr so they do not mix
    # with the normal progress output on stdout.
    if not args.input_file.exists():
        print(f"Error: Input file not found: {args.input_file}", file=sys.stderr)
        return 1

    # Default output sits next to the input: "<stem>_with_reactions<suffix>"
    if args.output is None:
        output_path = args.input_file.with_stem(f"{args.input_file.stem}_with_reactions")
    else:
        output_path = args.output

    # Seed the module-level generator only when asked, so unseeded runs stay random
    if args.seed is not None:
        random.seed(args.seed)

    # Process the CSV
    process_csv(args.input_file, output_path)

    return 0


if __name__ == "__main__":
    # `exit` is a helper that the `site` module injects for interactive
    # sessions and it can be absent (for example under `python -S`). Raising
    # SystemExit passes main()'s return value as the process exit status
    # without relying on it.
    raise SystemExit(main())

34 changes: 34 additions & 0 deletions shared/projects/dbt/slack_analytics/dbt_project.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# dbt project definition for the `slack_analytics` project.
name: 'slack_analytics'
version: '1.0.0'
config-version: 2

# Name of the connection profile this project uses.
profile: 'slack-analytics-duckdb'

# These configurations specify where dbt should look for different types of files.
model-paths:
  - models
analysis-paths:
  - analyses
test-paths:
  - tests
seed-paths:
  - seeds
macro-paths:
  - macros
snapshot-paths:
  - snapshots

clean-targets: # Directories to be removed by `dbt clean`
  - dbt_packages
  - state
  - target

# Configuring models
# Full documentation: https://docs.getdbt.com/docs/configuring-models

# Project-wide default: every model in `slack_analytics` is built as a table.
models:
  slack_analytics:
    +materialized: table

# Project variables, read with var().
vars:
  # Default value of the `timezone` argument of the `to_date` macro.
  local_timezone: 'UTC'
15 changes: 15 additions & 0 deletions shared/projects/dbt/slack_analytics/macros/parse_json.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{#
  Render an expression that converts a JSON-encoded string into the
  warehouse's JSON representation.

  Dispatches (namespace `slack_analytics`) to the adapter-specific
  `<adapter>__parse_json` macro, or to `default__parse_json` when the active
  adapter has none.

  Args:
    column_name: column name or SQL expression, rendered verbatim.
#}
{% macro parse_json(column_name) %}
{{ return(adapter.dispatch('parse_json', 'slack_analytics')(column_name)) }}
{% endmacro %}

{#
  Fallback implementation: casts the expression with `::JSON`.
  Used by every adapter without its own `parse_json` implementation.
#}
{% macro default__parse_json(column_name) %}
{{ column_name }}::JSON
{% endmacro %}

{#
  Databricks implementation: calls FROM_JSON with the schema string 'STRING'.

  NOTE(review): FROM_JSON schemas are normally a struct, array or map type;
  confirm that a bare 'STRING' schema is accepted on the target runtime and
  yields the shape the callers expect.
#}
{% macro databricks__parse_json(column_name) %}
FROM_JSON({{ column_name }}, 'STRING')
{% endmacro %}

{# Snowflake implementation: wraps the expression in PARSE_JSON(...). #}
{% macro snowflake__parse_json(column_name) %}
PARSE_JSON({{ column_name }})
{% endmacro %}
27 changes: 27 additions & 0 deletions shared/projects/dbt/slack_analytics/macros/to_date.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{#
  Render an expression that converts a timestamp expression to a date.

  Dispatches (namespace `slack_analytics`) to the adapter-specific
  `<adapter>__to_date` macro, or to `default__to_date` when the active
  adapter has none.

  Args:
    timestamp: timestamp column or SQL expression, rendered verbatim.
    localize:  when true (the default), shift the timestamp into `timezone`
               before taking the date; otherwise convert it as-is.
    timezone:  target time zone name; defaults to the `local_timezone`
               project variable.
#}
{% macro to_date(timestamp, localize=True, timezone=var('local_timezone')) %}
{{ return(adapter.dispatch('to_date', 'slack_analytics')(timestamp, localize, timezone)) }}
{% endmacro %}

{#
  Fallback implementation, used by every adapter without its own `to_date`.

  With `localize`, the timestamp is passed through FROM_UTC_TIMESTAMP with the
  quoted `timezone` before TO_DATE, i.e. the input is treated as UTC.
  Without it, TO_DATE is applied directly.

  NOTE(review): presumably written for Spark/Databricks, where
  FROM_UTC_TIMESTAMP exists; confirm before relying on it for other adapters.
#}
{% macro default__to_date(timestamp, localize, timezone) %}
{% if localize %}
TO_DATE(FROM_UTC_TIMESTAMP({{ timestamp }}, '{{ timezone }}'))
{% else %}
TO_DATE({{ timestamp }})
{% endif %}
{% endmacro %}

{#
  DuckDB implementation.

  With `localize`, the expression is cast to TIMESTAMPTZ, passed to
  timezone('<timezone>', ...) and the result is cast to DATE.
  Without it, the expression is cast to DATE directly.
#}
{% macro duckdb__to_date(timestamp, localize, timezone) %}
{% if localize %}
CAST(timezone('{{ timezone }}', {{ timestamp }}::TIMESTAMPTZ) AS DATE)
{% else %}
CAST({{ timestamp }} AS DATE)
{% endif %}
{% endmacro %}

{#
  Snowflake implementation.

  With `localize`, the timestamp is converted with
  CONVERT_TIMEZONE('UTC', '<timezone>', ...) before TO_DATE, i.e. the input is
  treated as UTC. Without it, TO_DATE is applied directly.
#}
{% macro snowflake__to_date(timestamp, localize, timezone) %}
{% if localize %}
TO_DATE(CONVERT_TIMEZONE('UTC', '{{ timezone }}', {{ timestamp }}))
{% else %}
TO_DATE({{ timestamp }})
{% endif %}
{% endmacro %}
15 changes: 15 additions & 0 deletions shared/projects/dbt/slack_analytics/macros/unnest_array.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{#
  Render the FROM-clause fragment that expands an array column into one row
  per element.

  Dispatches (namespace `slack_analytics`) to `<adapter>__unnest_array`.

  Args:
    array_column: column or SQL expression holding the array, rendered verbatim.
    alias:        name given to the expanded output (its exact role differs
                  per adapter implementation).

  NOTE(review): this file defines databricks, snowflake and duckdb
  implementations but no `default__unnest_array`; confirm that other adapters
  are intentionally unsupported.
#}
{% macro unnest_array(array_column, alias) %}
{{ return(adapter.dispatch('unnest_array', 'slack_analytics')(array_column, alias)) }}
{% endmacro %}

{#
  Databricks implementation: emits `lateral view explode(...) as <alias>`.
  Unlike the snowflake and duckdb variants, the fragment has no leading comma.
#}
{% macro databricks__unnest_array(array_column, alias) %}
lateral view explode({{ array_column }}) as {{ alias }}
{% endmacro %}

{#
  Snowflake implementation: emits `, lateral flatten(input => ...) as <alias>`
  (note the leading comma, so it is appended after an existing FROM item).

  NOTE(review): here `alias` names the FLATTEN result as a whole, whereas the
  duckdb variant uses it as a column name; confirm callers reference the
  element consistently across adapters.
#}
{% macro snowflake__unnest_array(array_column, alias) %}
, lateral flatten(input => {{ array_column }}) as {{ alias }}
{% endmacro %}

{#
  DuckDB implementation: parses the expression with from_json(..., '["JSON"]')
  (a list of JSON values), unnests it, and exposes the elements as column
  `<alias>` of a table aliased `t`. The fragment starts with a comma, so it is
  appended after an existing FROM item.

  NOTE(review): the table alias `t` is fixed, so using this macro twice in one
  FROM clause, or next to another relation named `t`, would presumably clash.
#}
{% macro duckdb__unnest_array(array_column, alias) %}
, unnest(from_json({{ array_column }}, '["JSON"]')) as t({{ alias }})
{% endmacro %}
39 changes: 39 additions & 0 deletions shared/projects/dbt/slack_analytics/models/dimensions/_models.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Schema documentation and tests for the dimension models.
version: 2
models:
  - name: dim_slack_messages
    description: |
      This table contains one row per Slack message obtained from `stg_channel_messages`.
    columns:
      - name: message_id
        data_type: string
        description: Unique identifier for the message. Primary key.
        # Enforces the primary-key claim made in the description.
        tests:
          - unique
          - not_null
      - name: user_id
        data_type: string
        description: The user ID of the person who sent the message.
      - name: channel_name
        data_type: string
        description: The name of the channel the message was sent in.
      - name: reply_count
        data_type: int
        description: The number of replies to the message.
      - name: reply_users_count
        data_type: int
        description: The number of users who replied to the message.
      - name: reply_users
        data_type: array<string>
        description: An array of the users who replied to the message.
      # NOTE(review): the declared type carries a list of users and a count per
      # reaction, while the description says "the user who reacted" (singular).
      - name: reactions
        data_type: array<struct<name:string, users:array<string>, count:int>>
        description: An array of the reactions to the message including the reaction type and the user who reacted.
      - name: message_date
        data_type: date
        description: The date the message was sent.
      - name: message_datetime
        data_type: timestamp
        description: The date and time the message was sent.
      - name: extracted_datetime
        data_type: timestamp
        description: The date and time the message was extracted from the Slack API.
Loading