Skip to content

Commit 857d890

Browse files
committed
WIP data script
1 parent 4b20655 commit 857d890

2 files changed

Lines changed: 242 additions & 1 deletion

File tree

Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Generate mock reactions data for channel_messages CSV files.
4+
5+
This script takes an input CSV containing channel_messages data (without reactions)
6+
and outputs a new CSV with a generated 'reactions' column containing realistic mock data.
7+
"""
8+
9+
import argparse
10+
import random
11+
from pathlib import Path
12+
13+
import pandas as pd
14+
15+
16+
# Reaction names that generate_reactions() may attach to a mock message.
REACTION_NAMES = [
    "approved", "merged", "raised_hands",
    "pray", "laugh", "ty",
    "this", "point_up", "white_check_mark",
]
27+
28+
# Pool of mock Slack user IDs that reacting users are sampled from.
# Order is preserved from the original list so seeded runs stay reproducible.
MOCK_SLACK_USER_IDS = [
    "U1FM88J9J", "UNC2V4ZD7", "U7W96XW2A", "US3QTX4Z1", "UQY0NPYWN",
    "UR257U86O", "UG6UG9470", "U0KG42Q0P", "UEA8H90WN", "UNNH8IW78",
    "U3GY5K9EQ", "UL9VWQ9S2", "UORS33I12", "U8S28EIM6", "UWL5JX23M",
    "UV77JEL7Q", "U2M260029", "UFF1HYHJ2", "UDC0ZA535", "UB81I8027",
    "U3HY0BS3W", "UL92954KA", "UZ5125Y28", "U168N7Y9G", "UE83A3954",
    "UP9T7VJBF", "UK839UQ5R", "UCL2RJ89X", "UFUV7GPHM", "UNVWFC5Q1",
    "UY640917T", "UQSF747QR", "U34AX16M0", "UWBXBIUB7", "U2QXC4ZNG",
    "UN979RN92", "UH749DC04", "UL2989703", "U4095YWYD", "U24SJD0U3",
    "U7ZP08017", "U0GBC0N90", "UU6K92DUR", "UNY6WDI60", "UJ65P6F44",
    "U3TE14ET2", "U96N45CL2", "UK4V3ON2O", "UKA77QAYY", "U81OT695K",
    "UPT27H737", "UOBK5OFZ0", "U0EIW7DOX", "UZ271WU40", "UYVM94HE1",
    "UEZMYV74R", "UYEP6YY91", "U9E255NN9", "ULRS0HVJ3", "U378U76S4",
    "UK756E555", "U6P582A60", "UL96240J4", "UU95TZPYP", "U306GEBY8",
    "U549234TU", "UIX8TSI0Y", "UE0WA7608", "U9YFOYM78", "UZ90K3JDW",
    "U5V38TNV1", "U0381889F", "UF39WLXAT", "UHGL04640", "UA7M10C0O",
    "UB51ZBPGJ", "UNS22LGIY", "UN0J0I7F0", "UH1039P7I", "UO513ARUP",
    "U3O3NUM10", "UKVN1ZI1Z", "ULAN263E4", "U7PX1XVTE", "UIDQ8QXI5",
    "UV4M04Y17", "U0D6182AR", "UC3Z15JC3", "U84A3Q05B", "UWW10VL06",
    "USI1BH3K3", "U2XQ3JJAN", "UYJ494VWS", "ULWL32B29", "U91Q0A72O",
    "U85PB3J63", "UA3IV9419", "UB9LLF16J", "U3U77Y17K", "U08ITMEA6",
]
130+
131+
132+
# Relative likelihoods shared by both random draws in generate_reactions():
# position 0 is the most likely outcome and each later position is rarer.
_COUNT_WEIGHTS = (40, 25, 15, 10, 5, 3, 1, 0.5, 0.3, 0.2)


def generate_reactions(
    reaction_names: "list[str] | None" = None,
    user_ids: "list[str] | None" = None,
) -> list[dict]:
    """
    Generate a random list of reactions for a message.

    Args:
        reaction_names: Sequence of reaction names to draw from. Defaults to
            the module-level REACTION_NAMES.
        user_ids: Sequence of user IDs to draw reacting users from. Defaults
            to the module-level MOCK_SLACK_USER_IDS.

    Returns a list of reaction dicts, each containing:
    - name: reaction name (each name appears at most once per message)
    - users: list of distinct user IDs who reacted
    - count: number of users (equals len(users))

    The list is empty when zero reaction types are drawn, or when either
    pool is empty. Uses the global ``random`` state, so ``random.seed``
    makes the output reproducible.
    """
    names = REACTION_NAMES if reaction_names is None else reaction_names
    pool = MOCK_SLACK_USER_IDS if user_ids is None else user_ids

    # A reaction needs both a name and at least one user.
    if not names or not pool:
        return []

    # Number of distinct reaction types: 0..len(names), capped by the number
    # of weights available. The weights are sliced to the population size so
    # that changing the name list can never trigger the length-mismatch
    # ValueError raised by random.choices.
    type_counts = range(min(len(names) + 1, len(_COUNT_WEIGHTS)))
    num_reaction_types = random.choices(
        type_counts,
        weights=_COUNT_WEIGHTS[:len(type_counts)],
        k=1,
    )[0]

    if num_reaction_types == 0:
        return []

    # Users per reaction: 1..10, never more than the pool can supply
    # (random.sample raises if asked for more items than exist).
    user_counts = range(1, min(len(pool), len(_COUNT_WEIGHTS)) + 1)
    user_weights = _COUNT_WEIGHTS[:len(user_counts)]

    reactions = []
    # random.sample picks without replacement, so names are unique per message.
    for reaction_name in random.sample(names, num_reaction_types):
        num_users = random.choices(user_counts, weights=user_weights, k=1)[0]
        users = random.sample(pool, num_users)
        reactions.append({
            'name': reaction_name,
            'users': users,
            'count': len(users),
        })

    return reactions
175+
176+
177+
def process_csv(input_path: Path, output_path: Path) -> None:
    """
    Read input CSV, generate reactions for each row, and write output CSV.

    Every row gets its own independently generated reactions list, stored as
    a JSON string in the 'reactions' column (an existing column of that name
    is overwritten). The result is written without the DataFrame index, and
    a short summary is printed to stdout.

    Args:
        input_path: CSV file to read.
        output_path: Destination CSV file.
    """
    # Imported here so the function carries its own dependency.
    import json

    df = pd.read_csv(input_path)

    # Serialize with json.dumps rather than str(): str() emits a Python repr
    # with single-quoted strings, which is not valid JSON, and the dbt staging
    # model (stg_channel_messages.sql) applies parse_json to this column.
    df['reactions'] = [json.dumps(generate_reactions()) for _ in range(len(df))]

    df.to_csv(output_path, index=False)

    print(f"Processed {len(df)} rows")
    print(f"Output written to: {output_path}")
192+
193+
194+
def main(argv: "list[str] | None" = None) -> int:
    """
    Command-line entry point.

    Args:
        argv: Argument list to parse. When None (the default), argparse
            reads the arguments from sys.argv.

    Returns:
        Process exit code: 0 on success, 1 when the input file is missing.
    """
    # Imported here so the function carries its own dependency.
    import sys

    parser = argparse.ArgumentParser(
        description="Generate mock reactions data for channel_messages CSV files."
    )
    parser.add_argument(
        "input_file",
        type=Path,
        help="Path to input CSV file containing channel_messages data",
    )
    parser.add_argument(
        "-o", "--output",
        type=Path,
        default=None,
        help="Path to output CSV file (default: input_file with '_with_reactions' suffix)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Random seed for reproducible results",
    )

    args = parser.parse_args(argv)

    # is_file() also rejects directories, which exists() would let through to
    # read_csv. Errors go to stderr so they do not mix with normal output.
    if not args.input_file.is_file():
        print(f"Error: Input file not found: {args.input_file}", file=sys.stderr)
        return 1

    # Default output sits next to the input: "<stem>_with_reactions<suffix>".
    if args.output is None:
        output_path = args.input_file.with_stem(f"{args.input_file.stem}_with_reactions")
    else:
        output_path = args.output

    # Seed the global RNG only when asked, so unseeded runs stay random.
    if args.seed is not None:
        random.seed(args.seed)

    process_csv(args.input_file, output_path)

    return 0
237+
238+
239+
if __name__ == "__main__":
    # Raise SystemExit directly: the bare exit() helper is installed by the
    # site module for interactive use and is absent under `python -S`.
    raise SystemExit(main())
241+

shared/projects/dbt/slack_analytics/models/staging/stg_channel_messages.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ renamed as (
1111
channel_name,
1212
reply_count,
1313
reply_users_count,
14-
{{ parse_json('reply_users') }} as reply_users,
14+
{{ parse_json('reply_users') }} as reply_users, --TODO: this is an array of strings, how do we handle this?
1515
{{ parse_json('reactions') }} as reactions,
1616
message_datetime,
1717
extracted_at

0 commit comments

Comments
 (0)