Skip to content

Commit

Permalink
Merge pull request #94 from Sandrro/feat/parser_update
Browse files (browse the repository at this point in the history)
updated vk parser
  • Loading branch information
Sandrro authored Sep 19, 2024
2 parents 028fce3 + fa4eef3 commit 55a2823
Showing 1 changed file with 23 additions and 16 deletions.
39 changes: 23 additions & 16 deletions sloyka/src/utils/data_getter/vk_data_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,11 +244,15 @@ def run_comments(domain, post_ids, access_token):
for post_id in tqdm(post_ids):
comments = VKParser().get_comments(owner_id, post_id, access_token)
all_comments.extend(comments)
df = VKParser.comments_to_dataframe(all_comments)
df["type"] = "comment"
df = df.reset_index(drop=True)
print("comments downloaded")
return df
if len(all_comments) > 0:
df = VKParser.comments_to_dataframe(all_comments)
df["type"] = "comment"
df = df.reset_index(drop=True)
print("comments downloaded")
return df
else:
print("no comments")
return None

@staticmethod
def run_parser(domain, access_token, cutoff_date, number_of_messages=float("inf"), step=100):
Expand All @@ -267,17 +271,20 @@ def run_parser(domain, access_token, cutoff_date, number_of_messages=float("inf"
post_ids = df_posts["id"].tolist()

df_comments = VKParser.run_comments(domain=owner_id, post_ids=post_ids, access_token=access_token)
df_comments.loc[df_comments["parents_stack"].apply(lambda x: len(x) > 0), "type"] = "reply"
for i in range(len(df_comments)):
tmp = df_comments["parents_stack"].iloc[i]
if tmp is not None:
if len(tmp) > 0:
df_comments["parents_stack"].iloc[i] = tmp[0]
else:
df_comments["parents_stack"].iloc[i] = None

df_combined = df_comments.join(df_posts, on="post_id", rsuffix="_post")
df_combined = pd.concat([df_posts, df_comments], ignore_index=True)
if df_comments is not None:
df_comments.loc[df_comments["parents_stack"].apply(lambda x: len(x) > 0), "type"] = "reply"
for i in range(len(df_comments)):
tmp = df_comments["parents_stack"].iloc[i]
if tmp is not None:
if len(tmp) > 0:
df_comments["parents_stack"].iloc[i] = tmp[0]
else:
df_comments["parents_stack"].iloc[i] = None

df_combined = df_comments.join(df_posts, on="post_id", rsuffix="_post")
df_combined = pd.concat([df_posts, df_comments], ignore_index=True)
else:
df_combined = df_posts
df_group_name = VKParser.get_group_name(domain, access_token)
df_combined["group_name"] = df_group_name["group_name"][0]

Expand Down

0 comments on commit 55a2823

Please sign in to comment.