From c4890a21166fc488491076b4745a75f8c73594cc Mon Sep 17 00:00:00 2001 From: Elizabeth Alpert Date: Fri, 2 Jun 2023 15:13:28 +1000 Subject: [PATCH] Add views back in --- docs/schema.md | 26 +++++++------- src/tidy_tweet/database.py | 4 +++ src/tidy_tweet/tweet_mapping.py | 60 +++++++++++++++++++++++++++------ tests/test_overall.py | 2 +- 4 files changed, 67 insertions(+), 25 deletions(-) diff --git a/docs/schema.md b/docs/schema.md index e2ca1ef..2ebb06e 100644 --- a/docs/schema.md +++ b/docs/schema.md @@ -51,7 +51,7 @@ erDiagram string alt_text text media_key PK } - "user" { + "user_by_page" { text name text profile_image_url text id PK @@ -66,7 +66,7 @@ erDiagram integer page_id PK, FK text source_file FK } - "tweet" { + "tweet_by_page" { text id PK integer page_id PK, FK text reply_settings @@ -107,15 +107,15 @@ erDiagram user_hashtag |o--o{ user : "user" tweet_mention |o--o{ tweet : "tweet" user_mention |o--o{ user : "user" - user |o--o{ results_page : "page" - user |o--o{ results_page : "source file" - tweet |o--o{ results_page : "page" - tweet |o--o{ tweet : "retweeted tweet" - tweet |o--o{ tweet : "quoted tweet" - tweet |o--o{ tweet : "replied to tweet" - tweet |o--o{ user : "in reply to user" - tweet |o--o{ user : "author" - tweet |o--o{ results_page : "source file" + user_by_page |o--o{ results_page : "page" + user_by_page |o--o{ results_page : "source file" + tweet_by_page |o--o{ results_page : "page" + tweet_by_page |o--o{ tweet : "retweeted tweet" + tweet_by_page |o--o{ tweet : "quoted tweet" + tweet_by_page |o--o{ tweet : "replied to tweet" + tweet_by_page |o--o{ user : "in reply to user" + tweet_by_page |o--o{ user : "author" + tweet_by_page |o--o{ results_page : "source file" ``` Table **tweet_url**: @@ -191,7 +191,7 @@ Table **media**: - **media_key** (text primary key) -Table **user**: +Table **user_by_page**: - **name** (text) - **profile_image_url** (text) @@ -210,7 +210,7 @@ Table **user**: primary key -Table **tweet**: +Table **tweet_by_page**: 
- **id** (text primary key ) - **page_id** (integer primary key references results_page (id)) diff --git a/src/tidy_tweet/database.py b/src/tidy_tweet/database.py index 60f24fa..909a287 100644 --- a/src/tidy_tweet/database.py +++ b/src/tidy_tweet/database.py @@ -100,6 +100,10 @@ def initialise_sqlite( {"version": mapping.SCHEMA_VERSION}, ) + # Create views + for view_sql in mapping.sql_views.values(): + cursor.execute(view_sql) + logger.info("The database schema has been initialised") diff --git a/src/tidy_tweet/tweet_mapping.py b/src/tidy_tweet/tweet_mapping.py index ef25b13..5f82113 100644 --- a/src/tidy_tweet/tweet_mapping.py +++ b/src/tidy_tweet/tweet_mapping.py @@ -305,9 +305,9 @@ def map_media(media_list_json) -> Dict[str, List[Dict]]: # users # TODO: Fields not included yet: # - public_metrics -sql_by_table["user"] = { +sql_by_table["user_by_page"] = { "create": """ -create table user ( +create table user_by_page ( name text, profile_image_url text, id text, @@ -325,23 +325,41 @@ def map_media(media_list_json) -> Dict[str, List[Dict]]: ) """, "insert": """ -insert or ignore into user ( +insert or ignore into user_by_page ( id, username, name, url, profile_image_url, description, created_at, protected, verified, location, - pinned_tweet_id + pinned_tweet_id, + page_id, source_file ) values ( :id, :username, :name, :url, :profile_image_url, :description, :created_at, :protected, :verified, :location, - :pinned_tweet_id + :pinned_tweet_id, + :page_id, :source_file ) """, } +sql_views[ + "user" +] = """ +create view user as +select + user_by_page.id, username, name, url, + profile_image_url, description, + created_at, + protected, verified, + location, + pinned_tweet_id, + max(retrieved_at) as retrieved_at +from user_by_page +left join results_page on user_by_page.page_id = results_page.id +group by user_by_page.id +""" def map_user(user_json, source_file, page_id) -> Dict[str, List[Dict]]: @@ -361,7 +379,7 @@ def map_user(user_json, source_file, page_id) -> Dict[str, 
List[Dict]]: "page_id": page_id, } - mappings = {"user": [user_map]} + mappings = {"user_by_page": [user_map]} # Entities if "entities" in user_json: @@ -378,9 +396,9 @@ def map_user(user_json, source_file, page_id) -> Dict[str, List[Dict]]: # - entities # - context_annotations -sql_by_table["tweet"] = { +sql_by_table["tweet_by_page"] = { "create": """ -create table tweet ( +create table tweet_by_page ( id text, page_id integer references results_page (id), reply_settings text, @@ -405,7 +423,7 @@ def map_user(user_json, source_file, page_id) -> Dict[str, List[Dict]]: ) """, "insert": """ -insert or ignore into tweet ( +insert or ignore into tweet_by_page ( id, author_id, text, lang, source, possibly_sensitive, reply_settings, @@ -432,6 +450,26 @@ def map_user(user_json, source_file, page_id) -> Dict[str, List[Dict]]: ) """, } +sql_views[ + "tweet" +] = """ +create view tweet as +select + tweet_by_page.id, author_id, + text, lang, source, + possibly_sensitive, reply_settings, + created_at, + conversation_id, + retweeted_tweet_id, + quoted_tweet_id, + replied_to_tweet_id, + in_reply_to_user_id, + like_count, quote_count, reply_count, retweet_count, + max(retrieved_at) as retrieved_at +from tweet_by_page +left join results_page on tweet_by_page.page_id = results_page.id +group by tweet_by_page.id +""" def map_tweet( @@ -479,7 +517,7 @@ def map_tweet( tweet_map["quoted_tweet_id"] = qt_id tweet_map["replied_to_tweet_id"] = replied_to_id - mappings = {"tweet": [tweet_map]} + mappings = {"tweet_by_page": [tweet_map]} # Entities if "entities" in tweet_json: @@ -557,7 +595,7 @@ def map_page_metadata( for key in key_columns: metadata[key] = page_metadata_json.pop(key, None) - # Twarc metadata + # Twarc metadata metadata["twarc_version"] = twarc_metadata_json.pop("version", None) metadata["request_url"] = twarc_metadata_json.pop("url", None) metadata["retrieved_at"] = twarc_metadata_json.pop("retrieved_at") diff --git a/tests/test_overall.py b/tests/test_overall.py index 
45f8a86..9159d35 100644 --- a/tests/test_overall.py +++ b/tests/test_overall.py @@ -27,7 +27,7 @@ def test_load_timeline(tmp_path): db.execute( """ select directly_collected, count(*) - from tweet + from tweet_by_page group by directly_collected; """ )