Skip to content

Commit

Permalink
Add views back in
Browse files Browse the repository at this point in the history
  • Loading branch information
betsybookwyrm committed Jun 2, 2023
1 parent 241deab commit c4890a2
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 25 deletions.
26 changes: 13 additions & 13 deletions docs/schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ erDiagram
string alt_text
text media_key PK
}
"user" {
"user_by_page" {
text name
text profile_image_url
text id PK
Expand All @@ -66,7 +66,7 @@ erDiagram
integer page_id PK, FK
text source_file FK
}
"tweet" {
"tweet_by_page" {
text id PK
integer page_id PK, FK
text reply_settings
Expand Down Expand Up @@ -107,15 +107,15 @@ erDiagram
user_hashtag |o--o{ user : "user"
tweet_mention |o--o{ tweet : "tweet"
user_mention |o--o{ user : "user"
user |o--o{ results_page : "page"
user |o--o{ results_page : "source file"
tweet |o--o{ results_page : "page"
tweet |o--o{ tweet : "retweeted tweet"
tweet |o--o{ tweet : "quoted tweet"
tweet |o--o{ tweet : "replied to tweet"
tweet |o--o{ user : "in reply to user"
tweet |o--o{ user : "author"
tweet |o--o{ results_page : "source file"
user_by_page |o--o{ results_page : "page"
user_by_page |o--o{ results_page : "source file"
tweet_by_page |o--o{ results_page : "page"
tweet_by_page |o--o{ tweet : "retweeted tweet"
tweet_by_page |o--o{ tweet : "quoted tweet"
tweet_by_page |o--o{ tweet : "replied to tweet"
tweet_by_page |o--o{ user : "in reply to user"
tweet_by_page |o--o{ user : "author"
tweet_by_page |o--o{ results_page : "source file"
```

Table **tweet_url**:
Expand Down Expand Up @@ -191,7 +191,7 @@ Table **media**:
- **media_key** (text primary key)


Table **user**:
Table **user_by_page**:

- **name** (text)
- **profile_image_url** (text)
Expand All @@ -210,7 +210,7 @@ Table **user**:
primary key


Table **tweet**:
Table **tweet_by_page**:

- **id** (text primary key )
- **page_id** (integer primary key references results_page (id))
Expand Down
4 changes: 4 additions & 0 deletions src/tidy_tweet/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ def initialise_sqlite(
{"version": mapping.SCHEMA_VERSION},
)

# Create views
for view_sql in mapping.sql_views.values():
cursor.execute(view_sql)

logger.info("The database schema has been initialised")


Expand Down
60 changes: 49 additions & 11 deletions src/tidy_tweet/tweet_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,9 +305,9 @@ def map_media(media_list_json) -> Dict[str, List[Dict]]:
# users
# TODO: Fields not included yet:
# - public_metrics
sql_by_table["user"] = {
sql_by_table["user_by_page"] = {
"create": """
create table user (
create table user_by_page (
name text,
profile_image_url text,
id text,
Expand All @@ -325,23 +325,41 @@ def map_media(media_list_json) -> Dict[str, List[Dict]]:
)
""",
"insert": """
insert or ignore into user (
insert or ignore into user_by_page (
id, username, name, url,
profile_image_url, description,
created_at,
protected, verified,
location,
pinned_tweet_id
pinned_tweet_id,
page_id, source_file
) values (
:id, :username, :name, :url,
:profile_image_url, :description,
:created_at,
:protected, :verified,
:location,
:pinned_tweet_id
:pinned_tweet_id,
:page_id, :source_file
)
""",
}
# View "user": collapses user_by_page to one row per user id and exposes the
# latest results_page.retrieved_at among the pages that user appeared on.
#
# The id column must be qualified: results_page also has an id column, so a
# bare "id" is ambiguous once the two tables are joined and SQLite rejects the
# view when it is queried. The "tweet" view qualifies its id the same way.
#
# NOTE: the remaining columns are "bare" columns in an aggregate query; with a
# single max() aggregate SQLite takes them from the row that holds the maximum,
# i.e. the most recently retrieved copy of the user.
sql_views["user"] = """
    create view user as
        select
            user_by_page.id, username, name, url,
            profile_image_url, description,
            created_at,
            protected, verified,
            location,
            pinned_tweet_id,
            max(retrieved_at) as retrieved_at
        from user_by_page
        left join results_page on user_by_page.page_id = results_page.id
        group by user_by_page.id
"""


def map_user(user_json, source_file, page_id) -> Dict[str, List[Dict]]:
Expand All @@ -361,7 +379,7 @@ def map_user(user_json, source_file, page_id) -> Dict[str, List[Dict]]:
"page_id": page_id,
}

mappings = {"user": [user_map]}
mappings = {"user_by_page": [user_map]}

# Entities
if "entities" in user_json:
Expand All @@ -378,9 +396,9 @@ def map_user(user_json, source_file, page_id) -> Dict[str, List[Dict]]:
# - entities
# - context_annotations

sql_by_table["tweet"] = {
sql_by_table["tweet_by_page"] = {
"create": """
create table tweet (
create table tweet_by_page (
id text,
page_id integer references results_page (id),
reply_settings text,
Expand All @@ -405,7 +423,7 @@ def map_user(user_json, source_file, page_id) -> Dict[str, List[Dict]]:
)
""",
"insert": """
insert or ignore into tweet (
insert or ignore into tweet_by_page (
id, author_id,
text, lang, source,
possibly_sensitive, reply_settings,
Expand All @@ -432,6 +450,26 @@ def map_user(user_json, source_file, page_id) -> Dict[str, List[Dict]]:
)
""",
}
# View "tweet": collapses tweet_by_page to one row per tweet id (group by
# tweet_by_page.id) and exposes the latest results_page.retrieved_at among the
# pages the tweet appeared on. The id column is qualified with its table name
# because results_page, which is joined in, also has an id column.
# NOTE(review): the other selected columns are not aggregated; SQLite documents
# that with a single max() they come from the row holding the maximum — confirm
# this is the intended "most recent copy wins" behaviour.
sql_views[
"tweet"
] = """
create view tweet as
select
tweet_by_page.id, author_id,
text, lang, source,
possibly_sensitive, reply_settings,
created_at,
conversation_id,
retweeted_tweet_id,
quoted_tweet_id,
replied_to_tweet_id,
in_reply_to_user_id,
like_count, quote_count, reply_count, retweet_count,
max(retrieved_at) as retrieved_at
from tweet_by_page
left join results_page on tweet_by_page.page_id = results_page.id
group by tweet_by_page.id
"""


def map_tweet(
Expand Down Expand Up @@ -479,7 +517,7 @@ def map_tweet(
tweet_map["quoted_tweet_id"] = qt_id
tweet_map["replied_to_tweet_id"] = replied_to_id

mappings = {"tweet": [tweet_map]}
mappings = {"tweet_by_page": [tweet_map]}

# Entities
if "entities" in tweet_json:
Expand Down Expand Up @@ -557,7 +595,7 @@ def map_page_metadata(
for key in key_columns:
metadata[key] = page_metadata_json.pop(key, None)

# Twarc metadata
# Twarc metadata
metadata["twarc_version"] = twarc_metadata_json.pop("version", None)
metadata["request_url"] = twarc_metadata_json.pop("url", None)
metadata["retrieved_at"] = twarc_metadata_json.pop("retrieved_at")
Expand Down
2 changes: 1 addition & 1 deletion tests/test_overall.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_load_timeline(tmp_path):
db.execute(
"""
select directly_collected, count(*)
from tweet
from tweet_by_page
group by directly_collected;
"""
)
Expand Down

0 comments on commit c4890a2

Please sign in to comment.