Skip to content

Commit

Permalink
Add flag for nonstandard file encodings (e.g. UTF-16)
Browse files: browse the repository at this point in the history
  • Loading branch information
betsybookwyrm committed Aug 9, 2023
1 parent 4d12822 commit a68b4bd
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 4 deletions.
12 changes: 10 additions & 2 deletions src/tidy_tweet/__main__.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -24,8 +24,16 @@
help="Should the SQLite tables be created in strict mode (defaults to yes)? "
"Irrelevant if adding files to an existing database.",
)
@click.option(
"--json_encoding",
type=str,
default=None,
help="If the json file/s you wish to load are encoded other than UTF-8, specify "
"encoding. If you don't know what this means and you're not getting any "
"decoding errors using tidy_tweet, you're all good!",
)
def tidy_twarc_jsons(
database: Path, json_files: Collection[Union[str, PathLike]], strict
database: Path, json_files: Collection[Union[str, PathLike]], strict, json_encoding
):
"""
Tidies Twitter json collected with Twarc into relational tables.
Expand Down Expand Up @@ -72,7 +80,7 @@ def tidy_twarc_jsons(
for file in json_files:
n = n + 1 # Count files for user messaging only
click.echo(f"Loading {file} (file {n} of {num_files}) into {database}")
p = load_twarc_json_to_sqlite(file, database)
p = load_twarc_json_to_sqlite(file, database, json_encoding=json_encoding)
total_pages = total_pages + p
click.echo(f"{p} pages of Twitter results loaded from {file}")

Expand Down
8 changes: 6 additions & 2 deletions src/tidy_tweet/processing.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -81,7 +81,9 @@ def _load_page_object(


def load_twarc_json_to_sqlite(
filename: Union[str, PathLike], db_name: Union[str, PathLike]
filename: Union[str, PathLike],
db_name: Union[str, PathLike],
json_encoding: str = None,
) -> int:
"""
Parses a json/jsonl file produced by a Twarc search and loads the Twitter data into
Expand All @@ -95,7 +97,9 @@ def load_twarc_json_to_sqlite(
:param db_name: The path to an existing sqlite database to load the data into
:return: The number of pages of Twitter results loaded in this file
"""
with open(filename, "r") as json_fh, sqlite3.connect(db_name) as connection:
with open(filename, "r", encoding=json_encoding) as json_fh, sqlite3.connect(
db_name
) as connection:
logger.info(f"Loading {filename} into {db_name}")

page_num = 0
Expand Down

0 comments on commit a68b4bd

Please sign in to comment.