From a68b4bd67df653ffd617b8552a5f4c6a1349b701 Mon Sep 17 00:00:00 2001 From: Elizabeth Alpert Date: Wed, 9 Aug 2023 11:41:24 +1000 Subject: [PATCH] Add flag for nonstandard file encodings (e.g. UTF-16) --- src/tidy_tweet/__main__.py | 12 ++++++++++-- src/tidy_tweet/processing.py | 8 ++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/tidy_tweet/__main__.py b/src/tidy_tweet/__main__.py index 13eddc2..1961876 100644 --- a/src/tidy_tweet/__main__.py +++ b/src/tidy_tweet/__main__.py @@ -24,8 +24,16 @@ help="Should the SQLite tables be created in strict mode (defaults to yes)? " "Irrelevant if adding files to an existing database.", ) +@click.option( + "--json_encoding", + type=str, + default=None, + help="If the json file/s you wish to load are encoded other than UTF-8, specify " + "encoding. If you don't know what this means and you're not getting any " + "decoding errors using tidy_tweet, you're all good!", +) def tidy_twarc_jsons( - database: Path, json_files: Collection[Union[str, PathLike]], strict + database: Path, json_files: Collection[Union[str, PathLike]], strict, json_encoding ): """ Tidies Twitter json collected with Twarc into relational tables. @@ -72,7 +80,7 @@ def tidy_twarc_jsons( for file in json_files: n = n + 1 # Count files for user messaging only click.echo(f"Loading {file} (file {n} of {num_files}) into {database}") - p = load_twarc_json_to_sqlite(file, database) + p = load_twarc_json_to_sqlite(file, database, json_encoding=json_encoding) total_pages = total_pages + p click.echo(f"{p} pages of Twitter results loaded from {file}") diff --git a/src/tidy_tweet/processing.py b/src/tidy_tweet/processing.py index c4c4a9d..29e13f3 100644 --- a/src/tidy_tweet/processing.py +++ b/src/tidy_tweet/processing.py @@ -81,7 +81,9 @@ def _load_page_object( def load_twarc_json_to_sqlite( - filename: Union[str, PathLike], db_name: Union[str, PathLike] + filename: Union[str, PathLike], + db_name: Union[str, PathLike], + json_encoding: str = None, ) -> int: """ Parses a json/jsonl file produced by a Twarc search and loads the Twitter data into @@ -95,7 +97,9 @@ def load_twarc_json_to_sqlite( :param db_name: The path to an existing sqlite database to load the data into :return: The number of pages of Twitter results loaded in this file """ - with open(filename, "r") as json_fh, sqlite3.connect(db_name) as connection: + with open(filename, "r", encoding=json_encoding) as json_fh, sqlite3.connect( + db_name + ) as connection: logger.info(f"Loading {filename} into {db_name}") page_num = 0