diff --git a/pbp/logging_helper.py b/pbp/logging_helper.py index 5c8ff65..05c89cb 100644 --- a/pbp/logging_helper.py +++ b/pbp/logging_helper.py @@ -45,3 +45,27 @@ def create_logger( ) return log + + +def create_logger_info(log_filename: str): + """ + Create a logger that prints only INFO messages to the console in a simple format (no log level). + Also logs to a file all messages at DEBUG level and above, with time and level. + Best used for scripts that don't need DEBUG level logging to the console. + :param log_filename: + The name of the log file to create (opened in "w" mode by loguru, which also closes it) + """ + loguru.logger.remove() + log = copy.deepcopy(loguru.logger) + info_format = "{message}" + default_format = "{time} {level} {message}" + log.add( + sys.stdout, + level="INFO", + format=info_format, + filter=lambda record: record["level"].name == "INFO", + ) + log.add( + sink=log_filename, mode="w", level="DEBUG", format=default_format, enqueue=True + ) + return log diff --git a/pbp/main_meta_generator.py b/pbp/main_meta_generator.py index af6f65b..a0d3cb0 100644 --- a/pbp/main_meta_generator.py +++ b/pbp/main_meta_generator.py @@ -2,11 +2,13 @@ from datetime import datetime from pathlib import Path +from pbp.logging_helper import create_logger_info from pbp.meta_gen.gen_nrs import NRSMetadataGenerator from pbp.meta_gen.gen_iclisten import IcListenMetadataGenerator from pbp.meta_gen.gen_soundtrap import SoundTrapMetadataGenerator from pbp.main_meta_generator_args import parse_arguments + # Some imports, in particular involving data processing, cause a delay that is # noticeable when just running the --help option. We get around this issue by # postponing the imports until actually needed. See the main() function. 
@@ -15,17 +17,6 @@ def main(): opts = parse_arguments() - # pylint: disable=import-outside-toplevel - from pbp.logging_helper import create_logger - - log = create_logger( - log_filename_and_level=( - f"{opts.output_dir}/{opts.recorder}{opts.start}_{opts.end}.log", - "INFO", - ), - console_level="INFO", - ) - log_dir = Path(opts.output_dir) json_dir = Path(opts.json_base_dir) if opts.xml_dir is None: @@ -42,6 +33,11 @@ def main(): start = datetime.strptime(opts.start, "%Y%m%d") end = datetime.strptime(opts.end, "%Y%m%d") + # format the parsed datetimes: opts.start/opts.end are str, which rejects a %Y%m%d format spec + log = create_logger_info( + f"{opts.output_dir}/{opts.recorder}{start:%Y%m%d}_{end:%Y%m%d}.log" + ) + try: if opts.recorder == "NRS": generator = NRSMetadataGenerator( diff --git a/pbp/main_plot.py b/pbp/main_plot.py index 6aabf2e..781cfe2 100644 --- a/pbp/main_plot.py +++ b/pbp/main_plot.py @@ -113,8 +113,6 @@ def main(): jpeg_filename=jpeg_filename, show=show, ) - if jpeg_filename is not None: - print(f" done: {jpeg_filename}") if __name__ == "__main__": diff --git a/pbp/meta_gen/gen_iclisten.py b/pbp/meta_gen/gen_iclisten.py index 2f76866..0935357 100644 --- a/pbp/meta_gen/gen_iclisten.py +++ b/pbp/meta_gen/gen_iclisten.py @@ -138,7 +138,7 @@ def run(self): ) ) - self.log.info( + self.log.debug( f"{self.log_prefix} Found {len(wav_files)} files to process that " f"cover the expanded period {start_dt} - {end_dt}" ) @@ -154,8 +154,8 @@ def run(self): wav_files.sort(key=lambda x: x.start) # create a dataframe from the wav files - self.log.info( - f"{self.log_prefix} Creating dataframe from {len(wav_files)} files " + self.log.debug( + f"{self.log_prefix} creating dataframe from {len(wav_files)} files " f"spanning {wav_files[0].start} to {wav_files[-1].start}..." 
) @@ -180,10 +180,16 @@ def run(self): except Exception as ex: self.log.exception(str(ex)) + # plot the daily coverage only for files that start on or after the start date; + # this is to avoid plotting any coverage for files only included for overlap plot_file = plot_daily_coverage( - InstrumentType.ICLISTEN, self.df, self.json_base_dir, self.start, self.end + InstrumentType.ICLISTEN, + self.df[self.df["start"] >= self.start], + self.json_base_dir, + self.start, + self.end, ) - self.log.info(f"Plot file: {plot_file}") + self.log.info(f"Coverage plot saved to {plot_file}") if __name__ == "__main__": diff --git a/pbp/meta_gen/gen_nrs.py b/pbp/meta_gen/gen_nrs.py index 248561a..98c318d 100644 --- a/pbp/meta_gen/gen_nrs.py +++ b/pbp/meta_gen/gen_nrs.py @@ -83,7 +83,7 @@ def run(self): if f_dt is None: continue if start_dt <= f_dt <= end_dt: - self.log.info(f"Found file {filename} with timestamp {f_dt}") + self.log.debug(f"Found file {filename} with timestamp {f_dt}") if ext == "*.flac": sound_files.append(FlacFile(self.log, str(filename), f_dt)) if ext == "*.wav": @@ -102,13 +102,13 @@ def run(self): if f_dt is None: continue if start_dt <= f_dt <= end_dt: - self.log.info(f"Found file {blob.name} with timestamp {f_dt}") + self.log.debug(f"Found file {blob.name} with timestamp {f_dt}") if re.search(r"\.flac$", blob.name): sound_files.append(FlacFile(self.log, f_path, f_dt)) if re.search(r"\.wav$", blob.name): sound_files.append(WavFile(self.log, f_path, f_dt)) # delay to avoid 400 error - if i % 100 == 0: + if i % 100 == 0 and i > 0: self.log.info( f"{i} files searched...found {len(sound_files)} files that match the search pattern" ) @@ -135,7 +135,7 @@ def run(self): for day in pd.date_range(self.start, self.end, freq="D"): try: # create a dataframe from the flac files - self.log.info( + self.log.debug( f"Creating dataframe from {len(sound_files)} " f"files spanning {sound_files[0].start} to {sound_files[-1].start} in self.json_base_dir..." 
) @@ -155,9 +155,14 @@ def run(self): except Exception as ex: self.log.exception(str(ex)) - # plot the daily coverage + # plot the daily coverage only for files that start on or after the start date; + # this is to avoid plotting any coverage for files only included for overlap plot_file = plot_daily_coverage( - InstrumentType.NRS, self.df, self.json_base_dir, self.start, self.end + InstrumentType.NRS, + self.df[self.df["start"] >= self.start], + self.json_base_dir, + self.start, + self.end, ) self.log.info(f"Coverage plot saved to {plot_file}") diff --git a/pbp/meta_gen/gen_soundtrap.py b/pbp/meta_gen/gen_soundtrap.py index 71d91a2..69b908f 100644 --- a/pbp/meta_gen/gen_soundtrap.py +++ b/pbp/meta_gen/gen_soundtrap.py @@ -121,7 +121,7 @@ def run(self): else: # if the audio_loc is a s3 url, then we need to list the files in buckets that cover the start and end # dates - self.log.info(f"Searching between {start_dt} and {end_dt}") + self.log.debug(f"Searching between {start_dt} and {end_dt}") client = boto3.client("s3", config=Config(signature_version=UNSIGNED)) paginator = client.get_paginator("list_objects") @@ -148,7 +148,7 @@ def run(self): if start_dt <= key_dt <= end_dt and key.endswith(".wav"): # download the associated xml file to the wav file and create a SoundTrapWavFile object try: - self.log.info(f"Downloading {key_xml} ...") + self.log.debug(f"Downloading {key_xml} ...") client.download_file(bucket, key_xml, xml_path) wav_files.append(SoundTrapWavFile(uri, xml_path, key_dt)) except Exception as ex: @@ -158,7 +158,7 @@ def run(self): continue self.log.info( - f"Found {len(wav_files)} files to process that cover the expanded period {start_dt} - {end_dt}" + f"Found {len(wav_files)} files to process that covers the expanded period {start_dt} - {end_dt}" ) if len(wav_files) == 0: @@ -168,7 +168,7 @@ def run(self): wav_files.sort(key=lambda x: x.start) # create a dataframe from the wav files - self.log.info( + self.log.debug( f"Creating dataframe from 
{len(wav_files)} files spanning " f"{wav_files[0].start} to {wav_files[-1].start}..." ) @@ -206,7 +206,7 @@ def run(self): # plot the daily coverage plot_file = plot_daily_coverage( InstrumentType.SOUNDTRAP, - self.df, + self.df[self.df["start"] >= self.start], self.json_base_dir, self.start, self.end, diff --git a/pbp/meta_gen/json_generator.py b/pbp/meta_gen/json_generator.py index d825361..856ecd8 100644 --- a/pbp/meta_gen/json_generator.py +++ b/pbp/meta_gen/json_generator.py @@ -71,10 +71,6 @@ def run(self): | ((self.raw_df["end"] >= self.day) & (self.raw_df["start"] < self.day)) ] - self.log.info( - f"Creating metadata for day {self.day} from {len(day_df)} files..." - ) - if len(day_df) == 0: self.log.warning(f"No metadata found for day {self.day}") return @@ -85,7 +81,7 @@ def run(self): day_df["end"] = pd.to_datetime(day_df["end"]) # get the file list that covers the requested day - self.log.info( + self.log.debug( f'Found {len(day_df)} files for day {self.day}, between {day_df.iloc[0]["start"]} and {day_df.iloc[-1]["end"]}' ) @@ -159,10 +155,6 @@ def run(self): except Exception as e: self.log.exception(f"Error correcting metadata for {self.day}. {e}") - finally: - self.log.info( - f"Done correcting metadata for {self.day}. Saved to {self.json_base_dir}" - ) def no_jitter(self, day_df: pd.DataFrame) -> pd.DataFrame: """ @@ -172,7 +164,7 @@ def no_jitter(self, day_df: pd.DataFrame) -> pd.DataFrame: :return: The corrected dataframe """ - self.log.info( + self.log.debug( "Using file start times as is, setting jitter to 0 and calculating end times." 
) # calculate the difference between each row start time and save as diff in a copy of the dataframe @@ -236,4 +228,6 @@ def save_day(self, day: datetime.datetime, day_df: pd.DataFrame, prefix: str = " output_path = Path(self.json_base_dir, str(day.year)) output_path.mkdir(parents=True, exist_ok=True) shutil.copy2(temp_metadata.as_posix(), output_path) - self.log.info(f"Wrote {output_path}/{temp_metadata.name}") + self.log.info( + f"Done correcting metadata for {self.day}. Saved to {output_path}/{temp_metadata.name}" + ) diff --git a/pbp/meta_gen/utils.py b/pbp/meta_gen/utils.py index 666b806..20ab86e 100644 --- a/pbp/meta_gen/utils.py +++ b/pbp/meta_gen/utils.py @@ -7,8 +7,13 @@ from datetime import datetime from pathlib import Path +import numpy as np import pandas as pd import matplotlib.pyplot as plt +import matplotlib.dates as mdates +from matplotlib.ticker import NullLocator + +from pbp.plot_const import DEFAULT_DPI class InstrumentType: @@ -108,37 +113,63 @@ def plot_daily_coverage( :param end: The end date of the recordings :return: The path to the plot file """ - # Create a plot of the dataframe with the x-axis as the month, and the y-axis as the daily recording coverage, - # which is percent of the day covered by recordings + # Create a plot of the dataframe with the x-axis as the month, and the y-axis as the daily recording coverage. 
+ # This is percent of the day covered by recordings plt.rcParams["text.usetex"] = False - df["duration"] = (df["end"] - df["start"]).dt.total_seconds() - ts_df = df[["start", "duration"]].copy() + plt.rcParams["axes.edgecolor"] = "black" + duration = (df["end"] - df["start"]).dt.total_seconds() + ts_df = df[["start"]].copy() + ts_df["duration"] = duration ts_df.set_index("start", inplace=True) daily_sum_df = ts_df.resample("D").sum() daily_sum_df["coverage"] = 100 * daily_sum_df["duration"] / 86400 daily_sum_df["coverage"] = daily_sum_df[ "coverage" ].round() # round to nearest integer - plot = daily_sum_df["coverage"].plot() - plot.set_ylabel("Daily % Recording") - plot.set_xlabel("Date") + if len(daily_sum_df) == 1: + # Add a row with a NaN coverage before and after the single day to avoid matplotlib + # warnings about automatically expanding the x-axis + daily_sum_df.loc[daily_sum_df.index[0] - pd.DateOffset(days=1)] = np.nan + daily_sum_df.loc[daily_sum_df.index[0] + pd.DateOffset(days=1)] = np.nan + plot = daily_sum_df["coverage"].plot( + linestyle="-", + markerfacecolor="none", + marker="o", + color="b", + markersize=5, + linewidth=1, + figsize=(8, 4), + ) + plot.set_ylabel("Daily % Recording", fontsize=8) + plot.set_xlabel("Date", fontsize=8) plot.set_xticks(daily_sum_df.index.values) - plot.set_ylim(0, 102) - # Angle the x-axis labels for better readability and force them to be in the format YYYY-MM-DD - plot.set_xticklabels([x.strftime("%Y-%m-%d") for x in daily_sum_df.index]) - plot.set_xticklabels(plot.get_xticklabels(), rotation=45, horizontalalignment="right") + plot.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d")) + # Maximum 15 ticks on the x-axis + # plot.xaxis.set_major_locator( + # MaxNLocator(nbins=min(15, len(daily_sum_df.index.values) - 1)) + # ) + plot.axes.set_facecolor("#E4E4F1") + # Rotate the x-axis labels for better readability + plt.xticks(rotation=45) + # Set both x and y axis tick label font size to 6 + 
plot.tick_params(axis="both", which="major", labelsize=6) + # Disable the minor ticks on the x-axis using NullLocator, as they are not needed + plot.xaxis.set_minor_locator(NullLocator()) + # Set the y-axis limits to 0-110 to avoid the plot being too close to the top + plot.set_ylim(0, 110) # Adjust the title based on the instrument type if instrument_type == InstrumentType.NRS: - plot.set_title("Daily Coverage of NRS Recordings") + plot.set_title("Daily Coverage of NRS Recordings", fontsize=11) elif instrument_type == InstrumentType.ICLISTEN: - plot.set_title("Daily Coverage of icListen Recordings") + plot.set_title("Daily Coverage of icListen Recordings", fontsize=11) elif instrument_type == InstrumentType.SOUNDTRAP: - plot.set_title("Daily Coverage of SoundTrap Recordings") - plot_file = Path(base_dir) / f"soundtrap_coverage_{start:%Y%m%d}_{end:%Y%m%d}.jpg" - dpi = 300 + plot.set_title("Daily Coverage of SoundTrap Recordings", fontsize=11) + plot_file = ( + Path(base_dir) + / f"{str(instrument_type).lower()}_coverage_{start:%Y%m%d}_{end:%Y%m%d}.jpg" + ) fig = plot.get_figure() - fig.set_size_inches(10, 5) - fig.set_dpi(dpi) - fig.savefig(plot_file.as_posix(), bbox_inches="tight") + fig.autofmt_xdate() + fig.savefig(plot_file.as_posix(), dpi=DEFAULT_DPI, bbox_inches="tight") plt.close(fig) return plot_file.as_posix() diff --git a/pbp/plotting.py b/pbp/plotting.py index f61ec89..820fb91 100644 --- a/pbp/plotting.py +++ b/pbp/plotting.py @@ -39,6 +39,9 @@ def plot_dataset_summary( :param jpeg_filename: If given, filename to save the plot to. :param show: Whether to show the plot. 
""" + plt.rcParams["text.usetex"] = False + plt.rcParams["axes.edgecolor"] = "black" + # Transpose psd array for plotting da = xr.DataArray.transpose(ds.psd) @@ -151,6 +154,7 @@ def plot_dataset_summary( plt.gcf().text(0.65, 0.91, "UTC") if jpeg_filename is not None: + print(f"Saving plot to {jpeg_filename}") plt.savefig(jpeg_filename, dpi=dpi) if show: plt.show() diff --git a/tests/test_meta_generator.py b/tests/test_meta_generator.py index 181f8a9..74e13d5 100644 --- a/tests/test_meta_generator.py +++ b/tests/test_meta_generator.py @@ -15,6 +15,7 @@ from pbp.meta_gen.gen_nrs import NRSMetadataGenerator from pbp.meta_gen.gen_soundtrap import SoundTrapMetadataGenerator from pbp.meta_gen.gen_iclisten import IcListenMetadataGenerator +from pbp.meta_gen.utils import InstrumentType # which is .gitignore'ed OUT_BASE_DIR = Path("tests/json_generator_tmp") @@ -50,7 +51,7 @@ def test_soundtrap_generator_s3(): :return: """ log = create_test_logger("test_soundtrap_generator_s3") - json_dir = create_json_dir("soundtrap") + json_dir = create_json_dir("soundtrap_s3") start = datetime(2023, 7, 15) end = datetime(2023, 7, 16) @@ -68,8 +69,8 @@ def test_soundtrap_generator_s3(): # There should be two files in the json directory - one for each day json_files = list(json_dir.rglob("*.json")) assert len(json_files) == 2 - assert (json_dir / "2023/20230715.json").exists() - assert (json_dir / "2023/20230716.json").exists() + assert (json_dir / "2023" / "20230715.json").exists() + assert (json_dir / "2023" / "20230716.json").exists() # Each file should have 5 json objects for json_file in json_files: @@ -78,7 +79,9 @@ def test_soundtrap_generator_s3(): assert len(json_objects) == 5 # There should also be a coverage plot in the base json directory - coverage_plot = json_dir / "soundtrap_coverage_20230715_20230716.jpg" + coverage_plot = ( + json_dir / f"{InstrumentType.SOUNDTRAP.lower()}_coverage_20230715_20230716.jpg" + ) assert coverage_plot.exists() @@ -90,7 +93,7 @@ def 
test_soundtrap_generator_local(): :return: """ log = create_test_logger("test_soundtrap_generator_local") - json_dir = create_json_dir("soundtrap") + json_dir = create_json_dir("soundtrap_local") wav_dir = Path(__file__).parent / "wav" / "soundtrap" wav_dir.mkdir(exist_ok=True, parents=True) @@ -115,7 +118,7 @@ def test_soundtrap_generator_local(): uri=f"file://{wav_dir.as_posix()}", json_base_dir=json_dir.as_posix(), prefixes=["6716"], - xml_dir=f"{wav_dir.as_posix()}", + xml_dir=wav_dir.as_posix(), start=start, end=end, ) @@ -124,7 +127,7 @@ def test_soundtrap_generator_local(): # There should be two files in the json directory - one for each day json_files = list(json_dir.rglob("*.json")) assert len(json_files) == 1 - assert (json_dir / "2022/20221116.json").exists() + assert (json_dir / "2022" / "20221116.json").exists() # The file should have 1 json object for json_file in json_files: @@ -133,7 +136,9 @@ def test_soundtrap_generator_local(): assert len(json_objects) == 1 # There should also be a coverage plot in the base json directory - coverage_plot = json_dir / "soundtrap_coverage_20221116_20221116.jpg" + coverage_plot = ( + json_dir / f"{InstrumentType.SOUNDTRAP.lower()}_coverage_20221116_20221116.jpg" + ) assert coverage_plot.exists() @@ -165,7 +170,7 @@ def test_iclisten_generator(): # There should be one files in the json directory named 20230718.json and it should have 145 json objects json_files = list(json_dir.rglob("*.json")) assert len(json_files) == 1 - json_file = json_dir / "2023/20230718.json" + json_file = json_dir / "2023" / "20230718.json" assert json_file.exists() # Read the file and check the number of json objects @@ -174,7 +179,9 @@ def test_iclisten_generator(): assert len(json_objects) == 145 # There should also be a coverage plot in the base json directory - coverage_plot = json_dir / "soundtrap_coverage_20230718_20230718.jpg" + coverage_plot = ( + json_dir / f"{InstrumentType.ICLISTEN.lower()}_coverage_20230718_20230718.jpg" + ) 
assert coverage_plot.exists() @@ -204,7 +211,7 @@ def test_nrs_generator(): # There should be one file in the json directory and with number of objects as indicated json_files = list(json_dir.rglob("*.json")) assert len(json_files) == 1 - json_file = json_dir / "2019/20191024.json" + json_file = json_dir / "2019" / "20191024.json" assert json_file.exists() # Read the file and check the number of json objects @@ -213,7 +220,9 @@ def test_nrs_generator(): assert len(json_objects) == 7 # There should also be a coverage plot in the base json directory - coverage_plot = json_dir / "soundtrap_coverage_20191024_20191024.jpg" + coverage_plot = ( + json_dir / f"{InstrumentType.NRS.lower()}_coverage_20191024_20191024.jpg" + ) assert coverage_plot.exists()