diff --git a/openlibrary/catalog/add_book/match.py b/openlibrary/catalog/add_book/match.py index 62feb7e4aba..2f9b2458022 100644 --- a/openlibrary/catalog/add_book/match.py +++ b/openlibrary/catalog/add_book/match.py @@ -438,13 +438,15 @@ def compare_publisher(e1: dict, e2: dict): return ('publisher', 'occur within the other', 100) elif short_part_publisher_match(e1_norm, e2_norm): return ('publisher', 'match', 100) - return ('publisher', 'mismatch', -25) + return ('publisher', 'mismatch', -51) if 'publishers' not in e1 or 'publishers' not in e2: return ('publisher', 'either missing', 0) -def threshold_match(rec1: dict, rec2: dict, threshold: int, debug: bool = False): +def threshold_match( + rec1: dict, rec2: dict, threshold: int, debug: bool = False +) -> bool: """ Determines (according to a threshold) whether two edition representations are sufficiently the same. Used when importing new books. @@ -460,12 +462,12 @@ def threshold_match(rec1: dict, rec2: dict, threshold: int, debug: bool = False) level1 = level1_match(e1, e2) total = sum(i[2] for i in level1) if debug: - print(f"E1: {e1}\nE2: {e2}") - print(f"TOTAL 1 = {total} : {level1}") + print(f"E1: {e1}\nE2: {e2}", flush=True) + print(f"TOTAL 1 = {total} : {level1}", flush=True) if total >= threshold: return True level2 = level2_match(e1, e2) total = sum(i[2] for i in level2) if debug: - print(f"TOTAL 2 = {total} : {level2}") + print(f"TOTAL 2 = {total} : {level2}", flush=True) return total >= threshold diff --git a/openlibrary/catalog/add_book/tests/test_match.py b/openlibrary/catalog/add_book/tests/test_match.py index 28246d79e16..9989cf12fe5 100644 --- a/openlibrary/catalog/add_book/tests/test_match.py +++ b/openlibrary/catalog/add_book/tests/test_match.py @@ -4,6 +4,7 @@ from openlibrary.catalog.add_book import load from openlibrary.catalog.add_book.match import ( + THRESHOLD, add_db_name, build_titles, compare_authors, @@ -290,7 +291,7 @@ def test_compare_publisher(): assert compare_publisher(foo, {}) == ('publisher', 'either missing', 0) assert compare_publisher({}, bar) == ('publisher', 'either missing', 0) assert compare_publisher(foo, foo2) == ('publisher', 'match', 100) - assert compare_publisher(foo, bar) == ('publisher', 'mismatch', -25) + assert compare_publisher(foo, bar) == ('publisher', 'mismatch', -51) assert compare_publisher(bar, both) == ('publisher', 'match', 100) assert compare_publisher(both, foo) == ('publisher', 'match', 100) @@ -370,3 +371,36 @@ def test_match_low_threshold(self): threshold = 515 assert threshold_match(e1, e2, threshold) is True assert threshold_match(e1, e2, threshold + 1) is False + + def test_matching_title_author_and_publish_year_but_not_publishers(self) -> None: + """ + Matching only title, author, and publish_year should not be sufficient for + meeting the match threshold if the publisher is truthy and doesn't match, + as a book published in different publishers in the same year would easily meet + the criteria. + """ + existing_edition = { + 'authors': [{'name': 'Edgar Lee Masters'}], + 'publish_date': '2022', + 'publishers': ['Creative Media Partners, LLC'], + 'title': 'Spoon River Anthology', + } + + potential_match1 = { + 'authors': [{'name': 'Edgar Lee Masters'}], + 'publish_date': '2022', + 'publishers': ['Standard Ebooks'], + 'title': 'Spoon River Anthology', + } + + assert threshold_match(existing_edition, potential_match1, THRESHOLD) is False + + potential_match2 = { + 'authors': [{'name': 'Edgar Lee Masters'}], + 'publish_date': '2022', + 'title': 'Spoon River Anthology', + } + + # If there i s no publisher and nothing else to match, the editions should be + # indistinguishable, and therefore matches. + assert threshold_match(existing_edition, potential_match2, THRESHOLD) is True