Skip to content

Commit

Permalink
Merge pull request #9389 from scottbarnes/9387/fix/importbot-ignoring…
Browse files Browse the repository at this point in the history
…-publishers-field-when-resolving-editions

Imports: increase point penalty for non-matching publishers
  • Loading branch information
cdrini authored Jun 7, 2024
2 parents e982ea1 + d132842 commit 672c5ed
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 6 deletions.
12 changes: 7 additions & 5 deletions openlibrary/catalog/add_book/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,13 +438,15 @@ def compare_publisher(e1: dict, e2: dict):
return ('publisher', 'occur within the other', 100)
elif short_part_publisher_match(e1_norm, e2_norm):
return ('publisher', 'match', 100)
return ('publisher', 'mismatch', -25)
return ('publisher', 'mismatch', -51)

if 'publishers' not in e1 or 'publishers' not in e2:
return ('publisher', 'either missing', 0)


def threshold_match(rec1: dict, rec2: dict, threshold: int, debug: bool = False):
def threshold_match(
rec1: dict, rec2: dict, threshold: int, debug: bool = False
) -> bool:
"""
Determines (according to a threshold) whether two edition representations are
sufficiently the same. Used when importing new books.
Expand All @@ -460,12 +462,12 @@ def threshold_match(rec1: dict, rec2: dict, threshold: int, debug: bool = False)
level1 = level1_match(e1, e2)
total = sum(i[2] for i in level1)
if debug:
print(f"E1: {e1}\nE2: {e2}")
print(f"TOTAL 1 = {total} : {level1}")
print(f"E1: {e1}\nE2: {e2}", flush=True)
print(f"TOTAL 1 = {total} : {level1}", flush=True)
if total >= threshold:
return True
level2 = level2_match(e1, e2)
total = sum(i[2] for i in level2)
if debug:
print(f"TOTAL 2 = {total} : {level2}")
print(f"TOTAL 2 = {total} : {level2}", flush=True)
return total >= threshold
36 changes: 35 additions & 1 deletion openlibrary/catalog/add_book/tests/test_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from openlibrary.catalog.add_book import load
from openlibrary.catalog.add_book.match import (
THRESHOLD,
add_db_name,
build_titles,
compare_authors,
Expand Down Expand Up @@ -290,7 +291,7 @@ def test_compare_publisher():
assert compare_publisher(foo, {}) == ('publisher', 'either missing', 0)
assert compare_publisher({}, bar) == ('publisher', 'either missing', 0)
assert compare_publisher(foo, foo2) == ('publisher', 'match', 100)
assert compare_publisher(foo, bar) == ('publisher', 'mismatch', -25)
assert compare_publisher(foo, bar) == ('publisher', 'mismatch', -51)
assert compare_publisher(bar, both) == ('publisher', 'match', 100)
assert compare_publisher(both, foo) == ('publisher', 'match', 100)

Expand Down Expand Up @@ -370,3 +371,36 @@ def test_match_low_threshold(self):
threshold = 515
assert threshold_match(e1, e2, threshold) is True
assert threshold_match(e1, e2, threshold + 1) is False

def test_matching_title_author_and_publish_year_but_not_publishers(self) -> None:
"""
Matching only title, author, and publish_year should not be sufficient for
meeting the match threshold if the publisher is truthy and doesn't match,
as a book published in different publishers in the same year would easily meet
the criteria.
"""
existing_edition = {
'authors': [{'name': 'Edgar Lee Masters'}],
'publish_date': '2022',
'publishers': ['Creative Media Partners, LLC'],
'title': 'Spoon River Anthology',
}

potential_match1 = {
'authors': [{'name': 'Edgar Lee Masters'}],
'publish_date': '2022',
'publishers': ['Standard Ebooks'],
'title': 'Spoon River Anthology',
}

assert threshold_match(existing_edition, potential_match1, THRESHOLD) is False

potential_match2 = {
'authors': [{'name': 'Edgar Lee Masters'}],
'publish_date': '2022',
'title': 'Spoon River Anthology',
}

# If there i s no publisher and nothing else to match, the editions should be
# indistinguishable, and therefore matches.
assert threshold_match(existing_edition, potential_match2, THRESHOLD) is True

0 comments on commit 672c5ed

Please sign in to comment.