Skip to content

Commit

Permalink
Merge pull request #2 from shneeba/document_lookup-images
Browse files (browse the repository at this point in the history)
include images in document lookup
  • Loading branch information
shneeba authored Feb 1, 2025
2 parents 2563496 + a05aba7 commit b5ae53f
Showing 1 changed file with 21 additions and 12 deletions.
33 changes: 21 additions & 12 deletions llmware/retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1320,37 +1320,46 @@ def aggregate_text(self, qr_list):

return text_agg, meta_agg

def document_lookup(self, doc_id="", file_source="", include_images=False):
    """
    Takes as an input either a doc_id or file_source (e.g., filename) that is in a Library, and
    returns all of the text and table blocks in the document. Images can be optionally included.

    Parameters:
        doc_id (str): Document ID. Takes precedence over file_source when both are given.
        file_source (str): Source file name. Only used when doc_id is empty.
        include_images (bool): Whether to include image blocks in the result. Defaults to False.

    Returns:
        list: The document's blocks sorted in ascending order of "block_ID". Each returned
        block has a "matches" key set to an empty list and a "page_num" key copied from its
        "master_index" value. An empty list is returned when nothing is found.

    Raises:
        RuntimeError: If neither doc_id nor file_source is provided.
    """

    # doc_id wins over file_source if the caller passes both
    if doc_id:
        kv_dict = {"doc_ID": doc_id}
    elif file_source:
        kv_dict = {"file_source": file_source}
    else:
        raise RuntimeError(
            "Query document_lookup method requires as input either a document ID or "
            "the name of a file already parsed in the library"
        )

    output = CollectionRetrieval(self.library_name, account_name=self.account_name).filter_by_key_dict(kv_dict)

    if len(output) == 0:
        logger.warning(f"update: Query - document_lookup - nothing found - {doc_id} - {file_source}")
        return []

    output_final = []

    for entries in output:
        # Images are skipped by default to avoid potential duplicate text
        if include_images or entries["content_type"] != "image":
            # NOTE: the retrieved block is updated in place before being collected
            entries.update({"matches": [], "page_num": entries["master_index"]})
            output_final.append(entries)

    output_final = sorted(output_final, key=lambda x: x["block_ID"], reverse=False)

    return output_final

Expand Down

0 comments on commit b5ae53f

Please sign in to comment.