Skip to content

Commit 1d1bfd5

Browse files
shneeba and Tobias authored
get_docid_filename_map (#4)
Co-authored-by: Tobias <[email protected]>
1 parent 9333e06 commit 1d1bfd5

File tree

1 file changed

+51
-0
lines changed

1 file changed

+51
-0
lines changed

llmware/retrieval.py

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1760,3 +1760,54 @@ def filter_by_key_value_range(self, key, value_range, results_only=True):
17601760

17611761
return result_dict
17621762

1763+
def get_docid_filename_map(self):
    """
    Return one {"doc_ID", "file_source"} record per distinct doc_ID in this library.

    All entries of the library collection are pulled through a
    CollectionRetrieval. For each doc_ID, the file_source of the first entry
    carrying that doc_ID is kept; later entries with the same doc_ID are
    ignored. Entries whose "doc_ID" or "file_source" is absent or None are
    skipped. This is usually enough to uniquely match a doc_ID to a
    particular file.

    The underlying retrieval object is always closed, even if pulling or
    processing the entries raises.

    Returns
    -------
    list of dict
        Each dict has the form:
            {
                "doc_ID": <int or str>,
                "file_source": <str>
            }
        where <int or str> is the document ID, and <str> is the associated
        filename or path. Records appear in order of first occurrence of
        each doc_ID among the pulled entries.
    """
    collection_retrieval = CollectionRetrieval(self.library_name, account_name=self.account_name)

    # try/finally guarantees the database cursor/connection is released on
    # every exit path, not only when the whole scan succeeds.
    try:
        # Retrieve the cursor over the entire collection and materialize it.
        all_entries = collection_retrieval.get_whole_collection().pull_all()

        # doc_ID -> file_source; dict insertion order preserves first-seen order.
        doc_map = {}

        for entry in all_entries:
            doc_id = entry.get("doc_ID")
            file_source = entry.get("file_source")

            # Incomplete entries cannot contribute a mapping.
            if doc_id is None or file_source is None:
                continue

            # setdefault only stores the value the first time a doc_ID is seen.
            doc_map.setdefault(doc_id, file_source)
    finally:
        collection_retrieval.close()

    return [
        {"doc_ID": did, "file_source": fname}
        for did, fname in doc_map.items()
    ]

0 commit comments

Comments
 (0)