@@ -1760,3 +1760,54 @@ def filter_by_key_value_range(self, key, value_range, results_only=True):
1760
1760
1761
1761
return result_dict
1762
1762
1763
def get_docid_filename_map(self):
    """
    Return one {"doc_ID", "file_source"} dict per distinct doc_ID in this library.

    The whole collection is pulled through a CollectionRetrieval built from
    ``self.library_name`` and ``self.account_name``. For each doc_ID, the
    file_source of the first entry seen (in the order returned by
    ``pull_all()``) is kept; later entries with the same doc_ID are ignored.
    Entries whose "doc_ID" or "file_source" is missing or None are skipped.

    Returns
    -------
    list of dict
        Each dict has the form:
            {
                "doc_ID": <document ID as stored in the entry>,
                "file_source": <file_source value stored in the entry>
            }
        in order of first appearance of each doc_ID.
    """
    collection_retrieval = CollectionRetrieval(
        self.library_name, account_name=self.account_name
    )

    # The retrieval object is closed in `finally` so that it is released
    # even if fetching or iterating the collection raises.
    try:
        all_entries = collection_retrieval.get_whole_collection().pull_all()

        # doc_ID -> file_source; dict insertion order preserves the order
        # in which each doc_ID was first encountered.
        doc_map = {}
        for entry in all_entries:
            doc_id = entry.get("doc_ID")
            file_source = entry.get("file_source")

            # Skip entries that cannot contribute a complete pair.
            if doc_id is None or file_source is None:
                continue

            # setdefault keeps the first file_source recorded for a doc_ID.
            doc_map.setdefault(doc_id, file_source)

        return [
            {"doc_ID": did, "file_source": fname}
            for did, fname in doc_map.items()
        ]
    finally:
        # Per the original comment, this closes the underlying database
        # cursor/connection.
        collection_retrieval.close()
0 commit comments