
Commit 543f73a

Merge pull request #1672 from albertdbio/fix/update-colpali-demo-pdf-scraping
Update ColPali demo pdf scraping
2 parents 741ca35 + 38fcb12

3 files changed: +68 -875 lines

visual-retrieval-colpali/prepare_feed_deploy.py (+40 -15)
@@ -4,7 +4,7 @@
 # We will use ColPali as the model to extract patch vectors from images of pdf pages.
 # At query time, we use MaxSim to retrieve and/or (based on the configuration) rank the page results.
 #
-# To see the application in action, visit TODO:
+# To see the application in action, visit https://huggingface.co/spaces/vespa-engine/colpali-vespa-visual-retrieval
 #
 # The web application is written in FastHTML, meaning the complete application is written in python.
 #
@@ -169,20 +169,45 @@
 links = []
 url_to_year = {}
 
-# Find all 'div's with id starting with 'year-'
-for year_div in soup.find_all("div", id=lambda x: x and x.startswith("year-")):
-    year_id = year_div.get("id", "")
+# Find all the years
+for year_section in soup.find_all("section", attrs={"data-name": "report-year"}):
+    year_id = year_section.get("data-filter-section", "")
     year = year_id.replace("year-", "")
-
-    # Within this div, find all 'a' elements with the specific classes
-    for a_tag in year_div.select("a.button.button--download-secondary[href]"):
-        href = a_tag["href"]
-        full_url = urljoin(url, href)
-        # exclude non-pdf links
-        if full_url.endswith(".pdf"):
-            links.append(full_url)
-            url_to_year[full_url] = year
-links, url_to_year
+
+    # Find the yearly report section
+    report_section = year_section.find("div", attrs={
+        "data-filter-section": "year",
+        "data-name": "report-type"
+    })
+    if not report_section:
+        continue
+
+    # Get the first link
+    report_link = report_section.select_one("ul.link-list a")
+    if not report_link:
+        continue
+
+    report_url = urljoin(url, report_link["href"])
+
+    # Visit the report page to find PDF download links
+    try:
+        report_response = requests.get(report_url)
+        report_response.raise_for_status()
+        report_soup = BeautifulSoup(report_response.text, "html.parser")
+
+        # Find only the first PDF download link with the specific class
+        pdf_link = report_soup.select_one("a.btn.btn-secondary[data-right-icon='download']")
+        if not pdf_link or not pdf_link["href"].endswith(".pdf"):
+            continue
+
+        pdf_url = urljoin(report_url, pdf_link["href"])
+        links.append(pdf_url)
+        url_to_year[pdf_url] = year
+        print(f"Found PDF: {pdf_url} (Year: {year})")
+    except Exception as e:
+        print(f"Error fetching report page {report_url}: {e}")
+
+print(f"Found {len(links)} PDF links")
 # -
 
 # Limit the number of PDFs to download
@@ -471,6 +496,7 @@ def generate_queries(image, prompt_text, pydantic_model):
 
 # +
 # write title, url, page_no, text, queries, not image to JSON
+os.makedirs("output", exist_ok=True)
 with open("output/pdf_pages.json", "w") as f:
     to_write = [{k: v for k, v in pdf.items() if k != "image"} for pdf in pdf_pages]
     json.dump(to_write, f, indent=2)
@@ -590,7 +616,6 @@ def float_to_binary_embedding(float_query_embedding: dict) -> dict:
 
 
 # Save vespa_feed to vespa_feed.json
-os.makedirs("output", exist_ok=True)
 with open("output/vespa_feed.json", "w") as f:
     vespa_feed_to_save = []
     for page in vespa_feed:
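
A note on the scraping hunk above: the old code pulled PDF links directly off the index page, while the new code walks two levels (year section, then the linked report page, then its PDF download button). Below is a minimal standalone sketch of that flow, under the same assumptions the diff makes about the page markup; the index URL is a placeholder, since the real one is defined earlier in prepare_feed_deploy.py.

# Minimal sketch of the two-step scraping flow introduced by this commit.
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = "https://example.com/investor/annual-reports"  # placeholder index page
soup = BeautifulSoup(requests.get(url).text, "html.parser")

links, url_to_year = [], {}
for year_section in soup.find_all("section", attrs={"data-name": "report-year"}):
    year = year_section.get("data-filter-section", "").replace("year-", "")

    # Step 1: each year section links to a per-report page, not to the PDF itself.
    report_section = year_section.find(
        "div", attrs={"data-filter-section": "year", "data-name": "report-type"}
    )
    report_link = report_section and report_section.select_one("ul.link-list a")
    if not report_link:
        continue
    report_url = urljoin(url, report_link["href"])

    # Step 2: the report page carries the actual PDF download button.
    report_soup = BeautifulSoup(requests.get(report_url).text, "html.parser")
    pdf_link = report_soup.select_one("a.btn.btn-secondary[data-right-icon='download']")
    if pdf_link and pdf_link["href"].endswith(".pdf"):
        pdf_url = urljoin(report_url, pdf_link["href"])
        links.append(pdf_url)
        url_to_year[pdf_url] = year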

visual-retrieval-colpali/pyproject.toml (+5 -9)
@@ -3,7 +3,7 @@ name = "visual-retrieval-colpali"
 version = "0.1.0"
 description = "Visual retrieval with ColPali"
 readme = "README.md"
-requires-python = ">=3.10, <3.13"
+requires-python = ">=3.10, <3.12"
 license = { text = "Apache-2.0" }
 dependencies = [
     "python-fasthtml",
@@ -20,23 +20,19 @@ dependencies = [
     "shad4fast>=1.2.1",
     "google-generativeai>=0.7.2",
     "spacy",
-    "pip"
+    "pip",
 ]
 
 # dev-dependencies
 [project.optional-dependencies]
-dev = [
-    "ruff",
-    "python-dotenv",
-    "huggingface_hub[cli]"
-]
+dev = ["ruff", "python-dotenv", "huggingface_hub[cli]"]
 feed = [
     "ipykernel",
     "jupytext",
     "pydantic",
     "beautifulsoup4",
     "pdf2image",
-    "google-generativeai"
+    "google-generativeai",
 ]
 [tool.ruff]
 # Exclude a variety of commonly ignored directories.
@@ -115,4 +111,4 @@ docstring-code-format = false
 #
 # This only has an effect when the `docstring-code-format` setting is
 # enabled.
-docstring-code-line-length = "dynamic"
+docstring-code-line-length = "dynamic"
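
On the pyproject.toml side, the substantive change is the tightened interpreter bound (requires-python now excludes 3.12). A hypothetical guard like the one below, which is not part of the repo, makes that constraint explicit when running the feed script outside a resolver-managed environment:

# Hypothetical runtime guard (an assumption, not in the repo): fail fast if the
# interpreter falls outside the pyproject.toml range ">=3.10, <3.12".
import sys

if not ((3, 10) <= sys.version_info[:2] < (3, 12)):
    raise SystemExit(
        f"Python {'.'.join(map(str, sys.version_info[:3]))} is unsupported; "
        "this project requires >=3.10, <3.12"
    )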
