4 | 4 | # We will use ColPali as the model to extract patch vectors from images of pdf pages.
5 | 5 | # At query time, we use MaxSim to retrieve and/or (based on the configuration) rank the page results.
6 | 6 | #
7 |   | -# To see the application in action, visit TODO:
  | 7 | +# To see the application in action, visit https://huggingface.co/spaces/vespa-engine/colpali-vespa-visual-retrieval
8 | 8 | #
9 | 9 | # The web application is written in FastHTML, meaning the complete application is written in python.
10 | 10 | #
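
The MaxSim (late-interaction) scoring mentioned above can be sketched in a few lines. This is an illustrative example rather than code from the notebook, and it assumes the ColPali query-token embeddings and page-patch embeddings are already available as torch tensors:

    import torch

    def max_sim(query_embeddings: torch.Tensor, page_embeddings: torch.Tensor) -> float:
        # query_embeddings: (num_query_tokens, dim), page_embeddings: (num_patches, dim)
        # For each query token, keep its best-matching page patch, then sum over query tokens.
        similarities = query_embeddings @ page_embeddings.T  # (num_query_tokens, num_patches)
        return similarities.max(dim=1).values.sum().item()

Pages are ordered by this score, either directly at retrieval time or in a re-ranking phase, depending on the configuration.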
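
FastHTML itself does not appear in this diff, but for orientation, a minimal FastHTML application looks roughly like the following. This is a generic hello-world sketch, not part of the notebook or the deployed demo:

    from fasthtml.common import fast_app, serve, Titled, P

    app, rt = fast_app()

    @rt("/")
    def get():
        # The whole page is built from Python callables; no separate HTML templates are needed.
        return Titled("Visual retrieval demo", P("Hello from FastHTML"))

    serve()
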
169 | 169 | links = []
170 | 170 | url_to_year = {}
171 | 171 |
172 |     | -# Find all 'div's with id starting with 'year-'
173 |     | -for year_div in soup.find_all("div", id=lambda x: x and x.startswith("year-")):
174 |     | -    year_id = year_div.get("id", "")
    | 172 | +# Find all the years
    | 173 | +for year_section in soup.find_all("section", attrs={"data-name": "report-year"}):
    | 174 | +    year_id = year_section.get("data-filter-section", "")
175 | 175 |     year = year_id.replace("year-", "")
176 |     | -
177 |     | -    # Within this div, find all 'a' elements with the specific classes
178 |     | -    for a_tag in year_div.select("a.button.button--download-secondary[href]"):
179 |     | -        href = a_tag["href"]
180 |     | -        full_url = urljoin(url, href)
181 |     | -        # exclude non-pdf links
182 |     | -        if full_url.endswith(".pdf"):
183 |     | -            links.append(full_url)
184 |     | -            url_to_year[full_url] = year
185 |     | -links, url_to_year
    | 176 | +
    | 177 | +    # Find the yearly report section
    | 178 | +    report_section = year_section.find("div", attrs={
    | 179 | +        "data-filter-section": "year",
    | 180 | +        "data-name": "report-type"
    | 181 | +    })
    | 182 | +    if not report_section:
    | 183 | +        continue
    | 184 | +
    | 185 | +    # Get the first link
    | 186 | +    report_link = report_section.select_one("ul.link-list a")
    | 187 | +    if not report_link:
    | 188 | +        continue
    | 189 | +
    | 190 | +    report_url = urljoin(url, report_link["href"])
    | 191 | +
    | 192 | +    # Visit the report page to find PDF download links
    | 193 | +    try:
    | 194 | +        report_response = requests.get(report_url)
    | 195 | +        report_response.raise_for_status()
    | 196 | +        report_soup = BeautifulSoup(report_response.text, "html.parser")
    | 197 | +
    | 198 | +        # Find only the first PDF download link with the specific class
    | 199 | +        pdf_link = report_soup.select_one("a.btn.btn-secondary[data-right-icon='download']")
    | 200 | +        if not pdf_link or not pdf_link["href"].endswith(".pdf"):
    | 201 | +            continue
    | 202 | +
    | 203 | +        pdf_url = urljoin(report_url, pdf_link["href"])
    | 204 | +        links.append(pdf_url)
    | 205 | +        url_to_year[pdf_url] = year
    | 206 | +        print(f"Found PDF: {pdf_url} (Year: {year})")
    | 207 | +    except Exception as e:
    | 208 | +        print(f"Error fetching report page {report_url}: {e}")
    | 209 | +
    | 210 | +print(f"Found {len(links)} PDF links")
186 | 211 | # -
187 | 212 |
188 | 213 | # Limit the number of PDFs to download

@@ -471,6 +496,7 @@ def generate_queries(image, prompt_text, pydantic_model):
471 | 496 |
472 | 497 | # +
473 | 498 | # write title, url, page_no, text, queries, not image to JSON
    | 499 | +os.makedirs("output", exist_ok=True)
474 | 500 | with open("output/pdf_pages.json", "w") as f:
475 | 501 |     to_write = [{k: v for k, v in pdf.items() if k != "image"} for pdf in pdf_pages]
476 | 502 |     json.dump(to_write, f, indent=2)

@@ -590,7 +616,6 @@ def float_to_binary_embedding(float_query_embedding: dict) -> dict:
590 | 616 |
591 | 617 |
592 | 618 | # Save vespa_feed to vespa_feed.json
593 |     | -os.makedirs("output", exist_ok=True)
594 | 619 | with open("output/vespa_feed.json", "w") as f:
595 | 620 |     vespa_feed_to_save = []
596 | 621 |     for page in vespa_feed:
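
The body of float_to_binary_embedding lies outside this hunk; only its signature is visible above. As an assumption about what such a helper typically does in a ColPali/Vespa setup (binarize each per-token float vector by sign and pack the bits into int8 values), a sketch could look like this:

    import numpy as np

    def float_to_binary_embedding(float_query_embedding: dict) -> dict:
        # Assumed sketch: map each token/patch index to a sign-binarized, bit-packed int8 vector.
        binary_query_embeddings = {}
        for key, vector in float_query_embedding.items():
            # 1 where the float is positive, 0 otherwise, then pack 8 bits into each int8 value
            packed = np.packbits(np.where(np.array(vector) > 0, 1, 0)).astype(np.int8).tolist()
            binary_query_embeddings[key] = packed
        return binary_query_embeddings

Packing the embeddings this way shrinks their footprint considerably compared to storing the raw floats.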