feat: add utility to load and parse Sitemap and SitemapRequestLoader #1169

Open: wants to merge 17 commits into master
src/crawlee/_utils/robots.py (15 additions & 2 deletions)
```diff
@@ -5,6 +5,7 @@
 from protego import Protego
 from yarl import URL
 
+from crawlee._utils.sitemap import Sitemap
 from crawlee._utils.web import is_status_code_client_error
 
 if TYPE_CHECKING:
@@ -15,9 +16,10 @@
 
 
 class RobotsTxtFile:
-    def __init__(self, url: str, robots: Protego) -> None:
+    def __init__(self, url: str, robots: Protego, proxy_info: ProxyInfo | None = None) -> None:
         self._robots = robots
         self._original_url = URL(url).origin()
+        self._proxy_info = proxy_info
 
     @classmethod
     async def from_content(cls, url: str, content: str) -> Self:
@@ -56,7 +58,7 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N
 
         robots = Protego.parse(body.decode('utf-8'))
 
-        return cls(url, robots)
+        return cls(url, robots, proxy_info=proxy_info)
 
     def is_allowed(self, url: str, user_agent: str = '*') -> bool:
         """Check if the given URL is allowed for the given user agent.
@@ -83,3 +85,14 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None:
         """
         crawl_delay = self._robots.crawl_delay(user_agent)
         return int(crawl_delay) if crawl_delay is not None else None
+
+    async def parse_sitemaps(self) -> Sitemap:
+        """Parse the sitemaps from the robots.txt file and return a `Sitemap` instance."""
+        sitemaps = self.get_sitemaps()
+        proxy_url = self._proxy_info.url if self._proxy_info else None
+        return await Sitemap.load(sitemaps, proxy_url)
+
+    async def parse_urls_from_sitemaps(self) -> list[str]:
+        """Parse the sitemaps in the robots.txt file and return a list of URLs."""
+        sitemap = await self.parse_sitemaps()
+        return sitemap.urls
```
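
For context, a minimal sketch of how the new methods might be used end to end. The `RobotsTxtFile.load` signature comes from the diff above; the use of `HttpxHttpClient` and the example URL are assumptions, not part of this PR.

```python
import asyncio

from crawlee._utils.robots import RobotsTxtFile
from crawlee.http_clients import HttpxHttpClient  # assumed client choice


async def main() -> None:
    # Fetch and parse a robots.txt file (placeholder URL).
    robots = await RobotsTxtFile.load('https://example.com/robots.txt', HttpxHttpClient())

    # New in this PR: load every sitemap referenced by robots.txt and
    # collect the URLs they list.
    urls = await robots.parse_urls_from_sitemaps()
    print(f'Found {len(urls)} URLs in the sitemaps')


asyncio.run(main())
```

Storing `proxy_info` on the instance, as `load` now does, means the follow-up sitemap requests in `parse_sitemaps` go through the same proxy as the original robots.txt fetch.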