Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

"Hacking websites with CDP" is now on YouTube #3380

Open
mdmintz opened this issue Jan 1, 2025 · 10 comments
Open

"Hacking websites with CDP" is now on YouTube #3380

mdmintz opened this issue Jan 1, 2025 · 10 comments
Assignees
Labels
News / Announcements News Tutorials & Learning Tutorial videos and blog posts UC Mode / CDP Mode Undetected Chromedriver Mode / CDP Mode

Comments

@mdmintz
Copy link
Member

mdmintz commented Jan 1, 2025

"Hacking websites with CDP" is now on YouTube:

https://www.youtube.com/watch?v=vt2zsdiNh3U

@mdmintz mdmintz added News / Announcements News Tutorials & Learning Tutorial videos and blog posts UC Mode / CDP Mode Undetected Chromedriver Mode / CDP Mode labels Jan 1, 2025
@mdmintz mdmintz self-assigned this Jan 1, 2025
@mdmintz mdmintz pinned this issue Jan 4, 2025
@mdmintz mdmintz unpinned this issue Jan 4, 2025
@mdmintz
Copy link
Member Author

mdmintz commented Jan 6, 2025

The code for taking control of existing Chrome browsers via the remote-debugging-port is here:

#3354 (comment)

@boludoz
Copy link

boludoz commented Feb 6, 2025

The code for taking control of existing Chrome browsers via the remote-debugging-port is here:

#3354 (comment)

So we don't need chromedriver anymore? Like nodriver or Selenium driverless?

@mdmintz
Copy link
Member Author

mdmintz commented Feb 6, 2025

With pure CDP Mode, chromedriver isn’t necessary.

@mdmintz
Copy link
Member Author

mdmintz commented Feb 8, 2025

If stealth is important, you may have to use one of the CDP formats here: #3354 (comment)

Otherwise, you can try passing in the remote-debugging-port via chromium_arg, but that might not get you the results you’re looking for.

@dongdestroyer

This comment has been minimized.

@guocity
Copy link

guocity commented Feb 27, 2025

The BaseCase class with SB() is synchronous. Is there any way that I can use the asynchronous driver from cdp_util instead of the BaseCase driver?
from seleniumbase.undetected.cdp_driver import cdp_util
driver = await cdp_util.start_async()

@guocity
Copy link

guocity commented Feb 27, 2025

How do I call sb.uc_gui_click_captcha() from async code?

@mdmintz
Copy link
Member Author

mdmintz commented Feb 27, 2025

The async code can only call direct CDP methods.
(See https://chromedevtools.github.io/devtools-protocol/ for those methods)

@dima23113
Copy link

The async code can only call direct CDP methods. (See https://chromedevtools.github.io/devtools-protocol/ for those methods)

Hi! Please help, I've been racking my brains trying to find a solution. Why doesn't CDP get the request body when it is 100% there? If I uncomment the line xhr_responses = await receiveXHR(tab, xhr_requests.get(browser_index, []), browser_index) in the while loop, then everything works, but only if I remove everything that follows it in the while loop.

`
# CDP.network.ResponseReceived with CDP.network.ResourceType.XHR.
import asyncio
import json
import random

import mycdp
import time
from seleniumbase import cdp_driver

import config
from utils import distribute_scrolls, is_have_products, get_xhr_products, scroll_down, need_click_load_more

# Products accumulated by crawl(); each crawl() call extends this list when it finishes.
products = []

# Maps a browser index to the time.time() stamp of the most recent matching
# XHR response recorded by the handler that listenXHR() installs.
last_xhr_request = {}

# Maps a browser index to a list of [response_url, request_id] pairs recorded
# by the handler that listenXHR() installs; receiveXHR() resets it to [].
xhr_requests = {}


def listenXHR(page, index_):
    """Install a ResponseReceived handler on ``page`` that records catalog XHRs.

    For every XHR response whose URL contains ``api/catalog/products`` the
    handler stores a ``[url, request_id]`` pair in ``xhr_requests[index_]``
    and writes the current ``time.time()`` to ``last_xhr_request[index_]``.

    Args:
        page: Object exposing ``add_handler(event_type, callback)``.
        index_: Key under which the captured requests are grouped.
    """

    async def handler(evt: mycdp.network.ResponseReceived):
        # Ignore everything that is not an XHR to the product-catalog endpoint.
        if evt.type_ is not mycdp.network.ResourceType.XHR:
            return
        if 'api/catalog/products' not in evt.response.url:
            return

        record = [evt.response.url, evt.request_id]
        pending = xhr_requests.get(index_)
        if pending:
            pending.append(record)
        else:
            # Missing key or empty list: start a fresh list for this index.
            xhr_requests[index_] = [record]
        last_xhr_request[index_] = time.time()

    page.add_handler(mycdp.network.ResponseReceived, handler)


async def receiveXHR(page, requests, index_):
    """Fetch and JSON-decode the response bodies of previously captured XHRs.

    Before fetching, the function waits for the XHR traffic of ``index_`` to
    go quiet: while the last recorded XHR is at most 3 seconds old it sleeps
    2 seconds and checks again, giving up after ``max_retries`` is exceeded.

    Args:
        page: Tab-like object; it is awaited once and then used to send
            ``get_response_body`` commands.
        requests: Iterable of ``[url, request_id]`` pairs.
        index_: Key into ``last_xhr_request`` and ``xhr_requests``.

    Returns:
        A list of dicts with the keys ``"url"``, ``"body"`` (the JSON-decoded
        body) and ``"is_base64"``. Requests whose body could not be fetched
        or decoded are reported on stdout and left out of the list.
    """
    responses = []
    retries = 0
    max_retries = 5
    # Wait until the last XHR is more than 3 seconds old (polling every 2 seconds).
    while True:
        # Use .get(): no XHR may have been recorded for this index yet, and a
        # plain subscript would raise KeyError instead of skipping the wait.
        last_seen = last_xhr_request.get(index_)
        if last_seen is None or retries > max_retries:
            break
        if time.time() - last_seen <= 3:
            retries = retries + 1
            await asyncio.sleep(2)
            continue
        else:
            break
    await page
    # Loop through gathered requests and get response body
    for request in requests:
        try:
            res = await page.send(mycdp.network.get_response_body(request[1]))
            if res is None:
                continue
            # NOTE(review): res[0] is treated as the body text and res[1] as
            # the base64 flag; a base64-encoded body would presumably fail in
            # json.loads and be reported by the except below - confirm.
            responses.append({
                "url": request[0],
                "body": json.loads(res[0]),
                "is_base64": res[1],
            })
        except Exception as e:
            # Best effort: one failing request must not abort the whole batch.
            print("Error getting response:", e)
    if responses:
        # At least one body was retrieved: forget the recorded requests.
        xhr_requests[index_] = []
    return responses


async def request_paused_handler(event, tab):
    """Continue or fail a paused request depending on its URL.

    Requests whose URL contains one of the blocked markers (image and video
    extensions, the ``pcdn.goldapple.ru`` host, the APM events endpoint) are
    failed with ``TIMED_OUT``; every other request is continued unchanged.

    Args:
        event: Paused-request event exposing ``request.url`` and ``request_id``.
        tab: Object exposing ``feed_cdp(command)``.
    """
    url = event.request.url
    blocked_markers = (
        ".png",
        ".jpg",
        ".gif",
        ".webp",
        "pcdn.goldapple.ru",
        "/front/api/apm/events",
        '.mp4',
    )

    if any(marker in url for marker in blocked_markers):
        # Block the data (images and other unwanted resources)
        reason = mycdp.network.ErrorReason.TIMED_OUT
        tab.feed_cdp(mycdp.fetch.fail_request(event.request_id, reason))
        return

    # Let the data through
    tab.feed_cdp(mycdp.fetch.continue_request(request_id=event.request_id))


async def check_hxr(index_, tab):
    """Poll forever for captured XHR responses of ``index_`` and print progress.

    Each iteration fetches the bodies of the recorded requests through
    ``receiveXHR``, parses them with ``get_xhr_products``, keeps any parsed
    products in a local list and sleeps one second. The loop has no exit
    condition, so the coroutine only ends when it is cancelled or raises.

    Args:
        index_: Key into ``xhr_requests``.
        tab: Tab object forwarded to ``receiveXHR``.
    """
    print("Starting check_hxr for index:", index_)  # Debug print
    collected = []
    while True:
        print("Waiting for XHR responses...")  # Debug print
        pending = xhr_requests.get(index_, [])
        responses = await receiveXHR(tab, pending, index_)
        print("Received XHR responses:", responses)  # Debug print

        parsed = await get_xhr_products(responses)
        if parsed:
            collected.extend(parsed)

        print("request: ", xhr_requests.get(index_, []))  # Original print
        await asyncio.sleep(1)


async def crawl(browser_index, link, scrols, max_pages=None, start_pages=0):
    """Open ``link`` in a new browser, scroll through it and collect products.

    A fresh driver is started, the XHR listener and the request filter are
    attached to its tab, and the listing page is loaded. The page is then
    scrolled (clicking "load more" where needed) for at most 5 rounds or
    until ``is_have_products`` reports no products. Finally the captured XHR
    bodies are fetched and parsed, and the parsed products are appended to
    the module-level ``products`` list.

    Args:
        browser_index: Key used for this browser in the XHR bookkeeping dicts.
        link: Base URL of the listing page (without the query string).
        scrols: Currently unused.
        max_pages: Currently unused; the loop below is capped at 5 rounds.
        start_pages: Value of the ``p`` query parameter for the first load.
    """
    products_parsed = []
    page_count = 0
    driver = await cdp_driver.start_async()
    tab = await driver.get("about:blank")
    # Handlers are attached on a blank page so they are in place before the
    # real page is requested.
    listenXHR(tab, browser_index)
    tab.add_handler(mycdp.fetch.RequestPaused, request_paused_handler)
    url = f"{link}?p={start_pages}&storestocks=1"
    await tab.get(url)
    await asyncio.sleep(5)
    while True:
        if not await is_have_products(tab):
            break
        await scroll_down(tab)
        await need_click_load_more(tab)
        print(f"Браузер {browser_index} - Страница {page_count}; Всего продуктов: {len(products_parsed)}")
        page_count += 1
        # NOTE(review): hard-coded cap; max_pages is not consulted - confirm intent.
        if page_count == 5:
            break
    xhr_responses = await receiveXHR(tab, xhr_requests.get(browser_index, []), browser_index)
    data = await get_xhr_products(xhr_responses)
    print(data)
    # Keep the parsed products; previously they were only printed, so the
    # summary below always reported 0 and nothing reached ``products``.
    if data:
        products_parsed.extend(data)
    print(f"Браузер {browser_index} собрал {len(products_parsed)} продуктов на странице {url}")
    products.extend(products_parsed)


async def main():
    """Visit every category in ``config.links`` and crawl it.

    For each link the category page is opened, the product count is read
    from the ``data-category-products-count`` attribute and converted to a
    page count (count / 24, truncated). ``distribute_scrolls`` splits those
    pages across one browser, and one ``crawl`` task is run per browser.
    """
    driver = await cdp_driver.start_async()
    for link in config.links:
        # NOTE(review): this URL is built with a trailing "/" on the host while
        # the crawl URL below is not - confirm which form config.links expects.
        tab = await driver.get("https://goldapple.ru/" + link)
        await tab.wait_for("span[data-category-products-count]")
        counter = await tab.select("span[data-category-products-count]")
        raw_count = counter.attrs.get("data-category-products-count")
        product_pages = int(int(raw_count) / 24)
        scrolls_per_browser = distribute_scrolls(product_pages, 1)

        # Start page of each browser = running total of the scrolls before it.
        start_pages = []
        offset = 0
        for step in scrolls_per_browser:
            start_pages.append(offset)
            offset += step

        tasks = [
            crawl(i, "https://goldapple.ru" + link, scrolls_per_browser[i], 100, start_pages[i])
            for i in range(1)
        ]
        await asyncio.gather(*tasks)


# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
`

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
News / Announcements News Tutorials & Learning Tutorial videos and blog posts UC Mode / CDP Mode Undetected Chromedriver Mode / CDP Mode
Projects
None yet
Development

No branches or pull requests

5 participants