From 0e32c2db157da750a7268af939a93cc41584b813 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Fri, 20 Dec 2024 17:25:45 +0100 Subject: [PATCH 01/24] adding functions to locate elements based on their tagname, attribute, and value --- zendriver/core/tab.py | 172 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 83acc07d..1b484d20 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -221,6 +221,69 @@ async def find( ) await self.sleep(0.5) return item + + async def locate( + self, + tagname: str, + attribute: str, + value: str, + timeout: Union[int, float] = 10, + ): + """ + locate a single element by tagname, attribute, and value; this way it is guaranteed to get your targeted element every single time. + can also be used to wait for such element to appear. + + :param tagname: tagname of the element to search for + :type tagname: str + :param attribute: the attribute we will be filtering the element by + :type attribute: str + :param value: the value we will be checking the attribute to narrow our list of elements as much as possible + :type value: str + :param best_match: :param best_match: when True (default), it will return the element which has the most + comparable string length. this could help tremendously, when for example + you search for "login", you'd probably want the login button element, + and not thousands of scripts,meta,headings containing a string of "login". + When False, it will return naively just the first match (but is way faster). + :type best_match: bool + :param return_enclosing_element: + since we deal with nodes instead of elements, the find function most often returns + so called text nodes, which is actually a element of plain text, which is + the somehow imaginary "child" of a "span", "p", "script" or any other elements which have text between their opening + and closing tags. + most often when we search by text, we actually aim for the element containing the text instead of + a lousy plain text node, so by default the containing element is returned. + + however, there are (why not) exceptions, for example elements that use the "placeholder=" property. + this text is rendered, but is not a pure text node. in that case you can set this flag to False. + since in this case we are probably interested in just that element, and not it's parent. + + + # todo, automatically determine node type + # ignore the return_enclosing_element flag if the found node is NOT a text node but a + # regular element (one having a tag) in which case that is exactly what we need. + :type return_enclosing_element: bool + :param timeout: raise timeout exception when after this many seconds nothing is found. + :type timeout: float,int + """ + loop = asyncio.get_running_loop() + start_time = loop.time() + + tagname, attribute, value = tagname.strip().upper(), attribute.strip(), value.strip() + + item = await self.locate_element_by_tagname_attribute_value( + tagname, attribute, value + ) + while not item: + await self.wait() + item = await self.locate_element_by_tagname_attribute_value( + tagname, attribute, value + ) + if loop.time() - start_time > timeout: + raise asyncio.TimeoutError( + f"time ran out while waiting for element: {tagname}[{attribute}={value}]" + ) + await self.sleep(0.5) + return item async def select( self, @@ -284,6 +347,39 @@ async def find_all( ) await self.sleep(0.5) return items + + async def locate_all( + self, + tagname: str, + attribute: str, + value: str, + timeout: Union[int, float] = 10, + ) -> List[Element]: + """ + find multiple elements by text + can also be used to wait for such element to appear. + + :param text: text to search for. note: script contents are also considered text + :type text: str + + :param timeout: raise timeout exception when after this many seconds nothing is found. + :type timeout: float,int + """ + loop = asyncio.get_running_loop() + now = loop.time() + + text = text.strip() + items = await self.find_elements_by_text(text) + + while not items: + await self.wait() + items = await self.find_elements_by_text(text) + if loop.time() - now > timeout: + raise asyncio.TimeoutError( + f"time ran out while waiting for elements: {tagname}[{attribute}={value}]" + ) + await self.sleep(0.5) + return items async def select_all( self, selector: str, timeout: Union[int, float] = 10, include_frames=False @@ -644,6 +740,82 @@ async def find_element_by_text( await self.send(cdp.dom.disable()) return None + + async def locate_element_by_tagname_attribute_value( + self, + tagname: str, + attribute: str, + value: str, + ) -> Element | None: + """ + finds and returns the first element containing , or best match + + :param tagname: + :type tagname: str + :param attribute: + :type attribute: str + :param value: + :type value: str + + :param return_enclosing_element: + :type return_enclosing_element: + :return: + :rtype: Element + """ + async def traverse(node, parent_tree): + """ + Recursive traversal of the DOM and shadow DOM to find out targeted element. + """ + if not node: + return None + + # check if the node matches the tag and attribute criteria + if ( + node.node_type == 1 # element node + and node.node_name.lower() == tagname.lower() + and node.attributes + and attribute in node.attributes + and node.attributes[attribute] == value + ): + return element.create(node, self, parent_tree) + + # traverse shadow roots if they exist + if node.shadow_roots: + for shadow_root in node.shadow_roots: + result = await traverse(shadow_root, parent_tree) + if result: + return result + + # traverse child nodes + if node.children: + for child in node.children: + result = await traverse(child, parent_tree) + if result: + return result + + return None + + # fetch the document root + doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) + + # start traversing the DOM tree + result = await traverse(doc, doc) + if result: + return result + + # search within iframes + iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + for iframe in iframes: + iframe_elem = element.create(iframe, self, iframe.content_document) + if not iframe_elem: + continue + + iframe_doc = iframe.content_document + result = await traverse(iframe_doc, iframe_doc) + if result: + return result + + return None async def back(self): """ From 1547b3d6546a1f3a93837760b7f033800c75f540 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Fri, 20 Dec 2024 19:46:08 +0100 Subject: [PATCH 02/24] making the functions more efficient and adding functionality to locate and return multiple elements --- zendriver/core/tab.py | 377 +++++++++++++++++++++++++----------------- 1 file changed, 223 insertions(+), 154 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 1b484d20..95577958 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -162,6 +162,47 @@ async def open_external_inspector(self): import webbrowser webbrowser.open(self.inspector_url) + + async def locate( + self, + tagname: str, + attribute: str, + value: str, + timeout: Union[int, float] = 10, + ) -> Element: + """ + locate a single element by tagname, attribute, and value + can also be used to wait for such element to appear. + + :param tagname: tagname of the element to search for + :type tagname: str + :param attribute: the attribute we will be filtering the element by + :type attribute: str + :param value: the value we will be checking the attribute to narrow our list of elements as much as possible + :type value: str + + :param timeout: raise timeout exception when after this many seconds nothing is found. + :type timeout: float,int + """ + loop = asyncio.get_running_loop() + start_time = loop.time() + + tagname, attribute, value = tagname.strip().upper(), attribute.strip(), value.strip() + + item = await self.locate_element_by_tagname_attribute_value( + tagname, attribute, value + ) + while not item: + await self.wait() + item = await self.locate_element_by_tagname_attribute_value( + tagname, attribute, value + ) + if loop.time() - start_time > timeout: + raise asyncio.TimeoutError( + f"time ran out while waiting for element: {tagname}[{attribute}=\"{value}\"]" + ) + await self.sleep(0.5) + return item async def find( self, @@ -221,69 +262,6 @@ async def find( ) await self.sleep(0.5) return item - - async def locate( - self, - tagname: str, - attribute: str, - value: str, - timeout: Union[int, float] = 10, - ): - """ - locate a single element by tagname, attribute, and value; this way it is guaranteed to get your targeted element every single time. - can also be used to wait for such element to appear. - - :param tagname: tagname of the element to search for - :type tagname: str - :param attribute: the attribute we will be filtering the element by - :type attribute: str - :param value: the value we will be checking the attribute to narrow our list of elements as much as possible - :type value: str - :param best_match: :param best_match: when True (default), it will return the element which has the most - comparable string length. this could help tremendously, when for example - you search for "login", you'd probably want the login button element, - and not thousands of scripts,meta,headings containing a string of "login". - When False, it will return naively just the first match (but is way faster). - :type best_match: bool - :param return_enclosing_element: - since we deal with nodes instead of elements, the find function most often returns - so called text nodes, which is actually a element of plain text, which is - the somehow imaginary "child" of a "span", "p", "script" or any other elements which have text between their opening - and closing tags. - most often when we search by text, we actually aim for the element containing the text instead of - a lousy plain text node, so by default the containing element is returned. - - however, there are (why not) exceptions, for example elements that use the "placeholder=" property. - this text is rendered, but is not a pure text node. in that case you can set this flag to False. - since in this case we are probably interested in just that element, and not it's parent. - - - # todo, automatically determine node type - # ignore the return_enclosing_element flag if the found node is NOT a text node but a - # regular element (one having a tag) in which case that is exactly what we need. - :type return_enclosing_element: bool - :param timeout: raise timeout exception when after this many seconds nothing is found. - :type timeout: float,int - """ - loop = asyncio.get_running_loop() - start_time = loop.time() - - tagname, attribute, value = tagname.strip().upper(), attribute.strip(), value.strip() - - item = await self.locate_element_by_tagname_attribute_value( - tagname, attribute, value - ) - while not item: - await self.wait() - item = await self.locate_element_by_tagname_attribute_value( - tagname, attribute, value - ) - if loop.time() - start_time > timeout: - raise asyncio.TimeoutError( - f"time ran out while waiting for element: {tagname}[{attribute}={value}]" - ) - await self.sleep(0.5) - return item async def select( self, @@ -317,17 +295,23 @@ async def select( await self.sleep(0.5) return item - async def find_all( + async def locate_all( self, - text: str, + tagname: str, + attribute: str, + value: str, timeout: Union[int, float] = 10, ) -> List[Element]: """ - find multiple elements by text + locate multiple elements by tagname, attribute, and value can also be used to wait for such element to appear. - :param text: text to search for. note: script contents are also considered text - :type text: str + :param tagname: tagname of the element to search for + :type tagname: str + :param attribute: the attribute we will be filtering the element by + :type attribute: str + :param value: the value we will be checking the attribute to narrow our list of elements as much as possible + :type value: str :param timeout: raise timeout exception when after this many seconds nothing is found. :type timeout: float,int @@ -335,24 +319,31 @@ async def find_all( loop = asyncio.get_running_loop() now = loop.time() - text = text.strip() - items = await self.find_elements_by_text(text) + tagname, attribute, value = tagname.strip().upper(), attribute.strip(), value.strip() + + items = await self.locate_elements_by_tagname_attribute_value( + tagname = tagname, + attribute = attribute, + value = value + ) while not items: await self.wait() - items = await self.find_elements_by_text(text) + items = await self.locate_elements_by_tagname_attribute_value( + tagname = tagname, + attribute = attribute, + value = value + ) if loop.time() - now > timeout: raise asyncio.TimeoutError( - "time ran out while waiting for text: %s" % text + f"time ran out while waiting for elements: {tagname}[{attribute}=\"{value}\"]" ) await self.sleep(0.5) return items - - async def locate_all( + + async def find_all( self, - tagname: str, - attribute: str, - value: str, + text: str, timeout: Union[int, float] = 10, ) -> List[Element]: """ @@ -376,7 +367,7 @@ async def locate_all( items = await self.find_elements_by_text(text) if loop.time() - now > timeout: raise asyncio.TimeoutError( - f"time ran out while waiting for elements: {tagname}[{attribute}={value}]" + "time ran out while waiting for text: %s" % text ) await self.sleep(0.5) return items @@ -558,6 +549,160 @@ async def query_selector( return return element.create(node, self, doc) + async def locate_element_by_tagname_attribute_value( + self, + tagname: str, + attribute: str, + value: str, + ) -> Element | None: + """ + locates and returns the first element containing , or best match + + :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'..). + :type tagname: str + :param attribute: The attribute to match (e.g., 'id', 'name'..). + :type attribute: str + :param value: The value of the attribute to match. + :type value: str + + :return: A single element + :rtype: Element + """ + async def traverse(node, parent_tree): + """ + Recursive traversal of the DOM and shadow DOM to find out targeted element. + """ + if not node: + return None + + # check if the node matches the tag and attribute criteria + if ( + node.node_type == 1 # element node + and node.node_name.lower() == tagname.lower() + and node.attributes + and attribute in node.attributes + and any( + node.attributes[i] == attribute and node.attributes[i + 1] == value + for i in range(0, len(node.attributes), 2) + ) + ): + return element.create(node, self, parent_tree) + + tasks = list() + + # traverse shadow roots if they exist + if node.shadow_roots: + tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) + + # traverse child nodes + if node.children: + tasks.extend(traverse(child, parent_tree) for child in node.children) + + for task in asyncio.as_completed(tasks): + result = await task + if result: + return result + + return None + + # fetch the document root + doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) + + # start traversing the DOM tree + result = await traverse(doc, doc) + if result: + return result + + # search within iframes concurrently + iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + iframe_tasks = [ + traverse(iframe.content_document, iframe.content_document) + for iframe in iframes + if iframe.content_document + ] + + for iframe_task in asyncio.as_completed(iframe_tasks): + result = await iframe_task + if result: + return result + + return None + + async def locate_elements_by_tagname_attribute_value( + self, + tagname: str, + attribute: str, + value: str, + ) -> list[Element]: + """ + locates and returns all elements with the specified tagname, attribute, and value. + + :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'..). + :type tagname: str + :param attribute: The attribute to match (e.g., 'id', 'name'..). + :type attribute: str + :param value: The value of the attribute to match. + :type value: str + + :return: List of matching elements. + :rtype: list[Element] + """ + results = [] + + async def traverse(node, parent_tree): + """ + Recursive traversal of the DOM, including shadow DOM and iframes, to collect all matching elements. + """ + if not node: + return + + # Check if the node matches the tag and attribute criteria + if ( + node.node_type == 1 # element node + and node.node_name.lower() == tagname.lower() + and node.attributes + and attribute in node.attributes + and any( + node.attributes[i] == attribute and value in node.attributes[i + 1].split() # searches inside the attributes of the node and checks whether our targeted attribute contains our targeted value, this would also work if we have a Div element with the attribute Class equaling "Class1 Class2" and we're only targeting the value Class1 + for i in range(0, len(node.attributes), 2) + ) + ): + results.append(element.create(node, self, parent_tree)) + + # Use asyncio.gather to explore shadow roots and children concurrently + tasks = [] + + # Traverse shadow roots if present + if node.shadow_roots: + tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) + + # Traverse child nodes + if node.children: + tasks.extend(traverse(child, parent_tree) for child in node.children) + + # Process all tasks concurrently + if tasks: + await asyncio.gather(*tasks) + + # Fetch the document root + doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) + + # Start traversing the main document + await traverse(doc, doc) + + # Search within iframes concurrently + iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + iframe_tasks = [ + traverse(iframe.content_document, iframe.content_document) + for iframe in iframes + if iframe.content_document + ] + + if iframe_tasks: + await asyncio.gather(*iframe_tasks) + + return results + async def find_elements_by_text( self, text: str, @@ -740,82 +885,6 @@ async def find_element_by_text( await self.send(cdp.dom.disable()) return None - - async def locate_element_by_tagname_attribute_value( - self, - tagname: str, - attribute: str, - value: str, - ) -> Element | None: - """ - finds and returns the first element containing , or best match - - :param tagname: - :type tagname: str - :param attribute: - :type attribute: str - :param value: - :type value: str - - :param return_enclosing_element: - :type return_enclosing_element: - :return: - :rtype: Element - """ - async def traverse(node, parent_tree): - """ - Recursive traversal of the DOM and shadow DOM to find out targeted element. - """ - if not node: - return None - - # check if the node matches the tag and attribute criteria - if ( - node.node_type == 1 # element node - and node.node_name.lower() == tagname.lower() - and node.attributes - and attribute in node.attributes - and node.attributes[attribute] == value - ): - return element.create(node, self, parent_tree) - - # traverse shadow roots if they exist - if node.shadow_roots: - for shadow_root in node.shadow_roots: - result = await traverse(shadow_root, parent_tree) - if result: - return result - - # traverse child nodes - if node.children: - for child in node.children: - result = await traverse(child, parent_tree) - if result: - return result - - return None - - # fetch the document root - doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) - - # start traversing the DOM tree - result = await traverse(doc, doc) - if result: - return result - - # search within iframes - iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - for iframe in iframes: - iframe_elem = element.create(iframe, self, iframe.content_document) - if not iframe_elem: - continue - - iframe_doc = iframe.content_document - result = await traverse(iframe_doc, iframe_doc) - if result: - return result - - return None async def back(self): """ From df9bd3ff42266063817ed17a5e41b5af6ef20aef Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Fri, 20 Dec 2024 19:49:04 +0100 Subject: [PATCH 03/24] comments --- zendriver/core/tab.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 95577958..25a530e1 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -570,7 +570,7 @@ async def locate_element_by_tagname_attribute_value( """ async def traverse(node, parent_tree): """ - Recursive traversal of the DOM and shadow DOM to find out targeted element. + recursive traversal of the DOM and shadow DOM to find out targeted element. """ if not node: return None @@ -651,7 +651,7 @@ async def locate_elements_by_tagname_attribute_value( async def traverse(node, parent_tree): """ - Recursive traversal of the DOM, including shadow DOM and iframes, to collect all matching elements. + recursive traversal of the DOM, including shadow DOM and iframes, to collect all matching elements. """ if not node: return @@ -666,31 +666,30 @@ async def traverse(node, parent_tree): node.attributes[i] == attribute and value in node.attributes[i + 1].split() # searches inside the attributes of the node and checks whether our targeted attribute contains our targeted value, this would also work if we have a Div element with the attribute Class equaling "Class1 Class2" and we're only targeting the value Class1 for i in range(0, len(node.attributes), 2) ) - ): + ): # if we find a match element, we append it to our list of results results.append(element.create(node, self, parent_tree)) - # Use asyncio.gather to explore shadow roots and children concurrently - tasks = [] + tasks = list() - # Traverse shadow roots if present + # traverse shadow roots if present if node.shadow_roots: tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) - # Traverse child nodes + # traverse child nodes if node.children: tasks.extend(traverse(child, parent_tree) for child in node.children) - # Process all tasks concurrently + # process all tasks concurrently if tasks: await asyncio.gather(*tasks) - # Fetch the document root + # fetch the document root doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) - # Start traversing the main document + # start traversing the main document await traverse(doc, doc) - # Search within iframes concurrently + # search within iframes concurrently iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") iframe_tasks = [ traverse(iframe.content_document, iframe.content_document) From e7dff0ec85275200a468a9f7081aa16f4b0c40e4 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Wed, 22 Jan 2025 19:39:45 +0100 Subject: [PATCH 04/24] changing find function to search the webpage, including shadowroots and iframes, and adding the functionality to search either by text, or using a tagname and a dictionary of attributes --- zendriver/core/tab.py | 125 ++++++++++++++++++++++++++++-------------- 1 file changed, 84 insertions(+), 41 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 25a530e1..bb42714a 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -206,7 +206,9 @@ async def locate( async def find( self, - text: str, + text: Optional[str] = None, + tagname: Optional[str] = None, + attrs: Optional[dict[str, str]] = None, best_match: bool = True, return_enclosing_element=True, timeout: Union[int, float] = 10, @@ -242,26 +244,67 @@ async def find( :type return_enclosing_element: bool :param timeout: raise timeout exception when after this many seconds nothing is found. :type timeout: float,int - """ - loop = asyncio.get_running_loop() - start_time = loop.time() + """ - text = text.strip() + if(text and not tagname): + loop = asyncio.get_running_loop() + start_time = loop.time() + + text = text.strip() - item = await self.find_element_by_text( - text, best_match, return_enclosing_element - ) - while not item: - await self.wait() item = await self.find_element_by_text( text, best_match, return_enclosing_element ) - if loop.time() - start_time > timeout: - raise asyncio.TimeoutError( - "time ran out while waiting for text: %s" % text + while not item: + await self.wait() + item = await self.find_element_by_text( + text, best_match, return_enclosing_element ) - await self.sleep(0.5) - return item + if loop.time() - start_time > timeout: + raise asyncio.TimeoutError( + "Time ran out while waiting for text: %s" % text + ) + await self.sleep(0.5) + return item + elif(tagname): + # loop = asyncio.get_running_loop() + # start_time = loop.time() + + # tagname, attribute, value = tagname.strip().upper(), attribute.strip(), value.strip() + + # item = await self.locate_element_by_tagname_attribute_value( + # tagname, attribute, value + # ) + # while not item: + # await self.wait() + # item = await self.locate_element_by_tagname_attribute_value( + # tagname, attribute, value + # ) + # if loop.time() - start_time > timeout: + # raise asyncio.TimeoutError( + # f"time ran out while waiting for element: {tagname}[{attribute}=\"{value}\"]" + # ) + # await self.sleep(0.5) + # return item + loop = asyncio.get_running_loop() + start_time = loop.time() + + tagname = tagname.strip().upper() + attrs = {k.strip(): v.strip() for k, v in attrs.items()} + + item = await self.locate_element_by_tagname_attribute_value(tagname, attrs) + while not item: + await self.wait() + item = await self.locate_element_by_tagname_attribute_value(tagname, attrs) + if loop.time() - start_time > timeout: + raise asyncio.TimeoutError( + f"Time ran out while waiting for element: {tagname}, with attributes: {attrs}" + ) + await self.sleep(0.5) + return item + elif(not text and not tagname): + # raising an error in case neither text nor tagname values were provided + raise ValueError("You must provide either tagname or text to locate an element.") async def select( self, @@ -552,8 +595,7 @@ async def query_selector( async def locate_element_by_tagname_attribute_value( self, tagname: str, - attribute: str, - value: str, + attrs: dict[str, str], ) -> Element | None: """ locates and returns the first element containing , or best match @@ -580,10 +622,12 @@ async def traverse(node, parent_tree): node.node_type == 1 # element node and node.node_name.lower() == tagname.lower() and node.attributes - and attribute in node.attributes - and any( - node.attributes[i] == attribute and node.attributes[i + 1] == value - for i in range(0, len(node.attributes), 2) + and all( + any( + node.attributes[i] == attr and value in node.attributes[i + 1].split() + for i in range(0, len(node.attributes), 2) + ) + for attr, value in attrs.items() ) ): return element.create(node, self, parent_tree) @@ -628,46 +672,45 @@ async def traverse(node, parent_tree): return None - async def locate_elements_by_tagname_attribute_value( + async def locate_elements_by_tagname_attributes( self, tagname: str, - attribute: str, - value: str, + attrs: dict[str, str], ) -> list[Element]: """ - locates and returns all elements with the specified tagname, attribute, and value. + Locates and returns all elements with the specified tagname and matching attributes. - :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'..). + :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). :type tagname: str - :param attribute: The attribute to match (e.g., 'id', 'name'..). - :type attribute: str - :param value: The value of the attribute to match. - :type value: str + :param attrs: A dictionary of attributes and their corresponding values to match. + :type attrs: dict[str, str] :return: List of matching elements. :rtype: list[Element] """ - results = [] + elements = list() async def traverse(node, parent_tree): """ - recursive traversal of the DOM, including shadow DOM and iframes, to collect all matching elements. + Recursive traversal of the DOM, including shadow DOM and iframes, to collect all matching elements. """ if not node: return - # Check if the node matches the tag and attribute criteria + # Check if the node matches the tag and all attribute-value pairs in attrs if ( - node.node_type == 1 # element node + node.node_type == 1 # Element node and node.node_name.lower() == tagname.lower() and node.attributes - and attribute in node.attributes - and any( - node.attributes[i] == attribute and value in node.attributes[i + 1].split() # searches inside the attributes of the node and checks whether our targeted attribute contains our targeted value, this would also work if we have a Div element with the attribute Class equaling "Class1 Class2" and we're only targeting the value Class1 - for i in range(0, len(node.attributes), 2) + and all( + any( + node.attributes[i] == attr and value in node.attributes[i + 1].split() + for i in range(0, len(node.attributes), 2) + ) + for attr, value in attrs.items() ) - ): # if we find a match element, we append it to our list of results - results.append(element.create(node, self, parent_tree)) + ): # if we find a matching element, append it to our list of results + elements.append(element.create(node, self, parent_tree)) tasks = list() @@ -700,7 +743,7 @@ async def traverse(node, parent_tree): if iframe_tasks: await asyncio.gather(*iframe_tasks) - return results + return elements async def find_elements_by_text( self, From 190d9deb3285069e8315a29b3756964bef4b9ac2 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Wed, 22 Jan 2025 20:26:22 +0100 Subject: [PATCH 05/24] more code --- zendriver/core/tab.py | 350 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 323 insertions(+), 27 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index bb42714a..3fd22650 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -209,8 +209,8 @@ async def find( text: Optional[str] = None, tagname: Optional[str] = None, attrs: Optional[dict[str, str]] = None, - best_match: bool = True, - return_enclosing_element=True, + best_match: Optional[bool] = True, + return_enclosing_element = True, timeout: Union[int, float] = 10, ): """ @@ -266,45 +266,26 @@ async def find( ) await self.sleep(0.5) return item - elif(tagname): - # loop = asyncio.get_running_loop() - # start_time = loop.time() - - # tagname, attribute, value = tagname.strip().upper(), attribute.strip(), value.strip() - - # item = await self.locate_element_by_tagname_attribute_value( - # tagname, attribute, value - # ) - # while not item: - # await self.wait() - # item = await self.locate_element_by_tagname_attribute_value( - # tagname, attribute, value - # ) - # if loop.time() - start_time > timeout: - # raise asyncio.TimeoutError( - # f"time ran out while waiting for element: {tagname}[{attribute}=\"{value}\"]" - # ) - # await self.sleep(0.5) - # return item + elif(tagname or attrs): loop = asyncio.get_running_loop() start_time = loop.time() tagname = tagname.strip().upper() attrs = {k.strip(): v.strip() for k, v in attrs.items()} - item = await self.locate_element_by_tagname_attribute_value(tagname, attrs) + item = await self.find_element_by_tagname_attrs(tagname, attrs) while not item: await self.wait() - item = await self.locate_element_by_tagname_attribute_value(tagname, attrs) + item = await self.find_element_by_tagname_attrs(tagname, attrs) if loop.time() - start_time > timeout: raise asyncio.TimeoutError( f"Time ran out while waiting for element: {tagname}, with attributes: {attrs}" ) await self.sleep(0.5) return item - elif(not text and not tagname): + elif(not text and not tagname and not attrs): # raising an error in case neither text nor tagname values were provided - raise ValueError("You must provide either tagname or text to locate an element.") + raise ValueError("You must provide either tagname, attrs, or text to locate an element.") async def select( self, @@ -386,7 +367,9 @@ async def locate_all( async def find_all( self, - text: str, + text: Optional[str] = None, + tagname: Optional[str] = None, + attrs: Optional[dict[str, str]] = None, timeout: Union[int, float] = 10, ) -> List[Element]: """ @@ -399,6 +382,24 @@ async def find_all( :param timeout: raise timeout exception when after this many seconds nothing is found. :type timeout: float,int """ + if(text and not tagname): + loop = asyncio.get_running_loop() + now = loop.time() + + text = text.strip() + items = await self.find_elements_by_text(text) + + while not items: + await self.wait() + items = await self.find_elements_by_text(text) + if loop.time() - now > timeout: + raise asyncio.TimeoutError( + "time ran out while waiting for text: %s" % text + ) + await self.sleep(0.5) + return items + elif(tagname or attrs): + loop = asyncio.get_running_loop() now = loop.time() @@ -591,6 +592,156 @@ async def query_selector( if not node: return return element.create(node, self, doc) + + # async def find_element_by_tagname_attrs( + # self, + # tagname: str, + # attrs: dict[str, str], + # ) -> Element | None: + # """ + # Finds and returns the first element matching the tagname and attributes. + + # :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). + # :type tagname: str + # :param attrs: A dictionary of attribute-value pairs to match. + # :type attrs: dict[str, str] + + # :return: A single element or None if no match is found. + # :rtype: Element + # """ + # async def traverse(node, parent_tree): + # """ + # Recursive traversal of the DOM and shadow DOM to find the targeted element. + # """ + # if not node: + # return None + + # # check if the node matches the tag and attribute criteria + # matches_tagname = node.node_type == 1 and node.node_name.lower() == tagname.lower() + # matches_attrs = attrs and node.attributes and all( + # any( + # node.attributes[i] == attr and value in node.attributes[i + 1].split() + # for i in range(0, len(node.attributes), 2) + # ) + # for attr, value in attrs.items() + # ) + + # if matches_tagname and matches_attrs: + # return element.create(node, self, parent_tree) + + # # traverse shadow roots and child nodes + # tasks = list() + # if node.shadow_roots: + # tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) + # if node.children: + # tasks.extend(traverse(child, parent_tree) for child in node.children) + + # for task in asyncio.as_completed(tasks): + # result = await task + # if result: + # return result + + # return None + + # # fetch the document root + # doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) + + # # start traversing the DOM tree + # result = await traverse(doc, doc) + # if result: + # return result + + # # search within iframes concurrently + # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + # iframe_tasks = [ + # traverse(iframe.content_document, iframe.content_document) + # for iframe in iframes + # if iframe.content_document + # ] + + # for iframe_task in asyncio.as_completed(iframe_tasks): + # result = await iframe_task + # if result: + # return result + + # return None + + async def find_element_by_tagname_attrs( + self, + tagname: str | None = None, + attrs: dict[str, str] | None = None, + ) -> Element | None: + """ + Finds and returns the first element matching the tagname and attributes. + + :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). Optional. + :type tagname: str | None + :param attrs: A dictionary of attribute-value pairs to match. Optional. + :type attrs: dict[str, str] | None + + :return: A single element or None if no match is found. + :rtype: Element | None + """ + async def traverse(node, parent_tree): + """ + Recursive traversal of the DOM and shadow DOM to find the targeted element. + """ + if not node: + return None + + # Check tagname and attributes if provided + matches_tagname = ( + not tagname or (node.node_type == 1 and node.node_name.lower() == tagname.lower()) + ) + matches_attrs = ( + not attrs or (node.attributes and all( + any( + node.attributes[i] == attr and value in node.attributes[i + 1].split() + for i in range(0, len(node.attributes), 2) + ) + for attr, value in attrs.items() + )) + ) + + if matches_tagname and matches_attrs: + return element.create(node, self, parent_tree) + + # Traverse shadow roots and child nodes + tasks = list() + if node.shadow_roots: + tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) + if node.children: + tasks.extend(traverse(child, parent_tree) for child in node.children) + + for task in asyncio.as_completed(tasks): + result = await task + if result: + return result + + return None + + # Fetch the document root + doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) + + # Start traversing the DOM tree + result = await traverse(doc, doc) + if result: + return result + + # Search within iframes concurrently + iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + iframe_tasks = [ + traverse(iframe.content_document, iframe.content_document) + for iframe in iframes + if iframe.content_document + ] + + for iframe_task in asyncio.as_completed(iframe_tasks): + result = await iframe_task + if result: + return result + + return None async def locate_element_by_tagname_attribute_value( self, @@ -671,6 +822,151 @@ async def traverse(node, parent_tree): return result return None + + # async def find_elements_by_tagname_attrs( + # self, + # tagname: str, + # attrs: dict[str, str], + # ) -> list[Element]: + # """ + # Locates and returns all elements with the specified tagname and matching attributes. + + # :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). + # :type tagname: str + # :param attrs: A dictionary of attributes and their corresponding values to match. + # :type attrs: dict[str, str] + + # :return: List of matching elements. + # :rtype: list[Element] + # """ + # elements = list() + + # async def traverse(node, parent_tree): + # """ + # Recursive traversal of the DOM, including shadow DOM and iframes, to collect all matching elements. + # """ + # if not node: + # return + + # # Check if the node matches the tag and all attribute-value pairs in attrs + # if ( + # node.node_type == 1 # Element node + # and node.node_name.lower() == tagname.lower() + # and node.attributes + # and all( + # any( + # node.attributes[i] == attr and value in node.attributes[i + 1].split() + # for i in range(0, len(node.attributes), 2) + # ) + # for attr, value in attrs.items() + # ) + # ): # if we find a matching element, append it to our list of results + # elements.append(element.create(node, self, parent_tree)) + + # tasks = list() + + # # traverse shadow roots if present + # if node.shadow_roots: + # tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) + + # # traverse child nodes + # if node.children: + # tasks.extend(traverse(child, parent_tree) for child in node.children) + + # # process all tasks concurrently + # if tasks: + # await asyncio.gather(*tasks) + + # # fetch the document root + # doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) + + # # start traversing the main document + # await traverse(doc, doc) + + # # search within iframes concurrently + # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + # iframe_tasks = [ + # traverse(iframe.content_document, iframe.content_document) + # for iframe in iframes + # if iframe.content_document + # ] + + # if iframe_tasks: + # await asyncio.gather(*iframe_tasks) + + # return elements + + async def find_elements_by_tagname_attrs( + self, + tagname: Optional[str] = None, + attrs: Optional[dict[str, str]] = None, + ) -> list[Element]: + """ + Finds and returns all elements with the specified tagname and matching attributes. + + :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). + :type tagname: str + :param attrs: A dictionary of attributes and their corresponding values to match. + :type attrs: dict[str, str] + + :return: List of matching elements. + :rtype: list[Element] + """ + elements = list() + + async def traverse(node, parent_tree): + """ + Recursive traversal of the DOM, including shadow DOM and iframes, to collect all matching elements. + """ + if not node: + return None + + # check if the node matches the tagname and attribute-value pairs + if ( + node.node_type == 1 # Element node + and (not tagname or node.node_name.lower() == tagname.lower()) + and node.attributes + and (not attrs or all( + any( + node.attributes[i] == attr and value in node.attributes[i + 1].split() + for i in range(0, len(node.attributes), 2) + ) + for attr, value in attrs.items() + )) + ): + elements.append(element.create(node, self, parent_tree)) + + tasks = list() + + # traverse shadow roots + if node.shadow_roots: + tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) + + # traverse child nodes + if node.children: + tasks.extend(traverse(child, parent_tree) for child in node.children) + + if tasks: + await asyncio.gather(*tasks) + + # fetch the document root + doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) + + # traverse the DOM tree + await traverse(doc, doc) + + # handle iframes + iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + iframe_tasks = [ + traverse(iframe.content_document, iframe.content_document) + for iframe in iframes + if iframe.content_document + ] + + if iframe_tasks: + await asyncio.gather(*iframe_tasks) + + return elements async def locate_elements_by_tagname_attributes( self, From dc7b2a02ecd818f96c5a62cfa4e145d883f0c073 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Wed, 22 Jan 2025 20:32:53 +0100 Subject: [PATCH 06/24] finalizing --- zendriver/core/tab.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 3fd22650..4a19fc37 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -399,22 +399,25 @@ async def find_all( await self.sleep(0.5) return items elif(tagname or attrs): - - loop = asyncio.get_running_loop() - now = loop.time() + loop = asyncio.get_running_loop() + start_time = loop.time() - text = text.strip() - items = await self.find_elements_by_text(text) + tagname = tagname.strip().upper() + attrs = {k.strip(): v.strip() for k, v in attrs.items()} - while not items: - await self.wait() - items = await self.find_elements_by_text(text) - if loop.time() - now > timeout: - raise asyncio.TimeoutError( - "time ran out while waiting for text: %s" % text - ) - await self.sleep(0.5) - return items + items = await self.find_elements_by_tagname_attrs(tagname, attrs) + while not item: + await self.wait() + item = await self.find_elements_by_tagname_attrs(tagname, attrs) + if loop.time() - start_time > timeout: + raise asyncio.TimeoutError( + f"Time ran out while waiting for element: {tagname}, with attributes: {attrs}" + ) + await self.sleep(0.5) + return items + elif(not text and not tagname and not attrs): + # raising an error in case neither text nor tagname values were provided + raise ValueError("You must provide either tagname, attrs, or text to locate the elements.") async def select_all( self, selector: str, timeout: Union[int, float] = 10, include_frames=False From 12eab344922aae08ff85a99f92b320897b6c1d2f Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Wed, 22 Jan 2025 20:59:47 +0100 Subject: [PATCH 07/24] added functionality to find elements using their tagname or attributes for a more efficient way to find elements, users have total control over which method they want to search; either by tagname, attributes, or text --- zendriver/core/tab.py | 592 ++++++++---------------------------------- 1 file changed, 109 insertions(+), 483 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 4a19fc37..7e7fd106 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -162,47 +162,6 @@ async def open_external_inspector(self): import webbrowser webbrowser.open(self.inspector_url) - - async def locate( - self, - tagname: str, - attribute: str, - value: str, - timeout: Union[int, float] = 10, - ) -> Element: - """ - locate a single element by tagname, attribute, and value - can also be used to wait for such element to appear. - - :param tagname: tagname of the element to search for - :type tagname: str - :param attribute: the attribute we will be filtering the element by - :type attribute: str - :param value: the value we will be checking the attribute to narrow our list of elements as much as possible - :type value: str - - :param timeout: raise timeout exception when after this many seconds nothing is found. - :type timeout: float,int - """ - loop = asyncio.get_running_loop() - start_time = loop.time() - - tagname, attribute, value = tagname.strip().upper(), attribute.strip(), value.strip() - - item = await self.locate_element_by_tagname_attribute_value( - tagname, attribute, value - ) - while not item: - await self.wait() - item = await self.locate_element_by_tagname_attribute_value( - tagname, attribute, value - ) - if loop.time() - start_time > timeout: - raise asyncio.TimeoutError( - f"time ran out while waiting for element: {tagname}[{attribute}=\"{value}\"]" - ) - await self.sleep(0.5) - return item async def find( self, @@ -246,7 +205,7 @@ async def find( :type timeout: float,int """ - if(text and not tagname): + if(text and not tagname and not attrs): # searches by text only if tagname and attrs weren't provided because it is more efficient to search by tagname and attrs. loop = asyncio.get_running_loop() start_time = loop.time() @@ -270,8 +229,14 @@ async def find( loop = asyncio.get_running_loop() start_time = loop.time() - tagname = tagname.strip().upper() - attrs = {k.strip(): v.strip() for k, v in attrs.items()} + if(tagname): + tagname = tagname.strip().upper() + else: + tagname = None + if(attrs): + attrs = {k.strip(): v.strip() for k, v in attrs.items()} + else: + attrs = None item = await self.find_element_by_tagname_attrs(tagname, attrs) while not item: @@ -285,7 +250,7 @@ async def find( return item elif(not text and not tagname and not attrs): # raising an error in case neither text nor tagname values were provided - raise ValueError("You must provide either tagname, attrs, or text to locate an element.") + raise ValueError("You must provide either tagname, attrs, or text to find an element.") async def select( self, @@ -319,52 +284,6 @@ async def select( await self.sleep(0.5) return item - async def locate_all( - self, - tagname: str, - attribute: str, - value: str, - timeout: Union[int, float] = 10, - ) -> List[Element]: - """ - locate multiple elements by tagname, attribute, and value - can also be used to wait for such element to appear. - - :param tagname: tagname of the element to search for - :type tagname: str - :param attribute: the attribute we will be filtering the element by - :type attribute: str - :param value: the value we will be checking the attribute to narrow our list of elements as much as possible - :type value: str - - :param timeout: raise timeout exception when after this many seconds nothing is found. - :type timeout: float,int - """ - loop = asyncio.get_running_loop() - now = loop.time() - - tagname, attribute, value = tagname.strip().upper(), attribute.strip(), value.strip() - - items = await self.locate_elements_by_tagname_attribute_value( - tagname = tagname, - attribute = attribute, - value = value - ) - - while not items: - await self.wait() - items = await self.locate_elements_by_tagname_attribute_value( - tagname = tagname, - attribute = attribute, - value = value - ) - if loop.time() - now > timeout: - raise asyncio.TimeoutError( - f"time ran out while waiting for elements: {tagname}[{attribute}=\"{value}\"]" - ) - await self.sleep(0.5) - return items - async def find_all( self, text: Optional[str] = None, @@ -382,7 +301,7 @@ async def find_all( :param timeout: raise timeout exception when after this many seconds nothing is found. :type timeout: float,int """ - if(text and not tagname): + if(text and not tagname and not attrs): # searches by text only if tagname and attrs weren't provided because it is more efficient to search by tagname and attrs. loop = asyncio.get_running_loop() now = loop.time() @@ -402,13 +321,19 @@ async def find_all( loop = asyncio.get_running_loop() start_time = loop.time() - tagname = tagname.strip().upper() - attrs = {k.strip(): v.strip() for k, v in attrs.items()} + if(tagname): + tagname = tagname.strip().upper() + else: + tagname = None + if(attrs): + attrs = {k.strip(): v.strip() for k, v in attrs.items()} + else: + attrs = None items = await self.find_elements_by_tagname_attrs(tagname, attrs) - while not item: + while not items: await self.wait() - item = await self.find_elements_by_tagname_attrs(tagname, attrs) + items = await self.find_elements_by_tagname_attrs(tagname, attrs) if loop.time() - start_time > timeout: raise asyncio.TimeoutError( f"Time ran out while waiting for element: {tagname}, with attributes: {attrs}" @@ -417,7 +342,7 @@ async def find_all( return items elif(not text and not tagname and not attrs): # raising an error in case neither text nor tagname values were provided - raise ValueError("You must provide either tagname, attrs, or text to locate the elements.") + raise ValueError("You must provide either tagname, attrs, or text to find elements.") async def select_all( self, selector: str, timeout: Union[int, float] = 10, include_frames=False @@ -595,79 +520,6 @@ async def query_selector( if not node: return return element.create(node, self, doc) - - # async def find_element_by_tagname_attrs( - # self, - # tagname: str, - # attrs: dict[str, str], - # ) -> Element | None: - # """ - # Finds and returns the first element matching the tagname and attributes. - - # :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). - # :type tagname: str - # :param attrs: A dictionary of attribute-value pairs to match. - # :type attrs: dict[str, str] - - # :return: A single element or None if no match is found. - # :rtype: Element - # """ - # async def traverse(node, parent_tree): - # """ - # Recursive traversal of the DOM and shadow DOM to find the targeted element. - # """ - # if not node: - # return None - - # # check if the node matches the tag and attribute criteria - # matches_tagname = node.node_type == 1 and node.node_name.lower() == tagname.lower() - # matches_attrs = attrs and node.attributes and all( - # any( - # node.attributes[i] == attr and value in node.attributes[i + 1].split() - # for i in range(0, len(node.attributes), 2) - # ) - # for attr, value in attrs.items() - # ) - - # if matches_tagname and matches_attrs: - # return element.create(node, self, parent_tree) - - # # traverse shadow roots and child nodes - # tasks = list() - # if node.shadow_roots: - # tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) - # if node.children: - # tasks.extend(traverse(child, parent_tree) for child in node.children) - - # for task in asyncio.as_completed(tasks): - # result = await task - # if result: - # return result - - # return None - - # # fetch the document root - # doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) - - # # start traversing the DOM tree - # result = await traverse(doc, doc) - # if result: - # return result - - # # search within iframes concurrently - # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - # iframe_tasks = [ - # traverse(iframe.content_document, iframe.content_document) - # for iframe in iframes - # if iframe.content_document - # ] - - # for iframe_task in asyncio.as_completed(iframe_tasks): - # result = await iframe_task - # if result: - # return result - - # return None async def find_element_by_tagname_attrs( self, @@ -692,7 +544,7 @@ async def traverse(node, parent_tree): if not node: return None - # Check tagname and attributes if provided + # check tagname and attributes if provided matches_tagname = ( not tagname or (node.node_type == 1 and node.node_name.lower() == tagname.lower()) ) @@ -709,7 +561,7 @@ async def traverse(node, parent_tree): if matches_tagname and matches_attrs: return element.create(node, self, parent_tree) - # Traverse shadow roots and child nodes + # traverse shadow roots and child nodes tasks = list() if node.shadow_roots: tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) @@ -723,86 +575,6 @@ async def traverse(node, parent_tree): return None - # Fetch the document root - doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) - - # Start traversing the DOM tree - result = await traverse(doc, doc) - if result: - return result - - # Search within iframes concurrently - iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - iframe_tasks = [ - traverse(iframe.content_document, iframe.content_document) - for iframe in iframes - if iframe.content_document - ] - - for iframe_task in asyncio.as_completed(iframe_tasks): - result = await iframe_task - if result: - return result - - return None - - async def locate_element_by_tagname_attribute_value( - self, - tagname: str, - attrs: dict[str, str], - ) -> Element | None: - """ - locates and returns the first element containing , or best match - - :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'..). - :type tagname: str - :param attribute: The attribute to match (e.g., 'id', 'name'..). - :type attribute: str - :param value: The value of the attribute to match. - :type value: str - - :return: A single element - :rtype: Element - """ - async def traverse(node, parent_tree): - """ - recursive traversal of the DOM and shadow DOM to find out targeted element. - """ - if not node: - return None - - # check if the node matches the tag and attribute criteria - if ( - node.node_type == 1 # element node - and node.node_name.lower() == tagname.lower() - and node.attributes - and all( - any( - node.attributes[i] == attr and value in node.attributes[i + 1].split() - for i in range(0, len(node.attributes), 2) - ) - for attr, value in attrs.items() - ) - ): - return element.create(node, self, parent_tree) - - tasks = list() - - # traverse shadow roots if they exist - if node.shadow_roots: - tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) - - # traverse child nodes - if node.children: - tasks.extend(traverse(child, parent_tree) for child in node.children) - - for task in asyncio.as_completed(tasks): - result = await task - if result: - return result - - return None - # fetch the document root doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) @@ -826,79 +598,6 @@ async def traverse(node, parent_tree): return None - # async def find_elements_by_tagname_attrs( - # self, - # tagname: str, - # attrs: dict[str, str], - # ) -> list[Element]: - # """ - # Locates and returns all elements with the specified tagname and matching attributes. - - # :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). - # :type tagname: str - # :param attrs: A dictionary of attributes and their corresponding values to match. - # :type attrs: dict[str, str] - - # :return: List of matching elements. - # :rtype: list[Element] - # """ - # elements = list() - - # async def traverse(node, parent_tree): - # """ - # Recursive traversal of the DOM, including shadow DOM and iframes, to collect all matching elements. - # """ - # if not node: - # return - - # # Check if the node matches the tag and all attribute-value pairs in attrs - # if ( - # node.node_type == 1 # Element node - # and node.node_name.lower() == tagname.lower() - # and node.attributes - # and all( - # any( - # node.attributes[i] == attr and value in node.attributes[i + 1].split() - # for i in range(0, len(node.attributes), 2) - # ) - # for attr, value in attrs.items() - # ) - # ): # if we find a matching element, append it to our list of results - # elements.append(element.create(node, self, parent_tree)) - - # tasks = list() - - # # traverse shadow roots if present - # if node.shadow_roots: - # tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) - - # # traverse child nodes - # if node.children: - # tasks.extend(traverse(child, parent_tree) for child in node.children) - - # # process all tasks concurrently - # if tasks: - # await asyncio.gather(*tasks) - - # # fetch the document root - # doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) - - # # start traversing the main document - # await traverse(doc, doc) - - # # search within iframes concurrently - # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - # iframe_tasks = [ - # traverse(iframe.content_document, iframe.content_document) - # for iframe in iframes - # if iframe.content_document - # ] - - # if iframe_tasks: - # await asyncio.gather(*iframe_tasks) - - # return elements - async def find_elements_by_tagname_attrs( self, tagname: Optional[str] = None, @@ -970,164 +669,6 @@ async def traverse(node, parent_tree): await asyncio.gather(*iframe_tasks) return elements - - async def locate_elements_by_tagname_attributes( - self, - tagname: str, - attrs: dict[str, str], - ) -> list[Element]: - """ - Locates and returns all elements with the specified tagname and matching attributes. - - :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). - :type tagname: str - :param attrs: A dictionary of attributes and their corresponding values to match. - :type attrs: dict[str, str] - - :return: List of matching elements. - :rtype: list[Element] - """ - elements = list() - - async def traverse(node, parent_tree): - """ - Recursive traversal of the DOM, including shadow DOM and iframes, to collect all matching elements. - """ - if not node: - return - - # Check if the node matches the tag and all attribute-value pairs in attrs - if ( - node.node_type == 1 # Element node - and node.node_name.lower() == tagname.lower() - and node.attributes - and all( - any( - node.attributes[i] == attr and value in node.attributes[i + 1].split() - for i in range(0, len(node.attributes), 2) - ) - for attr, value in attrs.items() - ) - ): # if we find a matching element, append it to our list of results - elements.append(element.create(node, self, parent_tree)) - - tasks = list() - - # traverse shadow roots if present - if node.shadow_roots: - tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) - - # traverse child nodes - if node.children: - tasks.extend(traverse(child, parent_tree) for child in node.children) - - # process all tasks concurrently - if tasks: - await asyncio.gather(*tasks) - - # fetch the document root - doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) - - # start traversing the main document - await traverse(doc, doc) - - # search within iframes concurrently - iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - iframe_tasks = [ - traverse(iframe.content_document, iframe.content_document) - for iframe in iframes - if iframe.content_document - ] - - if iframe_tasks: - await asyncio.gather(*iframe_tasks) - - return elements - - async def find_elements_by_text( - self, - text: str, - tag_hint: Optional[str] = None, - ) -> list[Element]: - """ - returns element which match the given text. - please note: this may (or will) also return any other element (like inline scripts), - which happen to contain that text. - - :param text: - :type text: - :param tag_hint: when provided, narrows down search to only elements which match given tag eg: a, div, script, span - :type tag_hint: str - :return: - :rtype: - """ - text = text.strip() - doc = await self.send(cdp.dom.get_document(-1, True)) - search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) - if nresult: - node_ids = await self.send( - cdp.dom.get_search_results(search_id, 0, nresult) - ) - else: - node_ids = [] - - await self.send(cdp.dom.discard_search_results(search_id)) - - items = [] - for nid in node_ids: - node = util.filter_recurse(doc, lambda n: n.node_id == nid) - if not node: - node = await self.send(cdp.dom.resolve_node(node_id=nid)) - if not node: - continue - # remote_object = await self.send(cdp.dom.resolve_node(backend_node_id=node.backend_node_id)) - # node_id = await self.send(cdp.dom.request_node(object_id=remote_object.object_id)) - try: - elem = element.create(node, self, doc) - except: # noqa - continue - if elem.node_type == 3: - # if found element is a text node (which is plain text, and useless for our purpose), - # we return the parent element of the node (which is often a tag which can have text between their - # opening and closing tags (that is most tags, except for example "img" and "video", "br") - - if not elem.parent: - # check if parent actually has a parent and update it to be absolutely sure - await elem.update() - - items.append( - elem.parent or elem - ) # when it really has no parent, use the text node itself - continue - else: - # just add the element itself - items.append(elem) - - # since we already fetched the entire doc, including shadow and frames - # let's also search through the iframes - iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - if iframes: - iframes_elems = [ - element.create(iframe, self, iframe.content_document) - for iframe in iframes - ] - for iframe_elem in iframes_elems: - if iframe_elem.content_document: - iframe_text_nodes = util.filter_recurse_all( - iframe_elem, - lambda node: node.node_type == 3 # noqa - and text.lower() in node.node_value.lower(), - ) - if iframe_text_nodes: - iframe_text_elems = [ - element.create(text_node, self, iframe_elem.tree) - for text_node in iframe_text_nodes - ] - items.extend( - text_node.parent for text_node in iframe_text_elems - ) - await self.send(cdp.dom.disable()) - return items or [] async def find_element_by_text( self, @@ -1227,6 +768,91 @@ async def find_element_by_text( return None + async def find_elements_by_text( + self, + text: str, + tag_hint: Optional[str] = None, + ) -> list[Element]: + """ + returns element which match the given text. + please note: this may (or will) also return any other element (like inline scripts), + which happen to contain that text. + + :param text: + :type text: + :param tag_hint: when provided, narrows down search to only elements which match given tag eg: a, div, script, span + :type tag_hint: str + :return: + :rtype: + """ + text = text.strip() + doc = await self.send(cdp.dom.get_document(-1, True)) + search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) + if nresult: + node_ids = await self.send( + cdp.dom.get_search_results(search_id, 0, nresult) + ) + else: + node_ids = [] + + await self.send(cdp.dom.discard_search_results(search_id)) + + items = [] + for nid in node_ids: + node = util.filter_recurse(doc, lambda n: n.node_id == nid) + if not node: + node = await self.send(cdp.dom.resolve_node(node_id=nid)) + if not node: + continue + # remote_object = await self.send(cdp.dom.resolve_node(backend_node_id=node.backend_node_id)) + # node_id = await self.send(cdp.dom.request_node(object_id=remote_object.object_id)) + try: + elem = element.create(node, self, doc) + except: # noqa + continue + if elem.node_type == 3: + # if found element is a text node (which is plain text, and useless for our purpose), + # we return the parent element of the node (which is often a tag which can have text between their + # opening and closing tags (that is most tags, except for example "img" and "video", "br") + + if not elem.parent: + # check if parent actually has a parent and update it to be absolutely sure + await elem.update() + + items.append( + elem.parent or elem + ) # when it really has no parent, use the text node itself + continue + else: + # just add the element itself + items.append(elem) + + # since we already fetched the entire doc, including shadow and frames + # let's also search through the iframes + iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + if iframes: + iframes_elems = [ + element.create(iframe, self, iframe.content_document) + for iframe in iframes + ] + for iframe_elem in iframes_elems: + if iframe_elem.content_document: + iframe_text_nodes = util.filter_recurse_all( + iframe_elem, + lambda node: node.node_type == 3 # noqa + and text.lower() in node.node_value.lower(), + ) + if iframe_text_nodes: + iframe_text_elems = [ + element.create(text_node, self, iframe_elem.tree) + for text_node in iframe_text_nodes + ] + items.extend( + text_node.parent for text_node in iframe_text_elems + ) + await self.send(cdp.dom.disable()) + return items or [] + async def back(self): """ history back From cbdbf0699dcc9c8ae04175c3154f6f3da8cf2409 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Wed, 22 Jan 2025 21:28:39 +0100 Subject: [PATCH 08/24] added functionality to wait for elements using either their tagname, attributes, or both --- zendriver/core/tab.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 7e7fd106..b6da23f8 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -1282,6 +1282,8 @@ async def scroll_up(self, amount=25): async def wait_for( self, + tagname: Optional[str] = None, + attrs: Optional[dict[str, str]] = None, selector: str | None = None, text: str | None = None, timeout: int | float = 10, @@ -1294,6 +1296,10 @@ async def wait_for( it will block for a maximum of seconds, after which an TimeoutError will be raised + :param tagname: element tagname + :type tagname: str + :param attrs: dictionary of attributes + :type attrs: dictionary :param selector: css selector :type selector: :param text: text @@ -1306,6 +1312,24 @@ async def wait_for( """ loop = asyncio.get_running_loop() start_time = loop.time() + if tagname or attrs: # waiting for an element using either their tagname, attributes, or both + if(not tagname): # in case attrs were provided but not tagname + tagname = None + if(not attrs): # in case tagname was provided but not attrs + attrs = None + item = await self.find( + tagname = tagname, + attrs = attrs + ) + while not item and loop.time() - start_time < timeout: + item = await self.find( + tagname = tagname, + attrs = attrs + ) + await self.sleep(0.5) + + if item: + return item if selector: item = await self.query_selector(selector) while not item and loop.time() - start_time < timeout: From 520f17c73a5ba858c1b391046c2a081d4d68aa55 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Thu, 30 Jan 2025 18:38:20 +0100 Subject: [PATCH 09/24] adding functionality to search with tagname, attrs, and text combines --- zendriver/core/tab.py | 44 +++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index b6da23f8..277b56fe 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -225,18 +225,19 @@ async def find( ) await self.sleep(0.5) return item + elif(tagname or attrs): loop = asyncio.get_running_loop() start_time = loop.time() - if(tagname): - tagname = tagname.strip().upper() - else: - tagname = None - if(attrs): - attrs = {k.strip(): v.strip() for k, v in attrs.items()} - else: - attrs = None + tagname = tagname.strip().upper() if tagname else None + + attrs = {k.strip(): v.strip() for k, v in attrs.items()} if attrs else None + + if(text): + if(not attrs): + attrs = dict + attrs['innerText'] = text.strip() item = await self.find_element_by_tagname_attrs(tagname, attrs) while not item: @@ -317,18 +318,19 @@ async def find_all( ) await self.sleep(0.5) return items + elif(tagname or attrs): loop = asyncio.get_running_loop() start_time = loop.time() - if(tagname): - tagname = tagname.strip().upper() - else: - tagname = None - if(attrs): - attrs = {k.strip(): v.strip() for k, v in attrs.items()} - else: - attrs = None + tagname = tagname.strip().upper() if tagname else None + + attrs = {k.strip(): v.strip() for k, v in attrs.items()} if attrs else None + + if(text): + if(not attrs): + attrs = dict + attrs['innerText'] = text.strip() items = await self.find_elements_by_tagname_attrs(tagname, attrs) while not items: @@ -692,6 +694,11 @@ async def find_element_by_text( :return: :rtype: """ + return self.find_element_by_tagname_attrs( + attrs = { + "innerText":text.strip() + } + ) doc = await self.send(cdp.dom.get_document(-1, True)) text = text.strip() search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) @@ -785,6 +792,11 @@ async def find_elements_by_text( :return: :rtype: """ + return self.find_elements_by_tagname_attrs( + attrs = { + "innerText":text.strip() + } + ) text = text.strip() doc = await self.send(cdp.dom.get_document(-1, True)) search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) From 9c465ca12ced77a83f86e74c79c59ae3405a77b8 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Fri, 31 Jan 2025 19:28:15 +0100 Subject: [PATCH 10/24] searching by text is now done with attributes by adding the text value into attrs dictionary, ex: attrs['innerText'] = textValue --- zendriver/core/tab.py | 338 ++++++++++++++++++++++-------------------- 1 file changed, 180 insertions(+), 158 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 277b56fe..41396bb4 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -209,15 +209,16 @@ async def find( loop = asyncio.get_running_loop() start_time = loop.time() - text = text.strip() + attrs = dict() + attrs['innerText'] = text.strip() - item = await self.find_element_by_text( - text, best_match, return_enclosing_element + item = await self.find_element_by_tagname_attrs( + attrs = attrs ) while not item: await self.wait() - item = await self.find_element_by_text( - text, best_match, return_enclosing_element + item = await self.find_element_by_tagname_attrs( + attrs = attrs ) if loop.time() - start_time > timeout: raise asyncio.TimeoutError( @@ -303,28 +304,35 @@ async def find_all( :type timeout: float,int """ if(text and not tagname and not attrs): # searches by text only if tagname and attrs weren't provided because it is more efficient to search by tagname and attrs. + loop = asyncio.get_running_loop() now = loop.time() - text = text.strip() - items = await self.find_elements_by_text(text) + attrs = dict() + attrs['innerText'] = text.strip() # even if only text is provided, we're gonna + + items = await self.find_elements_by_tagname_attrs( + attrs = attrs + ) while not items: await self.wait() - items = await self.find_elements_by_text(text) + items = await self.find_elements_by_tagname_attrs( + attrs = attrs + ) if loop.time() - now > timeout: raise asyncio.TimeoutError( "time ran out while waiting for text: %s" % text ) await self.sleep(0.5) return items - + elif(tagname or attrs): + loop = asyncio.get_running_loop() start_time = loop.time() tagname = tagname.strip().upper() if tagname else None - attrs = {k.strip(): v.strip() for k, v in attrs.items()} if attrs else None if(text): @@ -333,6 +341,7 @@ async def find_all( attrs['innerText'] = text.strip() items = await self.find_elements_by_tagname_attrs(tagname, attrs) + while not items: await self.wait() items = await self.find_elements_by_tagname_attrs(tagname, attrs) @@ -342,6 +351,7 @@ async def find_all( ) await self.sleep(0.5) return items + elif(not text and not tagname and not attrs): # raising an error in case neither text nor tagname values were provided raise ValueError("You must provide either tagname, attrs, or text to find elements.") @@ -699,81 +709,81 @@ async def find_element_by_text( "innerText":text.strip() } ) - doc = await self.send(cdp.dom.get_document(-1, True)) - text = text.strip() - search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) - - node_ids = await self.send(cdp.dom.get_search_results(search_id, 0, nresult)) - await self.send(cdp.dom.discard_search_results(search_id)) - - if not node_ids: - node_ids = [] - items = [] - for nid in node_ids: - node = util.filter_recurse(doc, lambda n: n.node_id == nid) - if node is None: - continue - - try: - elem = element.create(node, self, doc) - except: # noqa - continue - if elem.node_type == 3: - # if found element is a text node (which is plain text, and useless for our purpose), - # we return the parent element of the node (which is often a tag which can have text between their - # opening and closing tags (that is most tags, except for example "img" and "video", "br") - - if not elem.parent: - # check if parent actually has a parent and update it to be absolutely sure - await elem.update() - - items.append( - elem.parent or elem - ) # when it really has no parent, use the text node itself - continue - else: - # just add the element itself - items.append(elem) - - # since we already fetched the entire doc, including shadow and frames - # let's also search through the iframes - iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - if iframes: - iframes_elems = [ - element.create(iframe, self, iframe.content_document) - for iframe in iframes - ] - for iframe_elem in iframes_elems: - iframe_text_nodes = util.filter_recurse_all( - iframe_elem, - lambda node: node.node_type == 3 # noqa - and text.lower() in node.node_value.lower(), - ) - if iframe_text_nodes: - iframe_text_elems = [ - element.create(text_node, self, iframe_elem.tree) - for text_node in iframe_text_nodes - ] - items.extend(text_node.parent for text_node in iframe_text_elems) - try: - if not items: - return None - if best_match: - closest_by_length = min( - items, key=lambda el: abs(len(text) - len(el.text_all)) - ) - elem = closest_by_length or items[0] - - return elem - else: - # naively just return the first result - for elem in items: - if elem: - return elem - finally: - await self.send(cdp.dom.disable()) - - return None + # doc = await self.send(cdp.dom.get_document(-1, True)) + # text = text.strip() + # search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) + + # node_ids = await self.send(cdp.dom.get_search_results(search_id, 0, nresult)) + # await self.send(cdp.dom.discard_search_results(search_id)) + + # if not node_ids: + # node_ids = [] + # items = [] + # for nid in node_ids: + # node = util.filter_recurse(doc, lambda n: n.node_id == nid) + # if node is None: + # continue + + # try: + # elem = element.create(node, self, doc) + # except: # noqa + # continue + # if elem.node_type == 3: + # # if found element is a text node (which is plain text, and useless for our purpose), + # # we return the parent element of the node (which is often a tag which can have text between their + # # opening and closing tags (that is most tags, except for example "img" and "video", "br") + + # if not elem.parent: + # # check if parent actually has a parent and update it to be absolutely sure + # await elem.update() + + # items.append( + # elem.parent or elem + # ) # when it really has no parent, use the text node itself + # continue + # else: + # # just add the element itself + # items.append(elem) + + # # since we already fetched the entire doc, including shadow and frames + # # let's also search through the iframes + # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + # if iframes: + # iframes_elems = [ + # element.create(iframe, self, iframe.content_document) + # for iframe in iframes + # ] + # for iframe_elem in iframes_elems: + # iframe_text_nodes = util.filter_recurse_all( + # iframe_elem, + # lambda node: node.node_type == 3 # noqa + # and text.lower() in node.node_value.lower(), + # ) + # if iframe_text_nodes: + # iframe_text_elems = [ + # element.create(text_node, self, iframe_elem.tree) + # for text_node in iframe_text_nodes + # ] + # items.extend(text_node.parent for text_node in iframe_text_elems) + # try: + # if not items: + # return None + # if best_match: + # closest_by_length = min( + # items, key=lambda el: abs(len(text) - len(el.text_all)) + # ) + # elem = closest_by_length or items[0] + + # return elem + # else: + # # naively just return the first result + # for elem in items: + # if elem: + # return elem + # finally: + # await self.send(cdp.dom.disable()) + + # return None async def find_elements_by_text( self, @@ -797,73 +807,73 @@ async def find_elements_by_text( "innerText":text.strip() } ) - text = text.strip() - doc = await self.send(cdp.dom.get_document(-1, True)) - search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) - if nresult: - node_ids = await self.send( - cdp.dom.get_search_results(search_id, 0, nresult) - ) - else: - node_ids = [] - - await self.send(cdp.dom.discard_search_results(search_id)) - - items = [] - for nid in node_ids: - node = util.filter_recurse(doc, lambda n: n.node_id == nid) - if not node: - node = await self.send(cdp.dom.resolve_node(node_id=nid)) - if not node: - continue - # remote_object = await self.send(cdp.dom.resolve_node(backend_node_id=node.backend_node_id)) - # node_id = await self.send(cdp.dom.request_node(object_id=remote_object.object_id)) - try: - elem = element.create(node, self, doc) - except: # noqa - continue - if elem.node_type == 3: - # if found element is a text node (which is plain text, and useless for our purpose), - # we return the parent element of the node (which is often a tag which can have text between their - # opening and closing tags (that is most tags, except for example "img" and "video", "br") - - if not elem.parent: - # check if parent actually has a parent and update it to be absolutely sure - await elem.update() - - items.append( - elem.parent or elem - ) # when it really has no parent, use the text node itself - continue - else: - # just add the element itself - items.append(elem) - - # since we already fetched the entire doc, including shadow and frames - # let's also search through the iframes - iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - if iframes: - iframes_elems = [ - element.create(iframe, self, iframe.content_document) - for iframe in iframes - ] - for iframe_elem in iframes_elems: - if iframe_elem.content_document: - iframe_text_nodes = util.filter_recurse_all( - iframe_elem, - lambda node: node.node_type == 3 # noqa - and text.lower() in node.node_value.lower(), - ) - if iframe_text_nodes: - iframe_text_elems = [ - element.create(text_node, self, iframe_elem.tree) - for text_node in iframe_text_nodes - ] - items.extend( - text_node.parent for text_node in iframe_text_elems - ) - await self.send(cdp.dom.disable()) - return items or [] + # text = text.strip() + # doc = await self.send(cdp.dom.get_document(-1, True)) + # search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) + # if nresult: + # node_ids = await self.send( + # cdp.dom.get_search_results(search_id, 0, nresult) + # ) + # else: + # node_ids = [] + + # await self.send(cdp.dom.discard_search_results(search_id)) + + # items = [] + # for nid in node_ids: + # node = util.filter_recurse(doc, lambda n: n.node_id == nid) + # if not node: + # node = await self.send(cdp.dom.resolve_node(node_id=nid)) + # if not node: + # continue + # # remote_object = await self.send(cdp.dom.resolve_node(backend_node_id=node.backend_node_id)) + # # node_id = await self.send(cdp.dom.request_node(object_id=remote_object.object_id)) + # try: + # elem = element.create(node, self, doc) + # except: # noqa + # continue + # if elem.node_type == 3: + # # if found element is a text node (which is plain text, and useless for our purpose), + # # we return the parent element of the node (which is often a tag which can have text between their + # # opening and closing tags (that is most tags, except for example "img" and "video", "br") + + # if not elem.parent: + # # check if parent actually has a parent and update it to be absolutely sure + # await elem.update() + + # items.append( + # elem.parent or elem + # ) # when it really has no parent, use the text node itself + # continue + # else: + # # just add the element itself + # items.append(elem) + + # # since we already fetched the entire doc, including shadow and frames + # # let's also search through the iframes + # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + # if iframes: + # iframes_elems = [ + # element.create(iframe, self, iframe.content_document) + # for iframe in iframes + # ] + # for iframe_elem in iframes_elems: + # if iframe_elem.content_document: + # iframe_text_nodes = util.filter_recurse_all( + # iframe_elem, + # lambda node: node.node_type == 3 # noqa + # and text.lower() in node.node_value.lower(), + # ) + # if iframe_text_nodes: + # iframe_text_elems = [ + # element.create(text_node, self, iframe_elem.tree) + # for text_node in iframe_text_nodes + # ] + # items.extend( + # text_node.parent for text_node in iframe_text_elems + # ) + # await self.send(cdp.dom.disable()) + # return items or [] async def back(self): """ @@ -1324,11 +1334,13 @@ async def wait_for( """ loop = asyncio.get_running_loop() start_time = loop.time() + if tagname or attrs: # waiting for an element using either their tagname, attributes, or both - if(not tagname): # in case attrs were provided but not tagname - tagname = None - if(not attrs): # in case tagname was provided but not attrs - attrs = None + + if(not tagname): tagname = None # in case attrs were provided but not tagname + + if(not attrs): attrs = None # in case tagname was provided but not attrs + item = await self.find( tagname = tagname, attrs = attrs @@ -1342,7 +1354,9 @@ async def wait_for( if item: return item + if selector: + item = await self.query_selector(selector) while not item and loop.time() - start_time < timeout: item = await self.query_selector(selector) @@ -1350,10 +1364,18 @@ async def wait_for( if item: return item + if text: - item = await self.find_element_by_text(text) + + attrs = dict() + attrs['innerText'] = text.strip() + item = await self.find_element_by_tagname_attrs( + attrs = attrs + ) while not item and loop.time() - start_time < timeout: - item = await self.find_element_by_text(text) + item = await self.find_element_by_tagname_attrs( + attrs = attrs + ) await self.sleep(0.5) if item: From 058c069deb6f489a9eeef65c391d53d82aadac42 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Tue, 4 Feb 2025 17:25:03 +0100 Subject: [PATCH 11/24] adding functionality to search by any combination of tagname, attrs, and text --- zendriver/core/tab.py | 347 ++++++++++++++++++++++-------------------- 1 file changed, 178 insertions(+), 169 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 41396bb4..3085c601 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -212,12 +212,12 @@ async def find( attrs = dict() attrs['innerText'] = text.strip() - item = await self.find_element_by_tagname_attrs( + item = await self.find_element_by_tagname_attrs_text( attrs = attrs ) while not item: await self.wait() - item = await self.find_element_by_tagname_attrs( + item = await self.find_element_by_tagname_attrs_text( attrs = attrs ) if loop.time() - start_time > timeout: @@ -240,10 +240,10 @@ async def find( attrs = dict attrs['innerText'] = text.strip() - item = await self.find_element_by_tagname_attrs(tagname, attrs) + item = await self.find_element_by_tagname_attrs_text(tagname, attrs) while not item: await self.wait() - item = await self.find_element_by_tagname_attrs(tagname, attrs) + item = await self.find_element_by_tagname_attrs_text(tagname, attrs) if loop.time() - start_time > timeout: raise asyncio.TimeoutError( f"Time ran out while waiting for element: {tagname}, with attributes: {attrs}" @@ -533,10 +533,11 @@ async def query_selector( return return element.create(node, self, doc) - async def find_element_by_tagname_attrs( + async def find_element_by_tagname_attrs_text_text( self, tagname: str | None = None, attrs: dict[str, str] | None = None, + text: str | None = None ) -> Element | None: """ Finds and returns the first element matching the tagname and attributes. @@ -549,6 +550,7 @@ async def find_element_by_tagname_attrs( :return: A single element or None if no match is found. :rtype: Element | None """ + async def traverse(node, parent_tree): """ Recursive traversal of the DOM and shadow DOM to find the targeted element. @@ -556,27 +558,39 @@ async def traverse(node, parent_tree): if not node: return None - # check tagname and attributes if provided + # create an element to check for the conditions we're looking for + elem = element.create(node, self, parent_tree) + + # check for conditions matches_tagname = ( - not tagname or (node.node_type == 1 and node.node_name.lower() == tagname.lower()) - ) + not tagname or (elem.tag_name and tagname.strip().lower() == elem.tag_name.strip().lower()) + ) # this condition evaluates to True if tagname was not provided; no filtering by tagname. Or if tagname equals our targeted element's tagname + matches_attrs = ( - not attrs or (node.attributes and all( + not attrs or (elem.attributes and all( any( - node.attributes[i] == attr and value in node.attributes[i + 1].split() - for i in range(0, len(node.attributes), 2) + elem.attributes[i] == attr and value in elem.attributes[i + 1].split() + for i in range(0, len(elem.attributes), 2) ) for attr, value in attrs.items() )) - ) + ) # this condition evaluates to True if attrs was not provided; no filtering by attrs. Or if the provided attrs are in our targeted element's attributes + + matches_text = ( + not text or (elem.text and text.strip().lower() in elem.text.strip().lower()) + ) # this condition evaluates to True if text was not provided; no filtering by text. Or if text is in our targeted element's text - if matches_tagname and matches_attrs: - return element.create(node, self, parent_tree) + # if all conditions match, we return the target element + if matches_tagname and matches_attrs and matches_text: + return elem - # traverse shadow roots and child nodes tasks = list() + + # traverse shadow roots nodes if node.shadow_roots: tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) + + # traverse child nodes if node.children: tasks.extend(traverse(child, parent_tree) for child in node.children) @@ -704,86 +718,86 @@ async def find_element_by_text( :return: :rtype: """ - return self.find_element_by_tagname_attrs( - attrs = { - "innerText":text.strip() - } - ) - # doc = await self.send(cdp.dom.get_document(-1, True)) - # text = text.strip() - # search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) - - # node_ids = await self.send(cdp.dom.get_search_results(search_id, 0, nresult)) - # await self.send(cdp.dom.discard_search_results(search_id)) - - # if not node_ids: - # node_ids = [] - # items = [] - # for nid in node_ids: - # node = util.filter_recurse(doc, lambda n: n.node_id == nid) - # if node is None: - # continue - - # try: - # elem = element.create(node, self, doc) - # except: # noqa - # continue - # if elem.node_type == 3: - # # if found element is a text node (which is plain text, and useless for our purpose), - # # we return the parent element of the node (which is often a tag which can have text between their - # # opening and closing tags (that is most tags, except for example "img" and "video", "br") - - # if not elem.parent: - # # check if parent actually has a parent and update it to be absolutely sure - # await elem.update() - - # items.append( - # elem.parent or elem - # ) # when it really has no parent, use the text node itself - # continue - # else: - # # just add the element itself - # items.append(elem) - - # # since we already fetched the entire doc, including shadow and frames - # # let's also search through the iframes - # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - # if iframes: - # iframes_elems = [ - # element.create(iframe, self, iframe.content_document) - # for iframe in iframes - # ] - # for iframe_elem in iframes_elems: - # iframe_text_nodes = util.filter_recurse_all( - # iframe_elem, - # lambda node: node.node_type == 3 # noqa - # and text.lower() in node.node_value.lower(), - # ) - # if iframe_text_nodes: - # iframe_text_elems = [ - # element.create(text_node, self, iframe_elem.tree) - # for text_node in iframe_text_nodes - # ] - # items.extend(text_node.parent for text_node in iframe_text_elems) - # try: - # if not items: - # return None - # if best_match: - # closest_by_length = min( - # items, key=lambda el: abs(len(text) - len(el.text_all)) - # ) - # elem = closest_by_length or items[0] - - # return elem - # else: - # # naively just return the first result - # for elem in items: - # if elem: - # return elem - # finally: - # await self.send(cdp.dom.disable()) - - # return None + doc = await self.send(cdp.dom.get_document(-1, True)) + text = text.strip() + search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) + + if nresult: + node_ids = await self.send( + cdp.dom.get_search_results(search_id, 0, nresult) + ) + else: + node_ids = [] + await self.send(cdp.dom.discard_search_results(search_id)) + + if not node_ids: + node_ids = [] + items = [] + for nid in node_ids: + node = util.filter_recurse(doc, lambda n: n.node_id == nid) + if node is None: + continue + + try: + elem = element.create(node, self, doc) + except: # noqa + continue + if elem.node_type == 3: + # if found element is a text node (which is plain text, and useless for our purpose), + # we return the parent element of the node (which is often a tag which can have text between their + # opening and closing tags (that is most tags, except for example "img" and "video", "br") + + if not elem.parent: + # check if parent actually has a parent and update it to be absolutely sure + await elem.update() + + items.append( + elem.parent or elem + ) # when it really has no parent, use the text node itself + continue + else: + # just add the element itself + items.append(elem) + + # since we already fetched the entire doc, including shadow and frames + # let's also search through the iframes + iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + if iframes: + iframes_elems = [ + element.create(iframe, self, iframe.content_document) + for iframe in iframes + ] + for iframe_elem in iframes_elems: + iframe_text_nodes = util.filter_recurse_all( + iframe_elem, + lambda node: node.node_type == 3 # noqa + and text.lower() in node.node_value.lower(), + ) + if iframe_text_nodes: + iframe_text_elems = [ + element.create(text_node, self, iframe_elem.tree) + for text_node in iframe_text_nodes + ] + items.extend(text_node.parent for text_node in iframe_text_elems) + try: + if not items: + return None + if best_match: + closest_by_length = min( + items, key=lambda el: abs(len(text) - len(el.text_all)) + ) + elem = closest_by_length or items[0] + + return elem + else: + # naively just return the first result + for elem in items: + if elem: + return elem + finally: + await self.send(cdp.dom.disable()) + + return None async def find_elements_by_text( self, @@ -802,78 +816,73 @@ async def find_elements_by_text( :return: :rtype: """ - return self.find_elements_by_tagname_attrs( - attrs = { - "innerText":text.strip() - } - ) - # text = text.strip() - # doc = await self.send(cdp.dom.get_document(-1, True)) - # search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) - # if nresult: - # node_ids = await self.send( - # cdp.dom.get_search_results(search_id, 0, nresult) - # ) - # else: - # node_ids = [] - - # await self.send(cdp.dom.discard_search_results(search_id)) - - # items = [] - # for nid in node_ids: - # node = util.filter_recurse(doc, lambda n: n.node_id == nid) - # if not node: - # node = await self.send(cdp.dom.resolve_node(node_id=nid)) - # if not node: - # continue - # # remote_object = await self.send(cdp.dom.resolve_node(backend_node_id=node.backend_node_id)) - # # node_id = await self.send(cdp.dom.request_node(object_id=remote_object.object_id)) - # try: - # elem = element.create(node, self, doc) - # except: # noqa - # continue - # if elem.node_type == 3: - # # if found element is a text node (which is plain text, and useless for our purpose), - # # we return the parent element of the node (which is often a tag which can have text between their - # # opening and closing tags (that is most tags, except for example "img" and "video", "br") - - # if not elem.parent: - # # check if parent actually has a parent and update it to be absolutely sure - # await elem.update() - - # items.append( - # elem.parent or elem - # ) # when it really has no parent, use the text node itself - # continue - # else: - # # just add the element itself - # items.append(elem) - - # # since we already fetched the entire doc, including shadow and frames - # # let's also search through the iframes - # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - # if iframes: - # iframes_elems = [ - # element.create(iframe, self, iframe.content_document) - # for iframe in iframes - # ] - # for iframe_elem in iframes_elems: - # if iframe_elem.content_document: - # iframe_text_nodes = util.filter_recurse_all( - # iframe_elem, - # lambda node: node.node_type == 3 # noqa - # and text.lower() in node.node_value.lower(), - # ) - # if iframe_text_nodes: - # iframe_text_elems = [ - # element.create(text_node, self, iframe_elem.tree) - # for text_node in iframe_text_nodes - # ] - # items.extend( - # text_node.parent for text_node in iframe_text_elems - # ) - # await self.send(cdp.dom.disable()) - # return items or [] + text = text.strip() + doc = await self.send(cdp.dom.get_document(-1, True)) + search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) + if nresult: + node_ids = await self.send( + cdp.dom.get_search_results(search_id, 0, nresult) + ) + else: + node_ids = [] + + await self.send(cdp.dom.discard_search_results(search_id)) + + items = [] + for nid in node_ids: + node = util.filter_recurse(doc, lambda n: n.node_id == nid) + if not node: + node = await self.send(cdp.dom.resolve_node(node_id=nid)) + if not node: + continue + # remote_object = await self.send(cdp.dom.resolve_node(backend_node_id=node.backend_node_id)) + # node_id = await self.send(cdp.dom.request_node(object_id=remote_object.object_id)) + try: + elem = element.create(node, self, doc) + except: # noqa + continue + if elem.node_type == 3: + # if found element is a text node (which is plain text, and useless for our purpose), + # we return the parent element of the node (which is often a tag which can have text between their + # opening and closing tags (that is most tags, except for example "img" and "video", "br") + + if not elem.parent: + # check if parent actually has a parent and update it to be absolutely sure + await elem.update() + + items.append( + elem.parent or elem + ) # when it really has no parent, use the text node itself + continue + else: + # just add the element itself + items.append(elem) + + # since we already fetched the entire doc, including shadow and frames + # let's also search through the iframes + iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + if iframes: + iframes_elems = [ + element.create(iframe, self, iframe.content_document) + for iframe in iframes + ] + for iframe_elem in iframes_elems: + if iframe_elem.content_document: + iframe_text_nodes = util.filter_recurse_all( + iframe_elem, + lambda node: node.node_type == 3 # noqa + and text.lower() in node.node_value.lower(), + ) + if iframe_text_nodes: + iframe_text_elems = [ + element.create(text_node, self, iframe_elem.tree) + for text_node in iframe_text_nodes + ] + items.extend( + text_node.parent for text_node in iframe_text_elems + ) + await self.send(cdp.dom.disable()) + return items or [] async def back(self): """ @@ -1369,11 +1378,11 @@ async def wait_for( attrs = dict() attrs['innerText'] = text.strip() - item = await self.find_element_by_tagname_attrs( + item = await self.find_element_by_tagname_attrs_text( attrs = attrs ) while not item and loop.time() - start_time < timeout: - item = await self.find_element_by_tagname_attrs( + item = await self.find_element_by_tagname_attrs_text( attrs = attrs ) await self.sleep(0.5) From df2ad7b5574155df6383b1ccb0fe1ad9ea8b7da7 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Tue, 4 Feb 2025 17:30:39 +0100 Subject: [PATCH 12/24] adding find_elements_by_tagname_attrs_text to search for elements with any combination of tagname, attrs, and text --- zendriver/core/tab.py | 74 ++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 3085c601..975e41d8 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -311,13 +311,13 @@ async def find_all( attrs = dict() attrs['innerText'] = text.strip() # even if only text is provided, we're gonna - items = await self.find_elements_by_tagname_attrs( + items = await self.find_elements_by_tagname_attrs_text( attrs = attrs ) while not items: await self.wait() - items = await self.find_elements_by_tagname_attrs( + items = await self.find_elements_by_tagname_attrs_text( attrs = attrs ) if loop.time() - now > timeout: @@ -340,11 +340,11 @@ async def find_all( attrs = dict attrs['innerText'] = text.strip() - items = await self.find_elements_by_tagname_attrs(tagname, attrs) + items = await self.find_elements_by_tagname_attrs_text(tagname, attrs) while not items: await self.wait() - items = await self.find_elements_by_tagname_attrs(tagname, attrs) + items = await self.find_elements_by_tagname_attrs_text(tagname, attrs) if loop.time() - start_time > timeout: raise asyncio.TimeoutError( f"Time ran out while waiting for element: {tagname}, with attributes: {attrs}" @@ -624,49 +624,58 @@ async def traverse(node, parent_tree): return None - async def find_elements_by_tagname_attrs( - self, - tagname: Optional[str] = None, - attrs: Optional[dict[str, str]] = None, - ) -> list[Element]: + async def find_elements_by_tagname_attrs_text_text( + self, + tagname: Optional[str] = None, + attrs: Optional[dict[str, str]] = None, + text: Optional[str] = None + ) -> list[Element]: """ - Finds and returns all elements with the specified tagname and matching attributes. + Finds and returns all elements matching the tagname, attributes, and optional innerText. :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). - :type tagname: str :param attrs: A dictionary of attributes and their corresponding values to match. - :type attrs: dict[str, str] - + :param text: The expected innerText of the element. :return: List of matching elements. :rtype: list[Element] """ - elements = list() + + elements = [] async def traverse(node, parent_tree): - """ - Recursive traversal of the DOM, including shadow DOM and iframes, to collect all matching elements. - """ + """Recursive traversal of the DOM and shadow DOM to collect all matching elements.""" if not node: - return None + return + + # create an element to check for the conditions we're looking for + elem = element.create(node, self, parent_tree) - # check if the node matches the tagname and attribute-value pairs - if ( - node.node_type == 1 # Element node - and (not tagname or node.node_name.lower() == tagname.lower()) - and node.attributes - and (not attrs or all( + # check for conditions + matches_tagname = ( + not tagname or (elem.tag_name and tagname.strip().lower() == elem.tag_name.strip().lower()) + ) # this condition evaluates to True if tagname was not provided; no filtering by tagname. Or if tagname equals our targeted element's tagname + + matches_attrs = ( + not attrs or (elem.attributes and all( any( - node.attributes[i] == attr and value in node.attributes[i + 1].split() - for i in range(0, len(node.attributes), 2) + elem.attributes[i] == attr and value in elem.attributes[i + 1].split() + for i in range(0, len(elem.attributes), 2) ) for attr, value in attrs.items() )) - ): - elements.append(element.create(node, self, parent_tree)) + ) # this condition evaluates to True if attrs was not provided; no filtering by attrs. Or if the provided attrs are in our targeted element's attributes + + matches_text = ( + not text or (elem.text and text.strip().lower() in elem.text.strip().lower()) + ) # this condition evaluates to True if text was not provided; no filtering by text. Or if text is in our targeted element's text + + # if all conditions match, add the element to the list of elements to return + if matches_tagname and matches_attrs and matches_text: + elements.append(elem) tasks = list() - # traverse shadow roots + # traverse shadow roots nodes if node.shadow_roots: tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) @@ -674,16 +683,15 @@ async def traverse(node, parent_tree): if node.children: tasks.extend(traverse(child, parent_tree) for child in node.children) - if tasks: - await asyncio.gather(*tasks) + await asyncio.gather(*tasks) # fetch the document root doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) - # traverse the DOM tree + # start traversing the DOM tree await traverse(doc, doc) - # handle iframes + # search within iframes concurrently iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") iframe_tasks = [ traverse(iframe.content_document, iframe.content_document) From 7aa23471fabdbc62a7606d1dbf4b3f38e32cae48 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Tue, 4 Feb 2025 17:32:55 +0100 Subject: [PATCH 13/24] modifying find and find_all to work with the new functionality of identifying elements --- zendriver/core/tab.py | 145 +++++++++++++++--------------------------- 1 file changed, 52 insertions(+), 93 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 975e41d8..4b32a1cd 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -165,11 +165,9 @@ async def open_external_inspector(self): async def find( self, - text: Optional[str] = None, tagname: Optional[str] = None, attrs: Optional[dict[str, str]] = None, - best_match: Optional[bool] = True, - return_enclosing_element = True, + text: Optional[str] = None, timeout: Union[int, float] = 10, ): """ @@ -203,56 +201,38 @@ async def find( :type return_enclosing_element: bool :param timeout: raise timeout exception when after this many seconds nothing is found. :type timeout: float,int - """ + """ - if(text and not tagname and not attrs): # searches by text only if tagname and attrs weren't provided because it is more efficient to search by tagname and attrs. - loop = asyncio.get_running_loop() - start_time = loop.time() + loop = asyncio.get_running_loop() + start_time = loop.time() - attrs = dict() - attrs['innerText'] = text.strip() + tagname = tagname.strip().lower() if tagname else None + attrs = {k.strip(): v.strip() for k, v in attrs.items()} if attrs else None + text = text.strip().lower() if text else None + if(not text and not tagname and not attrs): + # raising an error in case neither text nor tagname values were provided + raise ValueError("You must provide either tagname, attrs, or text to find an element.") + + item = await self.find_element_by_tagname_attrs_text( + tagname = tagname, + attrs = attrs, + text = text + ) + while(not item): + await self.wait() item = await self.find_element_by_tagname_attrs_text( - attrs = attrs + tagname = tagname, + attrs = attrs, + text = text ) - while not item: - await self.wait() - item = await self.find_element_by_tagname_attrs_text( - attrs = attrs + if(loop.time() - start_time > timeout): + raise asyncio.TimeoutError( + f'Time ran out while waiting for element with tagname: {tagname}, attributess: {attrs}, text:{text}' ) - if loop.time() - start_time > timeout: - raise asyncio.TimeoutError( - "Time ran out while waiting for text: %s" % text - ) - await self.sleep(0.5) - return item - - elif(tagname or attrs): - loop = asyncio.get_running_loop() - start_time = loop.time() + await self.sleep(0.5) - tagname = tagname.strip().upper() if tagname else None - - attrs = {k.strip(): v.strip() for k, v in attrs.items()} if attrs else None - - if(text): - if(not attrs): - attrs = dict - attrs['innerText'] = text.strip() - - item = await self.find_element_by_tagname_attrs_text(tagname, attrs) - while not item: - await self.wait() - item = await self.find_element_by_tagname_attrs_text(tagname, attrs) - if loop.time() - start_time > timeout: - raise asyncio.TimeoutError( - f"Time ran out while waiting for element: {tagname}, with attributes: {attrs}" - ) - await self.sleep(0.5) - return item - elif(not text and not tagname and not attrs): - # raising an error in case neither text nor tagname values were provided - raise ValueError("You must provide either tagname, attrs, or text to find an element.") + return item async def select( self, @@ -288,9 +268,9 @@ async def select( async def find_all( self, - text: Optional[str] = None, tagname: Optional[str] = None, attrs: Optional[dict[str, str]] = None, + text: Optional[str] = None, timeout: Union[int, float] = 10, ) -> List[Element]: """ @@ -303,58 +283,37 @@ async def find_all( :param timeout: raise timeout exception when after this many seconds nothing is found. :type timeout: float,int """ - if(text and not tagname and not attrs): # searches by text only if tagname and attrs weren't provided because it is more efficient to search by tagname and attrs. - - loop = asyncio.get_running_loop() - now = loop.time() - attrs = dict() - attrs['innerText'] = text.strip() # even if only text is provided, we're gonna + loop = asyncio.get_running_loop() + start_time = loop.time() + tagname = tagname.strip().lower() if tagname else None + attrs = {k.strip(): v.strip() for k, v in attrs.items()} if attrs else None + text = text.strip().lower() if text else None + + if(not text and not tagname and not attrs): + # raising an error in case neither text nor tagname values were provided + raise ValueError("You must provide either tagname, attrs, or text to find elements.") + + items = await self.find_elements_by_tagname_attrs_text( + tagname = tagname, + attrs = attrs, + text = text + ) + while not items: + await self.wait() items = await self.find_elements_by_tagname_attrs_text( - attrs = attrs + tagname = tagname, + attrs = attrs, + text = text ) - - while not items: - await self.wait() - items = await self.find_elements_by_tagname_attrs_text( - attrs = attrs + if loop.time() - start_time > timeout: + raise asyncio.TimeoutError( + f'Time ran out while waiting for elements with tagname: {tagname}, attributess: {attrs}, text:{text}' ) - if loop.time() - now > timeout: - raise asyncio.TimeoutError( - "time ran out while waiting for text: %s" % text - ) - await self.sleep(0.5) - return items - - elif(tagname or attrs): - - loop = asyncio.get_running_loop() - start_time = loop.time() - - tagname = tagname.strip().upper() if tagname else None - attrs = {k.strip(): v.strip() for k, v in attrs.items()} if attrs else None - - if(text): - if(not attrs): - attrs = dict - attrs['innerText'] = text.strip() - - items = await self.find_elements_by_tagname_attrs_text(tagname, attrs) - - while not items: - await self.wait() - items = await self.find_elements_by_tagname_attrs_text(tagname, attrs) - if loop.time() - start_time > timeout: - raise asyncio.TimeoutError( - f"Time ran out while waiting for element: {tagname}, with attributes: {attrs}" - ) - await self.sleep(0.5) - return items + await self.sleep(0.5) - elif(not text and not tagname and not attrs): - # raising an error in case neither text nor tagname values were provided - raise ValueError("You must provide either tagname, attrs, or text to find elements.") + return items async def select_all( self, selector: str, timeout: Union[int, float] = 10, include_frames=False From e16dfc80a5662657dd327c0d62664359972b7105 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Tue, 4 Feb 2025 17:49:45 +0100 Subject: [PATCH 14/24] modifying find_element_by_text and find_elements_by_text to depend on the new functions. modifying wait_for to work with the new functions. also, bug fixes --- zendriver/core/tab.py | 353 ++++++++++++++++++++---------------------- 1 file changed, 169 insertions(+), 184 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 4b32a1cd..1ddfdf7b 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -492,7 +492,7 @@ async def query_selector( return return element.create(node, self, doc) - async def find_element_by_tagname_attrs_text_text( + async def find_element_by_tagname_attrs_text( self, tagname: str | None = None, attrs: dict[str, str] | None = None, @@ -583,7 +583,7 @@ async def traverse(node, parent_tree): return None - async def find_elements_by_tagname_attrs_text_text( + async def find_elements_by_tagname_attrs_text( self, tagname: Optional[str] = None, attrs: Optional[dict[str, str]] = None, @@ -666,110 +666,105 @@ async def traverse(node, parent_tree): async def find_element_by_text( self, text: str, - best_match: Optional[bool] = False, - return_enclosing_element: Optional[bool] = True, ) -> Element | None: """ finds and returns the first element containing , or best match :param text: :type text: - :param best_match: when True, which is MUCH more expensive (thus much slower), - will find the closest match based on length. - this could help tremendously, when for example you search for "login", you'd probably want the login button element, - and not thousands of scripts,meta,headings containing a string of "login". - - :type best_match: bool - :param return_enclosing_element: - :type return_enclosing_element: :return: :rtype: """ - doc = await self.send(cdp.dom.get_document(-1, True)) - text = text.strip() - search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) - - if nresult: - node_ids = await self.send( - cdp.dom.get_search_results(search_id, 0, nresult) - ) + if(not text): + raise ValueError('You must provide a text value to find an element with.') else: - node_ids = [] - await self.send(cdp.dom.discard_search_results(search_id)) - - if not node_ids: - node_ids = [] - items = [] - for nid in node_ids: - node = util.filter_recurse(doc, lambda n: n.node_id == nid) - if node is None: - continue - - try: - elem = element.create(node, self, doc) - except: # noqa - continue - if elem.node_type == 3: - # if found element is a text node (which is plain text, and useless for our purpose), - # we return the parent element of the node (which is often a tag which can have text between their - # opening and closing tags (that is most tags, except for example "img" and "video", "br") - - if not elem.parent: - # check if parent actually has a parent and update it to be absolutely sure - await elem.update() - - items.append( - elem.parent or elem - ) # when it really has no parent, use the text node itself - continue - else: - # just add the element itself - items.append(elem) - - # since we already fetched the entire doc, including shadow and frames - # let's also search through the iframes - iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - if iframes: - iframes_elems = [ - element.create(iframe, self, iframe.content_document) - for iframe in iframes - ] - for iframe_elem in iframes_elems: - iframe_text_nodes = util.filter_recurse_all( - iframe_elem, - lambda node: node.node_type == 3 # noqa - and text.lower() in node.node_value.lower(), - ) - if iframe_text_nodes: - iframe_text_elems = [ - element.create(text_node, self, iframe_elem.tree) - for text_node in iframe_text_nodes - ] - items.extend(text_node.parent for text_node in iframe_text_elems) - try: - if not items: - return None - if best_match: - closest_by_length = min( - items, key=lambda el: abs(len(text) - len(el.text_all)) - ) - elem = closest_by_length or items[0] - - return elem - else: - # naively just return the first result - for elem in items: - if elem: - return elem - finally: - await self.send(cdp.dom.disable()) - - return None + return await self.find( + text = text + ) + # doc = await self.send(cdp.dom.get_document(-1, True)) + # text = text.strip() + # search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) + + # if nresult: + # node_ids = await self.send( + # cdp.dom.get_search_results(search_id, 0, nresult) + # ) + # else: + # node_ids = [] + # await self.send(cdp.dom.discard_search_results(search_id)) + + # if not node_ids: + # node_ids = [] + # items = [] + # for nid in node_ids: + # node = util.filter_recurse(doc, lambda n: n.node_id == nid) + # if node is None: + # continue + + # try: + # elem = element.create(node, self, doc) + # except: # noqa + # continue + # if elem.node_type == 3: + # # if found element is a text node (which is plain text, and useless for our purpose), + # # we return the parent element of the node (which is often a tag which can have text between their + # # opening and closing tags (that is most tags, except for example "img" and "video", "br") + + # if not elem.parent: + # # check if parent actually has a parent and update it to be absolutely sure + # await elem.update() + + # items.append( + # elem.parent or elem + # ) # when it really has no parent, use the text node itself + # continue + # else: + # # just add the element itself + # items.append(elem) + + # # since we already fetched the entire doc, including shadow and frames + # # let's also search through the iframes + # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + # if iframes: + # iframes_elems = [ + # element.create(iframe, self, iframe.content_document) + # for iframe in iframes + # ] + # for iframe_elem in iframes_elems: + # iframe_text_nodes = util.filter_recurse_all( + # iframe_elem, + # lambda node: node.node_type == 3 # noqa + # and text.lower() in node.node_value.lower(), + # ) + # if iframe_text_nodes: + # iframe_text_elems = [ + # element.create(text_node, self, iframe_elem.tree) + # for text_node in iframe_text_nodes + # ] + # items.extend(text_node.parent for text_node in iframe_text_elems) + # try: + # if not items: + # return None + # if best_match: + # closest_by_length = min( + # items, key=lambda el: abs(len(text) - len(el.text_all)) + # ) + # elem = closest_by_length or items[0] + + # return elem + # else: + # # naively just return the first result + # for elem in items: + # if elem: + # return elem + # finally: + # await self.send(cdp.dom.disable()) + + # return None async def find_elements_by_text( self, text: str, - tag_hint: Optional[str] = None, ) -> list[Element]: """ returns element which match the given text. @@ -778,78 +773,82 @@ async def find_elements_by_text( :param text: :type text: - :param tag_hint: when provided, narrows down search to only elements which match given tag eg: a, div, script, span - :type tag_hint: str :return: :rtype: """ - text = text.strip() - doc = await self.send(cdp.dom.get_document(-1, True)) - search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) - if nresult: - node_ids = await self.send( - cdp.dom.get_search_results(search_id, 0, nresult) - ) + if(not text): + raise ValueError('You must provide a text value to find elements with.') else: - node_ids = [] - - await self.send(cdp.dom.discard_search_results(search_id)) - - items = [] - for nid in node_ids: - node = util.filter_recurse(doc, lambda n: n.node_id == nid) - if not node: - node = await self.send(cdp.dom.resolve_node(node_id=nid)) - if not node: - continue - # remote_object = await self.send(cdp.dom.resolve_node(backend_node_id=node.backend_node_id)) - # node_id = await self.send(cdp.dom.request_node(object_id=remote_object.object_id)) - try: - elem = element.create(node, self, doc) - except: # noqa - continue - if elem.node_type == 3: - # if found element is a text node (which is plain text, and useless for our purpose), - # we return the parent element of the node (which is often a tag which can have text between their - # opening and closing tags (that is most tags, except for example "img" and "video", "br") - - if not elem.parent: - # check if parent actually has a parent and update it to be absolutely sure - await elem.update() - - items.append( - elem.parent or elem - ) # when it really has no parent, use the text node itself - continue - else: - # just add the element itself - items.append(elem) - - # since we already fetched the entire doc, including shadow and frames - # let's also search through the iframes - iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - if iframes: - iframes_elems = [ - element.create(iframe, self, iframe.content_document) - for iframe in iframes - ] - for iframe_elem in iframes_elems: - if iframe_elem.content_document: - iframe_text_nodes = util.filter_recurse_all( - iframe_elem, - lambda node: node.node_type == 3 # noqa - and text.lower() in node.node_value.lower(), - ) - if iframe_text_nodes: - iframe_text_elems = [ - element.create(text_node, self, iframe_elem.tree) - for text_node in iframe_text_nodes - ] - items.extend( - text_node.parent for text_node in iframe_text_elems - ) - await self.send(cdp.dom.disable()) - return items or [] + return await self.find_all( + text = text + ) + # text = text.strip() + # doc = await self.send(cdp.dom.get_document(-1, True)) + # search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) + # if nresult: + # node_ids = await self.send( + # cdp.dom.get_search_results(search_id, 0, nresult) + # ) + # else: + # node_ids = [] + + # await self.send(cdp.dom.discard_search_results(search_id)) + + # items = [] + # for nid in node_ids: + # node = util.filter_recurse(doc, lambda n: n.node_id == nid) + # if not node: + # node = await self.send(cdp.dom.resolve_node(node_id=nid)) + # if not node: + # continue + # # remote_object = await self.send(cdp.dom.resolve_node(backend_node_id=node.backend_node_id)) + # # node_id = await self.send(cdp.dom.request_node(object_id=remote_object.object_id)) + # try: + # elem = element.create(node, self, doc) + # except: # noqa + # continue + # if elem.node_type == 3: + # # if found element is a text node (which is plain text, and useless for our purpose), + # # we return the parent element of the node (which is often a tag which can have text between their + # # opening and closing tags (that is most tags, except for example "img" and "video", "br") + + # if not elem.parent: + # # check if parent actually has a parent and update it to be absolutely sure + # await elem.update() + + # items.append( + # elem.parent or elem + # ) # when it really has no parent, use the text node itself + # continue + # else: + # # just add the element itself + # items.append(elem) + + # # since we already fetched the entire doc, including shadow and frames + # # let's also search through the iframes + # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + # if iframes: + # iframes_elems = [ + # element.create(iframe, self, iframe.content_document) + # for iframe in iframes + # ] + # for iframe_elem in iframes_elems: + # if iframe_elem.content_document: + # iframe_text_nodes = util.filter_recurse_all( + # iframe_elem, + # lambda node: node.node_type == 3 # noqa + # and text.lower() in node.node_value.lower(), + # ) + # if iframe_text_nodes: + # iframe_text_elems = [ + # element.create(text_node, self, iframe_elem.tree) + # for text_node in iframe_text_nodes + # ] + # items.extend( + # text_node.parent for text_node in iframe_text_elems + # ) + # await self.send(cdp.dom.disable()) + # return items or [] async def back(self): """ @@ -1311,20 +1310,22 @@ async def wait_for( loop = asyncio.get_running_loop() start_time = loop.time() - if tagname or attrs: # waiting for an element using either their tagname, attributes, or both + if tagname or attrs or text: # waiting for an element using either their tagname, attributes, text, or all. - if(not tagname): tagname = None # in case attrs were provided but not tagname - - if(not attrs): attrs = None # in case tagname was provided but not attrs + if not tagname: tagname = None + if not attrs: attrs = None + if not text: text = None item = await self.find( tagname = tagname, - attrs = attrs + attrs = attrs, + text = text ) - while not item and loop.time() - start_time < timeout: + while(not item and loop.time() - start_time < timeout): item = await self.find( tagname = tagname, - attrs = attrs + attrs = attrs, + text = text ) await self.sleep(0.5) @@ -1341,23 +1342,7 @@ async def wait_for( if item: return item - if text: - - attrs = dict() - attrs['innerText'] = text.strip() - item = await self.find_element_by_tagname_attrs_text( - attrs = attrs - ) - while not item and loop.time() - start_time < timeout: - item = await self.find_element_by_tagname_attrs_text( - attrs = attrs - ) - await self.sleep(0.5) - - if item: - return item - - raise asyncio.TimeoutError("time ran out while waiting") + raise asyncio.TimeoutError('Time ran out while waiting.') async def download_file(self, url: str, filename: Optional[PathLike] = None): """ From 224e49f0af4d654f795a4cb412810c85ded7d645 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Tue, 4 Feb 2025 17:50:31 +0100 Subject: [PATCH 15/24] removing unnecessary code --- zendriver/core/tab.py | 147 ------------------------------------------ 1 file changed, 147 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 1ddfdf7b..4f885aa3 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -681,86 +681,6 @@ async def find_element_by_text( return await self.find( text = text ) - # doc = await self.send(cdp.dom.get_document(-1, True)) - # text = text.strip() - # search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) - - # if nresult: - # node_ids = await self.send( - # cdp.dom.get_search_results(search_id, 0, nresult) - # ) - # else: - # node_ids = [] - # await self.send(cdp.dom.discard_search_results(search_id)) - - # if not node_ids: - # node_ids = [] - # items = [] - # for nid in node_ids: - # node = util.filter_recurse(doc, lambda n: n.node_id == nid) - # if node is None: - # continue - - # try: - # elem = element.create(node, self, doc) - # except: # noqa - # continue - # if elem.node_type == 3: - # # if found element is a text node (which is plain text, and useless for our purpose), - # # we return the parent element of the node (which is often a tag which can have text between their - # # opening and closing tags (that is most tags, except for example "img" and "video", "br") - - # if not elem.parent: - # # check if parent actually has a parent and update it to be absolutely sure - # await elem.update() - - # items.append( - # elem.parent or elem - # ) # when it really has no parent, use the text node itself - # continue - # else: - # # just add the element itself - # items.append(elem) - - # # since we already fetched the entire doc, including shadow and frames - # # let's also search through the iframes - # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - # if iframes: - # iframes_elems = [ - # element.create(iframe, self, iframe.content_document) - # for iframe in iframes - # ] - # for iframe_elem in iframes_elems: - # iframe_text_nodes = util.filter_recurse_all( - # iframe_elem, - # lambda node: node.node_type == 3 # noqa - # and text.lower() in node.node_value.lower(), - # ) - # if iframe_text_nodes: - # iframe_text_elems = [ - # element.create(text_node, self, iframe_elem.tree) - # for text_node in iframe_text_nodes - # ] - # items.extend(text_node.parent for text_node in iframe_text_elems) - # try: - # if not items: - # return None - # if best_match: - # closest_by_length = min( - # items, key=lambda el: abs(len(text) - len(el.text_all)) - # ) - # elem = closest_by_length or items[0] - - # return elem - # else: - # # naively just return the first result - # for elem in items: - # if elem: - # return elem - # finally: - # await self.send(cdp.dom.disable()) - - # return None async def find_elements_by_text( self, @@ -782,73 +702,6 @@ async def find_elements_by_text( return await self.find_all( text = text ) - # text = text.strip() - # doc = await self.send(cdp.dom.get_document(-1, True)) - # search_id, nresult = await self.send(cdp.dom.perform_search(text, True)) - # if nresult: - # node_ids = await self.send( - # cdp.dom.get_search_results(search_id, 0, nresult) - # ) - # else: - # node_ids = [] - - # await self.send(cdp.dom.discard_search_results(search_id)) - - # items = [] - # for nid in node_ids: - # node = util.filter_recurse(doc, lambda n: n.node_id == nid) - # if not node: - # node = await self.send(cdp.dom.resolve_node(node_id=nid)) - # if not node: - # continue - # # remote_object = await self.send(cdp.dom.resolve_node(backend_node_id=node.backend_node_id)) - # # node_id = await self.send(cdp.dom.request_node(object_id=remote_object.object_id)) - # try: - # elem = element.create(node, self, doc) - # except: # noqa - # continue - # if elem.node_type == 3: - # # if found element is a text node (which is plain text, and useless for our purpose), - # # we return the parent element of the node (which is often a tag which can have text between their - # # opening and closing tags (that is most tags, except for example "img" and "video", "br") - - # if not elem.parent: - # # check if parent actually has a parent and update it to be absolutely sure - # await elem.update() - - # items.append( - # elem.parent or elem - # ) # when it really has no parent, use the text node itself - # continue - # else: - # # just add the element itself - # items.append(elem) - - # # since we already fetched the entire doc, including shadow and frames - # # let's also search through the iframes - # iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - # if iframes: - # iframes_elems = [ - # element.create(iframe, self, iframe.content_document) - # for iframe in iframes - # ] - # for iframe_elem in iframes_elems: - # if iframe_elem.content_document: - # iframe_text_nodes = util.filter_recurse_all( - # iframe_elem, - # lambda node: node.node_type == 3 # noqa - # and text.lower() in node.node_value.lower(), - # ) - # if iframe_text_nodes: - # iframe_text_elems = [ - # element.create(text_node, self, iframe_elem.tree) - # for text_node in iframe_text_nodes - # ] - # items.extend( - # text_node.parent for text_node in iframe_text_elems - # ) - # await self.send(cdp.dom.disable()) - # return items or [] async def back(self): """ From b30a856cd7225bc130023bb1b1c583323ac1b344 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Tue, 4 Feb 2025 17:58:36 +0100 Subject: [PATCH 16/24] adding documentation to the functions --- zendriver/core/tab.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 4f885aa3..14cd8eb1 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -174,15 +174,12 @@ async def find( find single element by text can also be used to wait for such element to appear. + :param tagname: tagname to search for. ex: div, span, input, button.. + :type tagname: str + :param attrs: attributes to search for. ex: {'class':'class1', 'name':'name1', 'id':'123'} + :type attrs: dict :param text: text to search for. note: script contents are also considered text :type text: str - :param best_match: :param best_match: when True (default), it will return the element which has the most - comparable string length. this could help tremendously, when for example - you search for "login", you'd probably want the login button element, - and not thousands of scripts,meta,headings containing a string of "login". - When False, it will return naively just the first match (but is way faster). - :type best_match: bool - :param return_enclosing_element: since we deal with nodes instead of elements, the find function most often returns so called text nodes, which is actually a element of plain text, which is the somehow imaginary "child" of a "span", "p", "script" or any other elements which have text between their opening @@ -277,6 +274,10 @@ async def find_all( find multiple elements by text can also be used to wait for such element to appear. + :param tagname: tagname to search for. ex: div, span, input, button.. + :type tagname: str + :param attrs: attributes to search for. ex: {'class':'class1', 'name':'name1', 'id':'123'} + :type attrs: dict :param text: text to search for. note: script contents are also considered text :type text: str From 210d0a043720f51566329b2180a55c780dca26b4 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Tue, 4 Feb 2025 21:15:55 +0100 Subject: [PATCH 17/24] formatting and linting --- zendriver/core/tab.py | 189 +++++++++++++++++++++++------------------- 1 file changed, 103 insertions(+), 86 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 14cd8eb1..ec677b75 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -207,25 +207,23 @@ async def find( attrs = {k.strip(): v.strip() for k, v in attrs.items()} if attrs else None text = text.strip().lower() if text else None - if(not text and not tagname and not attrs): + if not text and not tagname and not attrs: # raising an error in case neither text nor tagname values were provided - raise ValueError("You must provide either tagname, attrs, or text to find an element.") + raise ValueError( + "You must provide either tagname, attrs, or text to find an element." + ) item = await self.find_element_by_tagname_attrs_text( - tagname = tagname, - attrs = attrs, - text = text + tagname=tagname, attrs=attrs, text=text ) - while(not item): + while not item: await self.wait() item = await self.find_element_by_tagname_attrs_text( - tagname = tagname, - attrs = attrs, - text = text + tagname=tagname, attrs=attrs, text=text ) - if(loop.time() - start_time > timeout): + if loop.time() - start_time > timeout: raise asyncio.TimeoutError( - f'Time ran out while waiting for element with tagname: {tagname}, attributess: {attrs}, text:{text}' + f"Time ran out while waiting for element with tagname: {tagname}, attributess: {attrs}, text:{text}" ) await self.sleep(0.5) @@ -292,25 +290,23 @@ async def find_all( attrs = {k.strip(): v.strip() for k, v in attrs.items()} if attrs else None text = text.strip().lower() if text else None - if(not text and not tagname and not attrs): + if not text and not tagname and not attrs: # raising an error in case neither text nor tagname values were provided - raise ValueError("You must provide either tagname, attrs, or text to find elements.") - + raise ValueError( + "You must provide either tagname, attrs, or text to find elements." + ) + items = await self.find_elements_by_tagname_attrs_text( - tagname = tagname, - attrs = attrs, - text = text + tagname=tagname, attrs=attrs, text=text ) while not items: await self.wait() items = await self.find_elements_by_tagname_attrs_text( - tagname = tagname, - attrs = attrs, - text = text + tagname=tagname, attrs=attrs, text=text ) if loop.time() - start_time > timeout: raise asyncio.TimeoutError( - f'Time ran out while waiting for elements with tagname: {tagname}, attributess: {attrs}, text:{text}' + f"Time ran out while waiting for elements with tagname: {tagname}, attributess: {attrs}, text:{text}" ) await self.sleep(0.5) @@ -494,11 +490,11 @@ async def query_selector( return element.create(node, self, doc) async def find_element_by_tagname_attrs_text( - self, - tagname: str | None = None, - attrs: dict[str, str] | None = None, - text: str | None = None - ) -> Element | None: + self, + tagname: str | None = None, + attrs: dict[str, str] | None = None, + text: str | None = None, + ) -> Element | None: """ Finds and returns the first element matching the tagname and attributes. @@ -523,33 +519,46 @@ async def traverse(node, parent_tree): # check for conditions matches_tagname = ( - not tagname or (elem.tag_name and tagname.strip().lower() == elem.tag_name.strip().lower()) - ) # this condition evaluates to True if tagname was not provided; no filtering by tagname. Or if tagname equals our targeted element's tagname + not tagname + or ( + elem.tag_name + and tagname.strip().lower() == elem.tag_name.strip().lower() + ) + ) # this condition evaluates to True if tagname was not provided; no filtering by tagname. Or if tagname equals our targeted element's tagname matches_attrs = ( - not attrs or (elem.attributes and all( - any( - elem.attributes[i] == attr and value in elem.attributes[i + 1].split() - for i in range(0, len(elem.attributes), 2) + not attrs + or ( + elem.attributes + and all( + any( + elem.attributes[i] == attr + and value in elem.attributes[i + 1].split() + for i in range(0, len(elem.attributes), 2) + ) + for attr, value in attrs.items() ) - for attr, value in attrs.items() - )) - ) # this condition evaluates to True if attrs was not provided; no filtering by attrs. Or if the provided attrs are in our targeted element's attributes + ) + ) # this condition evaluates to True if attrs was not provided; no filtering by attrs. Or if the provided attrs are in our targeted element's attributes matches_text = ( - not text or (elem.text and text.strip().lower() in elem.text.strip().lower()) - ) # this condition evaluates to True if text was not provided; no filtering by text. Or if text is in our targeted element's text + not text + or (elem.text and text.strip().lower() in elem.text.strip().lower()) + ) # this condition evaluates to True if text was not provided; no filtering by text. Or if text is in our targeted element's text # if all conditions match, we return the target element if matches_tagname and matches_attrs and matches_text: return elem - tasks = list() + tasks = [] # traverse shadow roots nodes if node.shadow_roots: - tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) - + tasks.extend( + traverse(shadow_root, parent_tree) + for shadow_root in node.shadow_roots + ) + # traverse child nodes if node.children: tasks.extend(traverse(child, parent_tree) for child in node.children) @@ -588,7 +597,7 @@ async def find_elements_by_tagname_attrs_text( self, tagname: Optional[str] = None, attrs: Optional[dict[str, str]] = None, - text: Optional[str] = None + text: Optional[str] = None, ) -> list[Element]: """ Finds and returns all elements matching the tagname, attributes, and optional innerText. @@ -612,32 +621,45 @@ async def traverse(node, parent_tree): # check for conditions matches_tagname = ( - not tagname or (elem.tag_name and tagname.strip().lower() == elem.tag_name.strip().lower()) - ) # this condition evaluates to True if tagname was not provided; no filtering by tagname. Or if tagname equals our targeted element's tagname + not tagname + or ( + elem.tag_name + and tagname.strip().lower() == elem.tag_name.strip().lower() + ) + ) # this condition evaluates to True if tagname was not provided; no filtering by tagname. Or if tagname equals our targeted element's tagname matches_attrs = ( - not attrs or (elem.attributes and all( - any( - elem.attributes[i] == attr and value in elem.attributes[i + 1].split() - for i in range(0, len(elem.attributes), 2) + not attrs + or ( + elem.attributes + and all( + any( + elem.attributes[i] == attr + and value in elem.attributes[i + 1].split() + for i in range(0, len(elem.attributes), 2) + ) + for attr, value in attrs.items() ) - for attr, value in attrs.items() - )) - ) # this condition evaluates to True if attrs was not provided; no filtering by attrs. Or if the provided attrs are in our targeted element's attributes + ) + ) # this condition evaluates to True if attrs was not provided; no filtering by attrs. Or if the provided attrs are in our targeted element's attributes matches_text = ( - not text or (elem.text and text.strip().lower() in elem.text.strip().lower()) - ) # this condition evaluates to True if text was not provided; no filtering by text. Or if text is in our targeted element's text + not text + or (elem.text and text.strip().lower() in elem.text.strip().lower()) + ) # this condition evaluates to True if text was not provided; no filtering by text. Or if text is in our targeted element's text # if all conditions match, add the element to the list of elements to return if matches_tagname and matches_attrs and matches_text: elements.append(elem) - tasks = list() + tasks = [] # traverse shadow roots nodes if node.shadow_roots: - tasks.extend(traverse(shadow_root, parent_tree) for shadow_root in node.shadow_roots) + tasks.extend( + traverse(shadow_root, parent_tree) + for shadow_root in node.shadow_roots + ) # traverse child nodes if node.children: @@ -676,12 +698,10 @@ async def find_element_by_text( :return: :rtype: """ - if(not text): - raise ValueError('You must provide a text value to find an element with.') + if not text: + raise ValueError("You must provide a text value to find an element with.") else: - return await self.find( - text = text - ) + return await self.find(text=text) async def find_elements_by_text( self, @@ -697,12 +717,10 @@ async def find_elements_by_text( :return: :rtype: """ - if(not text): - raise ValueError('You must provide a text value to find elements with.') + if not text: + raise ValueError("You must provide a text value to find elements with.") else: - return await self.find_all( - text = text - ) + return await self.find_all(text=text) async def back(self): """ @@ -1135,8 +1153,8 @@ async def wait_for( self, tagname: Optional[str] = None, attrs: Optional[dict[str, str]] = None, - selector: str | None = None, - text: str | None = None, + selector: Optional[str] = None, + text: Optional[str] = None, timeout: int | float = 10, ) -> element.Element: """ @@ -1164,30 +1182,25 @@ async def wait_for( loop = asyncio.get_running_loop() start_time = loop.time() - if tagname or attrs or text: # waiting for an element using either their tagname, attributes, text, or all. - - if not tagname: tagname = None - if not attrs: attrs = None - if not text: text = None - - item = await self.find( - tagname = tagname, - attrs = attrs, - text = text - ) - while(not item and loop.time() - start_time < timeout): - item = await self.find( - tagname = tagname, - attrs = attrs, - text = text - ) + if ( + tagname or attrs or text + ): # waiting for an element using either their tagname, attributes, text, or all. + if not tagname: + tagname = None + if not attrs: + attrs = None + if not text: + text = None + + item = await self.find(tagname=tagname, attrs=attrs, text=text) + while not item and loop.time() - start_time < timeout: + item = await self.find(tagname=tagname, attrs=attrs, text=text) await self.sleep(0.5) if item: return item if selector: - item = await self.query_selector(selector) while not item and loop.time() - start_time < timeout: item = await self.query_selector(selector) @@ -1196,7 +1209,7 @@ async def wait_for( if item: return item - raise asyncio.TimeoutError('Time ran out while waiting.') + raise asyncio.TimeoutError("Time ran out while waiting.") async def download_file(self, url: str, filename: Optional[PathLike] = None): """ @@ -1446,6 +1459,8 @@ async def set_local_storage(self, items: dict): def __call__( self, + tagname: str | None = None, + attrs: dict[str, str] | None = None, text: str | None = None, selector: str | None = None, timeout: int | float = 10, @@ -1459,7 +1474,9 @@ def __call__( :return: :rtype: """ - return self.wait_for(text, selector, timeout) + return self.wait_for( + tagname=tagname, attrs=attrs, text=text, selector=selector, timeout=timeout + ) def __eq__(self, other: Any) -> bool: if not isinstance(other, Tab): From 15e753f4e46e636ac5ed3f378ec61ef7ba95579c Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Thu, 6 Feb 2025 19:13:41 +0100 Subject: [PATCH 18/24] added private function _find_elements_by_tagname_attrs_text that handles the logic of finding elements, instead of duplicating the logic; now find_element_by_tagname_attrs_text and find_elements_by_tagname_attrs_text call _find_elements_by_tagname_attrs_text with an argument return_after_first_match indicating whether we're aiming for a single element, or a list of elements --- zendriver/core/tab.py | 131 ++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 88 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index ec677b75..180deed7 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -502,109 +502,62 @@ async def find_element_by_tagname_attrs_text( :type tagname: str | None :param attrs: A dictionary of attribute-value pairs to match. Optional. :type attrs: dict[str, str] | None + :param text: The expected text value of the element. Optional. + :type attrs: str | None :return: A single element or None if no match is found. :rtype: Element | None """ - async def traverse(node, parent_tree): - """ - Recursive traversal of the DOM and shadow DOM to find the targeted element. - """ - if not node: - return None - - # create an element to check for the conditions we're looking for - elem = element.create(node, self, parent_tree) - - # check for conditions - matches_tagname = ( - not tagname - or ( - elem.tag_name - and tagname.strip().lower() == elem.tag_name.strip().lower() - ) - ) # this condition evaluates to True if tagname was not provided; no filtering by tagname. Or if tagname equals our targeted element's tagname + return await self._find_elements_by_tagname_attrs_text( + tagname = tagname, + attrs = attrs, + text = text, + return_after_first_match = True + )[0] - matches_attrs = ( - not attrs - or ( - elem.attributes - and all( - any( - elem.attributes[i] == attr - and value in elem.attributes[i + 1].split() - for i in range(0, len(elem.attributes), 2) - ) - for attr, value in attrs.items() - ) - ) - ) # this condition evaluates to True if attrs was not provided; no filtering by attrs. Or if the provided attrs are in our targeted element's attributes - - matches_text = ( - not text - or (elem.text and text.strip().lower() in elem.text.strip().lower()) - ) # this condition evaluates to True if text was not provided; no filtering by text. Or if text is in our targeted element's text - - # if all conditions match, we return the target element - if matches_tagname and matches_attrs and matches_text: - return elem - - tasks = [] - - # traverse shadow roots nodes - if node.shadow_roots: - tasks.extend( - traverse(shadow_root, parent_tree) - for shadow_root in node.shadow_roots - ) - - # traverse child nodes - if node.children: - tasks.extend(traverse(child, parent_tree) for child in node.children) - - for task in asyncio.as_completed(tasks): - result = await task - if result: - return result - - return None - - # fetch the document root - doc = await self.send(cdp.dom.get_document(depth=-1, pierce=True)) - - # start traversing the DOM tree - result = await traverse(doc, doc) - if result: - return result - - # search within iframes concurrently - iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - iframe_tasks = [ - traverse(iframe.content_document, iframe.content_document) - for iframe in iframes - if iframe.content_document - ] - - for iframe_task in asyncio.as_completed(iframe_tasks): - result = await iframe_task - if result: - return result + async def find_elements_by_tagname_attrs_text( + self, + tagname: Optional[str] = None, + attrs: Optional[dict[str, str]] = None, + text: Optional[str] = None, + ) -> list[Element]: + """ + Finds and returns all elements matching the tagname, attributes, and optional innerText. - return None + :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). Optional. + :type tagname: str | None + :param attrs: A dictionary of attributes and their corresponding values to match. Optional. + :type attrs: dict[str, str] | None + :param text: The expected text value of the element. Optional. + :type attrs: str | None + :return: List of matching elements. + :rtype: list[Element] + """ - async def find_elements_by_tagname_attrs_text( + return await self._find_elements_by_tagname_attrs_text( + tagname = tagname, + attrs = attrs, + text = text, + return_after_first_match = False + ) + + async def _find_elements_by_tagname_attrs_text( self, tagname: Optional[str] = None, attrs: Optional[dict[str, str]] = None, text: Optional[str] = None, + return_after_first_match: bool = False ) -> list[Element]: """ Finds and returns all elements matching the tagname, attributes, and optional innerText. - :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). - :param attrs: A dictionary of attributes and their corresponding values to match. - :param text: The expected innerText of the element. + :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). Optional. + :type tagname: str | None + :param attrs: A dictionary of attributes and their corresponding values to match. Optional. + :type attrs: dict[str, str] | None + :param text: The expected text value of the element. Optional. + :type attrs: str | None :return: List of matching elements. :rtype: list[Element] """ @@ -651,6 +604,8 @@ async def traverse(node, parent_tree): # if all conditions match, add the element to the list of elements to return if matches_tagname and matches_attrs and matches_text: elements.append(elem) + if return_after_first_match: + return elements tasks = [] @@ -684,7 +639,7 @@ async def traverse(node, parent_tree): if iframe_tasks: await asyncio.gather(*iframe_tasks) - return elements + return elements if not return_after_first_match else [None] async def find_element_by_text( self, From 2b14b52705ea4b3659116a27c8264d01d40d14f4 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Thu, 6 Feb 2025 19:16:22 +0100 Subject: [PATCH 19/24] comments --- zendriver/core/tab.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 180deed7..267ddc9d 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -604,7 +604,7 @@ async def traverse(node, parent_tree): # if all conditions match, add the element to the list of elements to return if matches_tagname and matches_attrs and matches_text: elements.append(elem) - if return_after_first_match: + if return_after_first_match: # if we're aiming to find a single element, we skip the rest of the code and return elements[elem] containing our target element return elements tasks = [] @@ -639,7 +639,7 @@ async def traverse(node, parent_tree): if iframe_tasks: await asyncio.gather(*iframe_tasks) - return elements if not return_after_first_match else [None] + return elements if not return_after_first_match else [None] # either we return a list of elements if we're trying to find multiple elements, or a list that contains None because find_element_by_tagname_attrs_text needs a return value if no element was found. async def find_element_by_text( self, From 2482848d3b4ebadfa702110f189c8e66f625cf98 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Thu, 6 Feb 2025 19:18:12 +0100 Subject: [PATCH 20/24] bug fixes --- zendriver/core/tab.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 267ddc9d..2543b2e5 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -509,12 +509,9 @@ async def find_element_by_tagname_attrs_text( :rtype: Element | None """ - return await self._find_elements_by_tagname_attrs_text( - tagname = tagname, - attrs = attrs, - text = text, - return_after_first_match = True - )[0] + return (await self._find_elements_by_tagname_attrs_text( + tagname=tagname, attrs=attrs, text=text, return_after_first_match=True + ))[0] async def find_elements_by_tagname_attrs_text( self, @@ -536,18 +533,15 @@ async def find_elements_by_tagname_attrs_text( """ return await self._find_elements_by_tagname_attrs_text( - tagname = tagname, - attrs = attrs, - text = text, - return_after_first_match = False + tagname=tagname, attrs=attrs, text=text, return_after_first_match=False ) - + async def _find_elements_by_tagname_attrs_text( self, tagname: Optional[str] = None, attrs: Optional[dict[str, str]] = None, text: Optional[str] = None, - return_after_first_match: bool = False + return_after_first_match: bool = False, ) -> list[Element]: """ Finds and returns all elements matching the tagname, attributes, and optional innerText. @@ -604,7 +598,7 @@ async def traverse(node, parent_tree): # if all conditions match, add the element to the list of elements to return if matches_tagname and matches_attrs and matches_text: elements.append(elem) - if return_after_first_match: # if we're aiming to find a single element, we skip the rest of the code and return elements[elem] containing our target element + if return_after_first_match: # if we're aiming to find a single element, we skip the rest of the code and return elements[elem] containing our target element return elements tasks = [] @@ -639,7 +633,9 @@ async def traverse(node, parent_tree): if iframe_tasks: await asyncio.gather(*iframe_tasks) - return elements if not return_after_first_match else [None] # either we return a list of elements if we're trying to find multiple elements, or a list that contains None because find_element_by_tagname_attrs_text needs a return value if no element was found. + return ( + elements if not return_after_first_match else [None] + ) # either we return a list of elements if we're trying to find multiple elements, or a list that contains None because find_element_by_tagname_attrs_text needs a return value if no element was found. async def find_element_by_text( self, From 9c6396de00ca4af7068985d4328435be2ff5351c Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Thu, 6 Feb 2025 21:24:53 +0100 Subject: [PATCH 21/24] bug fixes --- zendriver/core/tab.py | 125 ++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 79 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 2543b2e5..c3db24f4 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -213,13 +213,13 @@ async def find( "You must provide either tagname, attrs, or text to find an element." ) - item = await self.find_element_by_tagname_attrs_text( - tagname=tagname, attrs=attrs, text=text + items = await self._find_elements_by_tagname_attrs_text( # items is a list that might contain either a single element if found, or None + tagname=tagname, attrs=attrs, text=text, return_after_first_match=True ) - while not item: + while not items: await self.wait() - item = await self.find_element_by_tagname_attrs_text( - tagname=tagname, attrs=attrs, text=text + items = await self._find_elements_by_tagname_attrs_text( + tagname=tagname, attrs=attrs, text=text, return_after_first_match=True ) if loop.time() - start_time > timeout: raise asyncio.TimeoutError( @@ -227,7 +227,7 @@ async def find( ) await self.sleep(0.5) - return item + return items[0] # returning the first and only element of the list items async def select( self, @@ -296,17 +296,17 @@ async def find_all( "You must provide either tagname, attrs, or text to find elements." ) - items = await self.find_elements_by_tagname_attrs_text( - tagname=tagname, attrs=attrs, text=text + items = await self._find_elements_by_tagname_attrs_text( + tagname=tagname, attrs=attrs, text=text, return_after_first_match=False ) while not items: await self.wait() - items = await self.find_elements_by_tagname_attrs_text( - tagname=tagname, attrs=attrs, text=text + items = await self._find_elements_by_tagname_attrs_text( + tagname=tagname, attrs=attrs, text=text, return_after_first_match=False ) if loop.time() - start_time > timeout: raise asyncio.TimeoutError( - f"Time ran out while waiting for elements with tagname: {tagname}, attributess: {attrs}, text:{text}" + f"Time ran out while waiting for elements with tagname: {tagname}, attributess: {attrs}, text: {text}" ) await self.sleep(0.5) @@ -489,53 +489,6 @@ async def query_selector( return return element.create(node, self, doc) - async def find_element_by_tagname_attrs_text( - self, - tagname: str | None = None, - attrs: dict[str, str] | None = None, - text: str | None = None, - ) -> Element | None: - """ - Finds and returns the first element matching the tagname and attributes. - - :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). Optional. - :type tagname: str | None - :param attrs: A dictionary of attribute-value pairs to match. Optional. - :type attrs: dict[str, str] | None - :param text: The expected text value of the element. Optional. - :type attrs: str | None - - :return: A single element or None if no match is found. - :rtype: Element | None - """ - - return (await self._find_elements_by_tagname_attrs_text( - tagname=tagname, attrs=attrs, text=text, return_after_first_match=True - ))[0] - - async def find_elements_by_tagname_attrs_text( - self, - tagname: Optional[str] = None, - attrs: Optional[dict[str, str]] = None, - text: Optional[str] = None, - ) -> list[Element]: - """ - Finds and returns all elements matching the tagname, attributes, and optional innerText. - - :param tagname: The name of the HTML tag to search for (e.g., 'button', 'input'). Optional. - :type tagname: str | None - :param attrs: A dictionary of attributes and their corresponding values to match. Optional. - :type attrs: dict[str, str] | None - :param text: The expected text value of the element. Optional. - :type attrs: str | None - :return: List of matching elements. - :rtype: list[Element] - """ - - return await self._find_elements_by_tagname_attrs_text( - tagname=tagname, attrs=attrs, text=text, return_after_first_match=False - ) - async def _find_elements_by_tagname_attrs_text( self, tagname: Optional[str] = None, @@ -551,16 +504,22 @@ async def _find_elements_by_tagname_attrs_text( :param attrs: A dictionary of attributes and their corresponding values to match. Optional. :type attrs: dict[str, str] | None :param text: The expected text value of the element. Optional. - :type attrs: str | None - :return: List of matching elements. + :type text: str | None + :param return_after_first_match: If True, stops traversal and returns a list containing only the first matching element. + :type return_after_first_match: bool + :return: List of matching elements. If return_after_first_match is True, the list contains at most one element. :rtype: list[Element] """ elements = [] + stop_searching = False # flag to indicate whether to stop searching async def traverse(node, parent_tree): """Recursive traversal of the DOM and shadow DOM to collect all matching elements.""" - if not node: + + nonlocal stop_searching + + if not node or stop_searching: return # create an element to check for the conditions we're looking for @@ -573,7 +532,7 @@ async def traverse(node, parent_tree): elem.tag_name and tagname.strip().lower() == elem.tag_name.strip().lower() ) - ) # this condition evaluates to True if tagname was not provided; no filtering by tagname. Or if tagname equals our targeted element's tagname + ) # this condition evaluates to True if tagname was not provided; no filtering by tagname. Or if tagname equals our targeted element's tagname matches_attrs = ( not attrs @@ -588,18 +547,23 @@ async def traverse(node, parent_tree): for attr, value in attrs.items() ) ) - ) # this condition evaluates to True if attrs was not provided; no filtering by attrs. Or if the provided attrs are in our targeted element's attributes + ) # this condition evaluates to True if attrs was not provided; no filtering by attrs. Or if the provided attrs are in our targeted element's attributes matches_text = ( not text or (elem.text and text.strip().lower() in elem.text.strip().lower()) - ) # this condition evaluates to True if text was not provided; no filtering by text. Or if text is in our targeted element's text + ) # this condition evaluates to True if text was not provided; no filtering by text. Or if text is in our targeted element's text # if all conditions match, add the element to the list of elements to return if matches_tagname and matches_attrs and matches_text: elements.append(elem) - if return_after_first_match: # if we're aiming to find a single element, we skip the rest of the code and return elements[elem] containing our target element - return elements + if return_after_first_match: # if return_after_first_match is True then we stop searching for other elements after finding one target element + stop_searching = True # set the flag to True to stop further traversal + return + + # if stop_searching is True, skip further traversal + if stop_searching: + return tasks = [] @@ -623,19 +587,22 @@ async def traverse(node, parent_tree): await traverse(doc, doc) # search within iframes concurrently - iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") - iframe_tasks = [ - traverse(iframe.content_document, iframe.content_document) - for iframe in iframes - if iframe.content_document - ] - - if iframe_tasks: - await asyncio.gather(*iframe_tasks) - - return ( - elements if not return_after_first_match else [None] - ) # either we return a list of elements if we're trying to find multiple elements, or a list that contains None because find_element_by_tagname_attrs_text needs a return value if no element was found. + if not stop_searching: # only search iframes if we haven't found a match yet + iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + iframe_tasks = [ + traverse(iframe.content_document, iframe.content_document) + for iframe in iframes + if iframe.content_document + ] + + if iframe_tasks: + await asyncio.gather(*iframe_tasks) + + # return the appropriate result + if return_after_first_match: + return elements[:1] # return a list containing only the first element (or empty list if no match) + else: + return elements # return all matching elements async def find_element_by_text( self, From 594344bde0883437bd815a9b5ec238412dd01ea1 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Thu, 6 Feb 2025 21:28:11 +0100 Subject: [PATCH 22/24] formatting and linting --- zendriver/core/tab.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index c3db24f4..a9322411 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -213,7 +213,7 @@ async def find( "You must provide either tagname, attrs, or text to find an element." ) - items = await self._find_elements_by_tagname_attrs_text( # items is a list that might contain either a single element if found, or None + items = await self._find_elements_by_tagname_attrs_text( # items is a list that might contain either a single element if found, or None tagname=tagname, attrs=attrs, text=text, return_after_first_match=True ) while not items: @@ -227,7 +227,7 @@ async def find( ) await self.sleep(0.5) - return items[0] # returning the first and only element of the list items + return items[0] # returning the first and only element of the list items async def select( self, @@ -532,7 +532,7 @@ async def traverse(node, parent_tree): elem.tag_name and tagname.strip().lower() == elem.tag_name.strip().lower() ) - ) # this condition evaluates to True if tagname was not provided; no filtering by tagname. Or if tagname equals our targeted element's tagname + ) # this condition evaluates to True if tagname was not provided; no filtering by tagname. Or if tagname equals our targeted element's tagname matches_attrs = ( not attrs @@ -547,18 +547,20 @@ async def traverse(node, parent_tree): for attr, value in attrs.items() ) ) - ) # this condition evaluates to True if attrs was not provided; no filtering by attrs. Or if the provided attrs are in our targeted element's attributes + ) # this condition evaluates to True if attrs was not provided; no filtering by attrs. Or if the provided attrs are in our targeted element's attributes matches_text = ( not text or (elem.text and text.strip().lower() in elem.text.strip().lower()) - ) # this condition evaluates to True if text was not provided; no filtering by text. Or if text is in our targeted element's text + ) # this condition evaluates to True if text was not provided; no filtering by text. Or if text is in our targeted element's text # if all conditions match, add the element to the list of elements to return if matches_tagname and matches_attrs and matches_text: elements.append(elem) - if return_after_first_match: # if return_after_first_match is True then we stop searching for other elements after finding one target element - stop_searching = True # set the flag to True to stop further traversal + if return_after_first_match: # if return_after_first_match is True then we stop searching for other elements after finding one target element + stop_searching = ( + True # set the flag to True to stop further traversal + ) return # if stop_searching is True, skip further traversal @@ -587,8 +589,10 @@ async def traverse(node, parent_tree): await traverse(doc, doc) # search within iframes concurrently - if not stop_searching: # only search iframes if we haven't found a match yet - iframes = util.filter_recurse_all(doc, lambda node: node.node_name == "IFRAME") + if not stop_searching: # only search iframes if we haven't found a match yet + iframes = util.filter_recurse_all( + doc, lambda node: node.node_name == "IFRAME" + ) iframe_tasks = [ traverse(iframe.content_document, iframe.content_document) for iframe in iframes @@ -600,9 +604,11 @@ async def traverse(node, parent_tree): # return the appropriate result if return_after_first_match: - return elements[:1] # return a list containing only the first element (or empty list if no match) + return elements[ + :1 + ] # return a list containing only the first element (or empty list if no match) else: - return elements # return all matching elements + return elements # return all matching elements async def find_element_by_text( self, From 99b1d75b2b33b03e6007193953418a07dd4979b4 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Sat, 8 Feb 2025 20:50:19 +0100 Subject: [PATCH 23/24] resolving conflicts --- zendriver/core/tab.py | 259 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 256 insertions(+), 3 deletions(-) diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index a9322411..8cd163b7 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -4,11 +4,12 @@ import datetime import logging import pathlib +import re import typing import urllib.parse import warnings import webbrowser -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union, Literal from .. import cdp from . import element, util @@ -488,7 +489,7 @@ async def query_selector( if not node: return return element.create(node, self, doc) - + async def _find_elements_by_tagname_attrs_text( self, tagname: Optional[str] = None, @@ -705,7 +706,10 @@ async def evaluate( async def js_dumps( self, obj_name: str, return_by_value: Optional[bool] = True - ) -> dict | typing.Tuple[cdp.runtime.RemoteObject, cdp.runtime.ExceptionDetails]: + ) -> ( + dict + | typing.Tuple[cdp.runtime.RemoteObject, cdp.runtime.ExceptionDetails | None] + ): """ dump given js object with its properties and values as a dict @@ -1135,6 +1139,63 @@ async def wait_for( raise asyncio.TimeoutError("Time ran out while waiting.") + async def wait_for_ready_state( + self, + until: Literal["loading", "interactive", "complete"] = "interactive", + timeout: int = 10, + ): + """ + Waits for the page to reach a certain ready state. + + :param until: The ready state to wait for. Can be one of "loading", "interactive", or "complete". + :type until: str + :param timeout: The maximum number of seconds to wait. + :type timeout: int + :raises asyncio.TimeoutError: If the timeout is reached before the ready state is reached. + :return: True if the ready state is reached. + :rtype: bool + """ + loop = asyncio.get_event_loop() + start_time = loop.time() + + while True: + ready_state = await self.evaluate("document.readyState") + if ready_state == until: + return True + + if loop.time() - start_time > timeout: + raise asyncio.TimeoutError( + "time ran out while waiting for load page until %s" % until + ) + + await asyncio.sleep(0.1) + + def expect_request( + self, url_pattern: Union[str, re.Pattern[str]] + ) -> "RequestExpectation": + """ + Creates a request expectation for a specific URL pattern. + + :param url_pattern: The URL pattern to match requests. + :type url_pattern: Union[str, re.Pattern[str]] + :return: A RequestExpectation instance. + :rtype: RequestExpectation + """ + return RequestExpectation(self, url_pattern) + + def expect_response( + self, url_pattern: Union[str, re.Pattern[str]] + ) -> "ResponseExpectation": + """ + Creates a response expectation for a specific URL pattern. + + :param url_pattern: The URL pattern to match responses. + :type url_pattern: Union[str, re.Pattern[str]] + :return: A ResponseExpectation instance. + :rtype: ResponseExpectation + """ + return ResponseExpectation(self, url_pattern) + async def download_file(self, url: str, filename: Optional[PathLike] = None): """ downloads file by given url. @@ -1381,6 +1442,44 @@ async def set_local_storage(self, items: dict): ] ) + async def set_user_agent( + self, + user_agent: str | None = None, + accept_language: str | None = None, + platform: str | None = None, + ) -> None: + """ + Set the user agent, accept language, and platform. + + These correspond to: + - navigator.userAgent + - navigator.language + - navigator.platform + + :param user_agent: user agent string + :type user_agent: str + :param accept_language: accept language string + :type accept_language: str + :param platform: platform string + :type platform: str + :return: + :rtype: + """ + if not user_agent: + user_agent = await self.evaluate("navigator.userAgent") + if not user_agent: + raise ValueError( + "Could not read existing user agent from navigator object" + ) + + await self.send( + cdp.network.set_user_agent_override( + user_agent=user_agent, + accept_language=accept_language, + platform=platform, + ) + ) + def __call__( self, tagname: str | None = None, @@ -1422,3 +1521,157 @@ def __repr__(self): extra = f"[url: {self.target.url}]" s = f"<{type(self).__name__} [{self.target_id}] [{self.type_}] {extra}>" return s + + +class BaseRequestExpectation: + """ + Base class for handling request and response expectations. + + This class provides a context manager to wait for specific network requests and responses + based on a URL pattern. It sets up handlers for request and response events and provides + properties to access the request, response, and response body. + + :param tab: The Tab instance to monitor. + :type tab: Tab + :param url_pattern: The URL pattern to match requests and responses. + :type url_pattern: Union[str, re.Pattern[str]] + """ + + def __init__(self, tab: Tab, url_pattern: Union[str, re.Pattern[str]]): + self.tab = tab + self.url_pattern = url_pattern + self.request_future: asyncio.Future[cdp.network.RequestWillBeSent] = ( + asyncio.Future() + ) + self.response_future: asyncio.Future[cdp.network.ResponseReceived] = ( + asyncio.Future() + ) + self.request_id: Union[cdp.network.RequestId, None] = None + + async def _request_handler(self, event: cdp.network.RequestWillBeSent): + """ + Internal handler for request events. + + :param event: The request event. + :type event: cdp.network.RequestWillBeSent + """ + if re.fullmatch(self.url_pattern, event.request.url): + self._remove_request_handler() + self.request_id = event.request_id + self.request_future.set_result(event) + + async def _response_handler(self, event: cdp.network.ResponseReceived): + """ + Internal handler for response events. + + :param event: The response event. + :type event: cdp.network.ResponseReceived + """ + if event.request_id == self.request_id: + self._remove_response_handler() + self.response_future.set_result(event) + + def _remove_request_handler(self): + """ + Remove the request event handler. + """ + self.tab.remove_handlers(cdp.network.RequestWillBeSent, self._request_handler) + + def _remove_response_handler(self): + """ + Remove the response event handler. + """ + self.tab.remove_handlers(cdp.network.ResponseReceived, self._response_handler) + + async def __aenter__(self): + """ + Enter the context manager, adding request and response handlers. + """ + self.tab.add_handler(cdp.network.RequestWillBeSent, self._request_handler) + self.tab.add_handler(cdp.network.ResponseReceived, self._response_handler) + return self + + async def __aexit__(self, *args): + """ + Exit the context manager, removing request and response handlers. + """ + self._remove_request_handler() + self._remove_response_handler() + + @property + async def request(self): + """ + Get the matched request. + + :return: The matched request. + :rtype: cdp.network.Request + """ + return (await self.request_future).request + + @property + async def response(self): + """ + Get the matched response. + + :return: The matched response. + :rtype: cdp.network.Response + """ + return (await self.response_future).response + + @property + async def response_body(self): + """ + Get the body of the matched response. + + :return: The response body. + :rtype: str + """ + request_id = (await self.request_future).request_id + body = await self.tab.send(cdp.network.get_response_body(request_id=request_id)) + return body + + +class RequestExpectation(BaseRequestExpectation): + """ + Class for handling request expectations. + + This class extends `BaseRequestExpectation` and provides a property to access the matched request. + + :param tab: The Tab instance to monitor. + :type tab: Tab + :param url_pattern: The URL pattern to match requests. + :type url_pattern: Union[str, re.Pattern[str]] + """ + + @property + async def value(self) -> cdp.network.RequestWillBeSent: + """ + Get the matched request event. + + :return: The matched request event. + :rtype: cdp.network.RequestWillBeSent + """ + return await self.request_future + + +class ResponseExpectation(BaseRequestExpectation): + """ + Class for handling response expectations. + + This class extends `BaseRequestExpectation` and provides a property to access the matched response. + + :param tab: The Tab instance to monitor. + :type tab: Tab + :param url_pattern: The URL pattern to match responses. + :type url_pattern: Union[str, re.Pattern[str]] + """ + + @property + async def value(self) -> cdp.network.ResponseReceived: + """ + Get the matched response event. + + :return: The matched response event. + :rtype: cdp.network.ResponseReceived + """ + return await self.response_future From ebf94dc1932585328f76ae2de511edae625384a0 Mon Sep 17 00:00:00 2001 From: hamzaaitbrik Date: Sat, 8 Feb 2025 20:55:16 +0100 Subject: [PATCH 24/24] resolving conflicts, formatting, and linting --- examples/network_monitor.py | 4 ++-- zendriver/core/tab.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/network_monitor.py b/examples/network_monitor.py index 7fb143c1..099fc39f 100644 --- a/examples/network_monitor.py +++ b/examples/network_monitor.py @@ -17,13 +17,13 @@ async def main(): tab = await browser.get("https://www.google.com/?hl=en") - reject_btn = await tab.find("reject all", best_match=True) + reject_btn = await tab.find(text="reject all") await reject_btn.click() search_inp = await tab.select("textarea") await search_inp.send_keys("undetected zendriver") - search_btn = await tab.find("google search", True) + search_btn = await tab.find(text="google search") await search_btn.click() for _ in range(10): diff --git a/zendriver/core/tab.py b/zendriver/core/tab.py index 8cd163b7..3c47f781 100644 --- a/zendriver/core/tab.py +++ b/zendriver/core/tab.py @@ -489,7 +489,7 @@ async def query_selector( if not node: return return element.create(node, self, doc) - + async def _find_elements_by_tagname_attrs_text( self, tagname: Optional[str] = None,