Skip to content

Commit

Permalink
Merge pull request #416 from danieldotnl/extract_html
Browse files Browse the repository at this point in the history
New feature to scrape and extract raw html
  • Loading branch information
danieldotnl authored Sep 4, 2024
2 parents d34cb55 + 1cccc6b commit f88e746
Show file tree
Hide file tree
Showing 6 changed files with 146 additions and 22 deletions.
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -189,13 +189,14 @@ multiscrape:

Used to configure scraping options.

| name | description | required | default | type |
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | ------- | --------------- |
| select | CSS selector used for retrieving the value of the attribute. Only required when `select_list` or `value_template` is not provided. | False | | string/template |
| select_list | CSS selector for multiple values of multiple elements which will be returned as csv. Only required when `select` or `value_template` is not provided. | False | | string/template |
| attribute | Attribute from the selected element to read as value. | False | | string |
| value_template | Defines a template applied to extract the value from the result of the selector (if provided) or raw page (if selector not provided) | False | | string/template |
| on_error | See [On-error](#on-error) | False | | |
| name | description | required | default | type |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------- | ------- | --------------- |
| select | CSS selector used for retrieving the value of the attribute. Only required when `select_list` or `value_template` is not provided. | False | | string/template |
| select_list | CSS selector for multiple values of multiple elements which will be returned as csv. Only required when `select` or `value_template` is not provided. | False | | string/template |
| attribute | Attribute from the selected element to read as value. | False | | string |
| value_template | Defines a template applied to extract the value from the result of the selector (if provided) or from the raw page (if no selector is provided). | False | | string/template |
| extract | Determines how the result of the CSS selector is extracted. Only applicable to HTML. `text` returns just the text, `content` returns the inner HTML of the selected tag, and `tag` returns the HTML including the selected tag itself. | False | text | string |
| on_error | See [On-error](#on-error) | False | | |

### On-error

Expand Down
3 changes: 3 additions & 0 deletions custom_components/multiscrape/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@
CONF_FORM_RESUBMIT_ERROR = "resubmit_on_error"
CONF_FORM_VARIABLES = "variables"
CONF_LOG_RESPONSE = "log_response"
# Selector option that chooses how the result of a CSS selector is extracted.
CONF_EXTRACT = "extract"
# Allowed values for CONF_EXTRACT: "text" (text only), "content" (HTML content
# of the selected tag) and "tag" (HTML including the selected tag itself).
EXTRACT_OPTIONS = ["text", "content", "tag"]
DEFAULT_PARSER = "lxml"
# Extraction mode the selector schema applies when CONF_EXTRACT is not set.
DEFAULT_EXTRACT = "text"

CONF_FIELDS = "fields"

Expand Down
14 changes: 8 additions & 6 deletions custom_components/multiscrape/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,20 @@
HTTP_BASIC_AUTHENTICATION,
HTTP_DIGEST_AUTHENTICATION)

from .const import (CONF_ATTR, CONF_FORM_INPUT, CONF_FORM_INPUT_FILTER,
CONF_FORM_RESUBMIT_ERROR, CONF_FORM_SELECT,
CONF_FORM_SUBMIT, CONF_FORM_SUBMIT_ONCE,
from .const import (CONF_ATTR, CONF_EXTRACT, CONF_FORM_INPUT,
CONF_FORM_INPUT_FILTER, CONF_FORM_RESUBMIT_ERROR,
CONF_FORM_SELECT, CONF_FORM_SUBMIT, CONF_FORM_SUBMIT_ONCE,
CONF_FORM_VARIABLES, CONF_LOG_RESPONSE, CONF_ON_ERROR,
CONF_ON_ERROR_DEFAULT, CONF_ON_ERROR_LOG,
CONF_ON_ERROR_VALUE, CONF_ON_ERROR_VALUE_DEFAULT,
CONF_ON_ERROR_VALUE_LAST, CONF_ON_ERROR_VALUE_NONE,
CONF_PARSER, CONF_PICTURE, CONF_SELECT, CONF_SELECT_LIST,
CONF_SENSOR_ATTRS, CONF_SEPARATOR, CONF_STATE_CLASS,
DEFAULT_BINARY_SENSOR_NAME, DEFAULT_BUTTON_NAME,
DEFAULT_FORCE_UPDATE, DEFAULT_METHOD, DEFAULT_PARSER,
DEFAULT_SENSOR_NAME, DEFAULT_SEPARATOR, DEFAULT_VERIFY_SSL,
DOMAIN, LOG_ERROR, LOG_LEVELS, METHODS)
DEFAULT_EXTRACT, DEFAULT_FORCE_UPDATE, DEFAULT_METHOD,
DEFAULT_PARSER, DEFAULT_SENSOR_NAME, DEFAULT_SEPARATOR,
DEFAULT_VERIFY_SSL, DOMAIN, EXTRACT_OPTIONS, LOG_ERROR,
LOG_LEVELS, METHODS)
from .scraper import DEFAULT_TIMEOUT

_LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -84,6 +85,7 @@
vol.Optional(CONF_ATTR): cv.string,
vol.Optional(CONF_VALUE_TEMPLATE): cv.template,
vol.Optional(CONF_ON_ERROR): vol.Schema(ON_ERROR_SCHEMA),
vol.Optional(CONF_EXTRACT, default=DEFAULT_EXTRACT): vol.In(EXTRACT_OPTIONS),
}

# A required name combined with every key of SELECTOR_SCHEMA.
FORM_HEADERS_MAPPING_SCHEMA = {vol.Required(CONF_NAME): cv.string, **SELECTOR_SCHEMA}
Expand Down
19 changes: 14 additions & 5 deletions custom_components/multiscrape/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def scrape(self, selector, sensor, attribute=None, variables: dict = {}):
)
values = [tag[selector.attribute] for tag in tags]
else:
values = [tag.text for tag in tags]
values = [self.extract_tag_value(tag, selector) for tag in tags]
value = self._separator.join(values)
_LOGGER.debug("%s # List selector csv: %s", log_prefix, value)

Expand All @@ -142,10 +142,7 @@ def scrape(self, selector, sensor, attribute=None, variables: dict = {}):
)
value = tag[selector.attribute]
else:
if tag.name in ("style", "script", "template"):
value = tag.string
else:
value = tag.text
value = self.extract_tag_value(tag, selector)
_LOGGER.debug("%s # Selector result: %s", log_prefix, value)

if value is not None and selector.value_template is not None:
Expand All @@ -161,6 +158,18 @@ def scrape(self, selector, sensor, attribute=None, variables: dict = {}):
)
return value

def extract_tag_value(self, tag, selector):
    """Extract the value of a selected tag according to the selector's extract mode.

    Args:
        tag: The selected element; its ``name``, ``string``, ``text`` and
            ``contents`` attributes are read.
        selector: The selector whose ``extract`` attribute picks the mode:
            ``"text"`` (or ``None``), ``"content"`` or ``"tag"``.

    Returns:
        ``tag.string`` for ``style``/``script``/``template`` tags (whatever
        the extract mode is); otherwise the tag's text, the concatenated
        string form of its children, or the string form of the tag itself.

    Raises:
        ValueError: If ``selector.extract`` is not a supported mode.
    """
    # These tags are handled first and always yield their raw string,
    # independent of the configured extract mode.
    if tag.name in ("style", "script", "template"):
        return tag.string

    mode = selector.extract
    # A selector built from a config dict without an "extract" key has
    # extract set to None; treat that as the default "text" mode rather
    # than silently returning None (which would break the list-selector join).
    if mode is None or mode == "text":
        return tag.text
    if mode == "content":
        return "".join(map(str, tag.contents))
    if mode == "tag":
        return str(tag)
    raise ValueError(f"Unsupported extract option: {mode!r}")

async def _async_file_log(self, content_name, content):
try:
filename = f"{content_name}.txt"
Expand Down
9 changes: 5 additions & 4 deletions custom_components/multiscrape/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

from homeassistant.const import CONF_VALUE_TEMPLATE

from .const import (CONF_ATTR, CONF_ON_ERROR, CONF_ON_ERROR_DEFAULT,
CONF_ON_ERROR_LOG, CONF_ON_ERROR_VALUE, CONF_SELECT,
CONF_SELECT_LIST, DEFAULT_ON_ERROR_LOG,
DEFAULT_ON_ERROR_VALUE)
from .const import (CONF_ATTR, CONF_EXTRACT, CONF_ON_ERROR,
CONF_ON_ERROR_DEFAULT, CONF_ON_ERROR_LOG,
CONF_ON_ERROR_VALUE, CONF_SELECT, CONF_SELECT_LIST,
DEFAULT_ON_ERROR_LOG, DEFAULT_ON_ERROR_VALUE)


class Selector:
Expand All @@ -27,6 +27,7 @@ def __init__(self, hass, conf):
if self.value_template and self.value_template.hass is None:
self.value_template.hass = hass

self.extract = conf.get(CONF_EXTRACT)
self.on_error = self.create_on_error(conf.get(CONF_ON_ERROR), hass)

if (
Expand Down
108 changes: 108 additions & 0 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Tests for scraper class."""
from homeassistant.core import HomeAssistant
from homeassistant.helpers.template import Template

from custom_components.multiscrape.const import DEFAULT_SEPARATOR
from custom_components.multiscrape.scraper import Scraper
from custom_components.multiscrape.selector import Selector


# HTML document shared by every test in this module. It contains a version
# card (with a nested ``.links`` div holding one anchor), a ``<template>``
# element and a current-time block.
SAMPLE_HTML = (
    "<div class='current-version material-card text'>"
    "<h1>Current Version: 2024.8.3</h1>Released: <span class='release-date'>January 17, 2022</span>"
    "<div class='links' style='links'><a href='/latest-release-notes/'>Release notes</a>"
    "</div>"
    "</div>"
    "<template>Trying to get</template>"
    "<div class='current-time'>"
    "<h1>Current Time:</h1><span class='utc-time'>2022-12-22T13:15:30Z</span>"
    "</div>"
)


async def _scrape_sample(hass: HomeAssistant, selector_conf: dict):
    """Load SAMPLE_HTML into a fresh lxml scraper and scrape it with selector_conf."""
    scraper = Scraper("test_scraper", hass, None, "lxml", DEFAULT_SEPARATOR)
    await scraper.set_content(SAMPLE_HTML)
    return scraper.scrape(Selector(hass, selector_conf), "test_sensor")


async def test_scrape_extract_text(hass: HomeAssistant) -> None:
    """Test scraping and extract text method."""
    value = await _scrape_sample(
        hass,
        {
            "select": Template(".current-version h1", hass),
            "extract": "text",
        },
    )
    assert value == "Current Version: 2024.8.3"


async def test_scrape_extract_content(hass: HomeAssistant) -> None:
    """Test scraping and extract contents method."""
    value = await _scrape_sample(
        hass,
        {
            "select": Template(".links", hass),
            "extract": "content",
        },
    )
    assert value == '<a href="/latest-release-notes/">Release notes</a>'


async def test_scrape_extract_tag(hass: HomeAssistant) -> None:
    """Test scraping and extract tag method."""
    value = await _scrape_sample(
        hass,
        {
            "select": Template(".links", hass),
            "extract": "tag",
        },
    )
    assert value == '<div class="links" style="links"><a href="/latest-release-notes/">Release notes</a></div>'


async def test_scrape_extract_attribute(hass: HomeAssistant) -> None:
    """Test scraping and extract an HTML attribute value."""
    value = await _scrape_sample(
        hass,
        {
            "select": Template(".links a", hass),
            "attribute": "href",
        },
    )
    assert value == '/latest-release-notes/'




0 comments on commit f88e746

Please sign in to comment.