Skip to content

Commit 778f27e

Browse files
authored
Merge pull request #42 from scrapinghub/url-page-inputs
RequestURL and ResponseURL
2 parents bab41be + 237fab6 commit 778f27e

File tree

11 files changed

+124
-28
lines changed

11 files changed

+124
-28
lines changed

docs/advanced/additional-requests.rst

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,9 @@ a generic HTTP Request: :class:`~.HttpRequest`. Here's an example:
5454
).encode("utf-8"),
5555
)
5656
57-
print(request.url) # https://www.api.example.com/product-pagination/
58-
print(request.method) # POST
57+
print(request.url) # https://www.api.example.com/product-pagination/
58+
print(type(request.url)) # <class 'web_poet.page_inputs.http.RequestUrl'>
59+
print(request.method) # POST
5960
6061
print(type(request.headers)) # <class 'web_poet.page_inputs.HttpRequestHeaders'>
6162
print(request.headers) # <HttpRequestHeaders('Content-Type': 'application/json;charset=UTF-8')>
@@ -67,7 +68,8 @@ a generic HTTP Request: :class:`~.HttpRequest`. Here's an example:
6768
6869
There are a few things to take note of here:
6970
70-
* ``url`` and ``method`` are simply **strings**.
71+
* ``method`` is simply a **string**.
72+
* ``url`` is represented by the :class:`~.RequestUrl` class.
7173
* ``headers`` is represented by the :class:`~.HttpRequestHeaders` class which
7274
resembles a ``dict``-like interface. It supports case-insensitive header-key
7375
lookups as well as multi-key storage.
@@ -90,8 +92,9 @@ it's perfectly fine to define them as:
9092
9193
request = web_poet.HttpRequest("https://api.example.com/product-info?id=123")
9294
93-
print(request.url) # https://api.example.com/product-info?id=123
94-
print(request.method) # GET
95+
print(request.url) # https://api.example.com/product-info?id=123
96+
print(type(request.url)) # <class 'web_poet.page_inputs.http.RequestUrl'>
97+
print(request.method) # GET
9598
9699
print(type(request.headers)) # <class 'web_poet.page_inputs.HttpRequestHeaders'>
97100
print(request.headers) # <HttpRequestHeaders()>
@@ -141,8 +144,8 @@ Let's check out an example to see its internals:
141144
headers={"Content-Type": "application/json;charset=UTF-8"}
142145
)
143146
144-
print(response.url) # https://www.api.example.com/product-pagination/
145-
print(type(response.url)) # <class 'str'>
147+
print(response.url) # https://www.api.example.com/product-pagination/
148+
print(type(response.url)) # <class 'web_poet.page_inputs.http.ResponseUrl'>
146149
147150
print(response.body) # b'{"data": "value \xf0\x9f\x91\x8d"}'
148151
print(type(response.body)) # <class 'web_poet.page_inputs.HttpResponseBody'>
@@ -174,7 +177,8 @@ methods.
174177

175178
Here are the key takeaways from the example above:
176179

177-
* The ``url`` and ``status`` are simply **string** and **int** respectively.
180+
* ``status`` is simply an **int**.
181+
* ``url`` is represented by the :class:`~.ResponseUrl` class.
178182
* ``headers`` is represented by the :class:`~.HttpResponseHeaders` class.
179183
It's similar to :class:`~.HttpRequestHeaders` where it inherits from
180184
:external:py:class:`multidict.CIMultiDict`, granting it case-insensitive

docs/intro/from-ground-up.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -503,7 +503,7 @@ For example, a very basic Page Object could look like this:
503503
504504
def to_item(self) -> dict:
505505
return {
506-
'url': self.response.url,
506+
'url': str(self.response.url),
507507
'title': self.response.css("h1::text").get()
508508
}
509509

tests/test_page_inputs.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
import aiohttp.web_response
44
import pytest
55
import requests
6-
76
import parsel
7+
8+
from web_poet import RequestUrl, ResponseUrl
89
from web_poet.page_inputs import (
910
HttpRequest,
1011
HttpResponse,
@@ -72,7 +73,7 @@ def test_http_defaults(cls, body_cls):
7273
http_body = body_cls(b"content")
7374

7475
obj = cls("url", body=http_body)
75-
assert obj.url == "url"
76+
assert str(obj.url) == "url"
7677
assert obj.body == b"content"
7778
assert not obj.headers
7879
assert obj.headers.get("user-agent") is None
@@ -164,7 +165,8 @@ def test_http_headers_init_dict(cls, headers_cls):
164165

165166
def test_http_request_init_minimal():
166167
req = HttpRequest("url")
167-
assert req.url == "url"
168+
assert str(req.url) == "url"
169+
assert isinstance(req.url, RequestUrl)
168170
assert req.method == "GET"
169171
assert isinstance(req.method, str)
170172
assert not req.headers
@@ -189,12 +191,20 @@ def test_http_request_init_full():
189191
http_body = HttpRequestBody(b"body")
190192
req_2 = HttpRequest("url", method="POST", headers=http_headers, body=http_body)
191193

192-
assert req_1.url == req_2.url
194+
assert str(req_1.url) == str(req_2.url)
193195
assert req_1.method == req_2.method
194196
assert req_1.headers == req_2.headers
195197
assert req_1.body == req_2.body
196198

197199

200+
def test_http_request_init_with_response_url():
201+
resp = HttpResponse("url", b"")
202+
assert isinstance(resp.url, ResponseUrl)
203+
req = HttpRequest(resp.url)
204+
assert isinstance(req.url, RequestUrl)
205+
assert str(req.url) == str(resp.url)
206+
207+
198208
def test_http_response_headers_from_bytes_dict():
199209
raw_headers = {
200210
b"Content-Length": [b"316"],

tests/test_requests.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def async_mock():
1717
"""Workaround since python 3.7 doesn't ship with asyncmock."""
1818

1919
async def async_test(req):
20-
return HttpResponse(req.url, body=b"")
20+
return HttpResponse(str(req.url), body=b"")
2121

2222
mock.MagicMock.__await__ = lambda x: async_test(x).__await__()
2323

@@ -37,7 +37,7 @@ async def test_perform_request_from_httpclient(async_mock):
3737
response = await client.get(url)
3838

3939
# The async downloader implementation should return the HttpResponse
40-
assert response.url == url
40+
assert str(response.url) == str(url)
4141
assert isinstance(response, HttpResponse)
4242

4343

@@ -47,15 +47,15 @@ async def test_http_client_single_requests(async_mock):
4747

4848
with mock.patch("web_poet.page_inputs.client.HttpRequest") as mock_request:
4949
response = await client.request("url")
50-
response.url == "url"
50+
str(response.url) == "url"
5151

5252
response = await client.get("url-get", headers={"X-Headers": "123"})
53-
response.url == "url-get"
53+
str(response.url) == "url-get"
5454

5555
response = await client.post(
5656
"url-post", headers={"X-Headers": "123"}, body=b"body value"
5757
)
58-
response.url == "url-post"
58+
str(response.url) == "url-post"
5959

6060
assert mock_request.call_args_list == [
6161
mock.call(
@@ -162,7 +162,7 @@ async def test_http_client_execute(async_mock):
162162
response = await client.execute(request)
163163

164164
assert isinstance(response, HttpResponse)
165-
assert response.url == "url-1"
165+
assert str(response.url) == "url-1"
166166

167167

168168
@pytest.mark.asyncio

tests/test_url.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import pytest
2+
3+
from web_poet._base import _Url
4+
from web_poet import RequestUrl, ResponseUrl
5+
6+
7+
def test_url_base_class():
8+
url_str = "http://example.com"
9+
url = _Url(url_str)
10+
assert str(url) == url_str
11+
assert repr(url) == "_Url('http://example.com')"
12+
13+
14+
def test_url_init_validation():
15+
with pytest.raises(TypeError):
16+
_Url(123)
17+
18+
19+
def test_url_subclasses():
20+
url_str = "http://example.com"
21+
22+
class MyUrl(_Url):
23+
pass
24+
25+
class MyUrl2(_Url):
26+
pass
27+
28+
url = MyUrl(url_str)
29+
assert str(url) == url_str
30+
assert url._url == url_str
31+
assert repr(url) == "MyUrl('http://example.com')"
32+
33+
url2 = MyUrl2(url)
34+
assert str(url2) == str(url)
35+
36+
37+
@pytest.mark.parametrize('url_cls', [_Url, RequestUrl, ResponseUrl])
38+
def test_str_equality(url_cls):
39+
url_str = "http://example.com#foo"
40+
url = url_cls(url_str)
41+
assert url != url_str
42+
assert str(url) == url_str
43+
44+
45+
def test_url_classes_eq():
46+
url_str = "http://example.com#foo"
47+
request_url = RequestUrl(url_str)
48+
response_url = ResponseUrl(url_str)
49+
50+
assert request_url != response_url
51+
assert str(request_url) == str(response_url)

web_poet/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
HttpRequestBody,
1111
HttpResponseBody,
1212
PageParams,
13+
RequestUrl,
14+
ResponseUrl,
1315
)
1416
from .overrides import PageObjectRegistry, consume_modules, OverrideRule
1517

web_poet/_base.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"""
55

66

7-
from typing import Type, TypeVar, List, Dict
7+
from typing import Type, TypeVar, List, Dict, Union
88

99
from multidict import CIMultiDict
1010

@@ -32,3 +32,19 @@ def from_name_value_pairs(cls: Type[T_headers], arg: List[Dict]) -> T_headers:
3232
<_HttpHeaders('Content-Encoding': 'gzip', 'content-length': '648')>
3333
"""
3434
return cls([(pair["name"], pair["value"]) for pair in arg])
35+
36+
37+
class _Url:
38+
""" Base URL class.
39+
"""
40+
def __init__(self, url: Union[str, '_Url']):
41+
if not isinstance(url, (str, _Url)):
42+
raise TypeError(f"`url` must be a str or an instance of _Url, "
43+
f"got {url.__class__} instance instead")
44+
self._url = str(url)
45+
46+
def __str__(self) -> str:
47+
return self._url
48+
49+
def __repr__(self) -> str:
50+
return f"{self.__class__.__name__}({self._url!r})"

web_poet/mixins.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ class ResponseShortcutsMixin(SelectableMixin):
5050

5151
@property
5252
def url(self):
53-
"""Shortcut to HTML Response's URL."""
54-
return self.response.url
53+
"""Shortcut to HTML Response's URL, as a string."""
54+
return str(self.response.url)
5555

5656
@property
5757
def html(self):

web_poet/page_inputs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,7 @@
77
HttpResponseHeaders,
88
HttpRequestBody,
99
HttpResponseBody,
10+
RequestUrl,
11+
ResponseUrl
1012
)
1113
from .browser import BrowserHtml

web_poet/page_inputs/client.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
)
2323
from web_poet.exceptions import RequestBackendError, HttpResponseError
2424
from web_poet.utils import as_list
25+
from web_poet._base import _Url
2526

2627
logger = logging.getLogger(__name__)
2728

@@ -77,7 +78,7 @@ def _handle_status(
7778

7879
async def request(
7980
self,
80-
url: str,
81+
url: Union[str, _Url],
8182
*,
8283
method: str = "GET",
8384
headers: Optional[_Headers] = None,
@@ -115,7 +116,7 @@ async def request(
115116

116117
async def get(
117118
self,
118-
url: str,
119+
url: Union[str, _Url],
119120
*,
120121
headers: Optional[_Headers] = None,
121122
allow_status: List[_Status] = None,
@@ -132,7 +133,7 @@ async def get(
132133

133134
async def post(
134135
self,
135-
url: str,
136+
url: Union[str, _Url],
136137
*,
137138
headers: Optional[_Headers] = None,
138139
body: Optional[_Body] = None,

0 commit comments

Comments
 (0)