-
Notifications
You must be signed in to change notification settings - Fork 110
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add sessions to trustless gateways #459
Changes from 15 commits
546bf66
5836dcd
3d9cb49
39b6f7a
d1228b9
32713ab
0c28d66
d0610c9
9aeded1
8e6051a
ca1c458
d7d6334
8c42ec5
8997d41
60365d2
ac06ff6
263bc1c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import cors from 'cors' | ||
import polka from 'polka' | ||
|
||
/** @type {import('aegir').PartialOptions} */ | ||
const options = { | ||
test: { | ||
async before (options) { | ||
const server = polka({ | ||
port: 0, | ||
host: '127.0.0.1' | ||
}) | ||
server.use(cors()) | ||
server.all('/ipfs/bafkreiefnkxuhnq3536qo2i2w3tazvifek4mbbzb6zlq3ouhprjce5c3aq', (req, res) => { | ||
res.writeHead(200, { | ||
'content-type': 'application/octet-stream' | ||
}) | ||
res.end(Uint8Array.from([0, 1, 2, 0])) | ||
}) | ||
|
||
await server.listen() | ||
const { port } = server.server.address() | ||
|
||
return { | ||
server, | ||
env: { | ||
TRUSTLESS_GATEWAY: `http://127.0.0.1:${port}` | ||
} | ||
} | ||
}, | ||
async after (options, before) { | ||
await before.server.server.close() | ||
} | ||
} | ||
} | ||
|
||
export default options |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,49 @@ | ||
import { DEFAULT_SESSION_MIN_PROVIDERS, DEFAULT_SESSION_MAX_PROVIDERS, DEFAULT_SESSION_PROVIDER_QUERY_CONCURRENCY, DEFAULT_SESSION_PROVIDER_QUERY_TIMEOUT } from '@helia/interface' | ||
import { PeerQueue } from '@libp2p/utils/peer-queue' | ||
import { multiaddrToUri } from '@multiformats/multiaddr-to-uri' | ||
import pDefer from 'p-defer' | ||
import { TrustlessGateway } from './trustless-gateway.js' | ||
import { DEFAULT_TRUSTLESS_GATEWAYS } from './index.js' | ||
import type { TrustlessGatewayBlockBrokerInit, TrustlessGatewayComponents, TrustlessGatewayGetBlockProgressEvents } from './index.js' | ||
import type { BlockRetrievalOptions, BlockRetriever } from '@helia/interface/blocks' | ||
import type { Routing, BlockRetrievalOptions, BlockBroker, CreateSessionOptions } from '@helia/interface' | ||
import type { Logger } from '@libp2p/interface' | ||
import type { CID } from 'multiformats/cid' | ||
import type { ProgressOptions } from 'progress-events' | ||
|
||
export interface CreateTrustlessGatewaySessionOptions extends CreateSessionOptions<TrustlessGatewayGetBlockProgressEvents> { | ||
/** | ||
* Specify the cache control header to send to the remote. 'only-if-cached' | ||
* will prevent the gateway from fetching the content if they don't have it. | ||
* | ||
* @default only-if-cached | ||
*/ | ||
cacheControl?: string | ||
} | ||
|
||
/** | ||
* A class that accepts a list of trustless gateways that are queried | ||
* for blocks. | ||
*/ | ||
export class TrustlessGatewayBlockBroker implements BlockRetriever< | ||
ProgressOptions<TrustlessGatewayGetBlockProgressEvents> | ||
> { | ||
export class TrustlessGatewayBlockBroker implements BlockBroker<TrustlessGatewayGetBlockProgressEvents> { | ||
private readonly components: TrustlessGatewayComponents | ||
private readonly gateways: TrustlessGateway[] | ||
private readonly routing: Routing | ||
private readonly log: Logger | ||
|
||
constructor (components: TrustlessGatewayComponents, init: TrustlessGatewayBlockBrokerInit = {}) { | ||
this.components = components | ||
this.log = components.logger.forComponent('helia:trustless-gateway-block-broker') | ||
this.routing = components.routing | ||
this.gateways = (init.gateways ?? DEFAULT_TRUSTLESS_GATEWAYS) | ||
.map((gatewayOrUrl) => { | ||
return new TrustlessGateway(gatewayOrUrl) | ||
return new TrustlessGateway(gatewayOrUrl, components.logger) | ||
}) | ||
} | ||
|
||
async retrieve (cid: CID, options: BlockRetrievalOptions<ProgressOptions<TrustlessGatewayGetBlockProgressEvents>> = {}): Promise<Uint8Array> { | ||
addGateway (gatewayOrUrl: string): void { | ||
this.gateways.push(new TrustlessGateway(gatewayOrUrl, this.components.logger)) | ||
} | ||
|
||
async retrieve (cid: CID, options: BlockRetrievalOptions<TrustlessGatewayGetBlockProgressEvents> = {}): Promise<Uint8Array> { | ||
// Loop through the gateways until we get a block or run out of gateways | ||
// TODO: switch to toSorted when support is better | ||
const sortedGateways = this.gateways.sort((a, b) => b.reliability() - a.reliability()) | ||
|
@@ -41,7 +60,7 @@ | |
this.log.error('failed to validate block for %c from %s', cid, gateway.url, err) | ||
gateway.incrementInvalidBlocks() | ||
|
||
throw new Error(`unable to validate block for CID ${cid} from gateway ${gateway.url}`) | ||
throw new Error(`Block for CID ${cid} from gateway ${gateway.url} failed validation`) | ||
} | ||
|
||
return block | ||
|
@@ -50,7 +69,7 @@ | |
if (err instanceof Error) { | ||
aggregateErrors.push(err) | ||
} else { | ||
aggregateErrors.push(new Error(`unable to fetch raw block for CID ${cid} from gateway ${gateway.url}`)) | ||
aggregateErrors.push(new Error(`Unable to fetch raw block for CID ${cid} from gateway ${gateway.url}`)) | ||
} | ||
// if signal was aborted, exit the loop | ||
if (options.signal?.aborted === true) { | ||
|
@@ -60,6 +79,85 @@ | |
} | ||
} | ||
|
||
throw new AggregateError(aggregateErrors, `unable to fetch raw block for CID ${cid} from any gateway`) | ||
if (aggregateErrors.length > 0) { | ||
throw new AggregateError(aggregateErrors, `Unable to fetch raw block for CID ${cid} from any gateway`) | ||
} else { | ||
throw new Error(`Unable to fetch raw block for CID ${cid} from any gateway`) | ||
} | ||
} | ||
|
||
async createSession (root: CID, options: CreateTrustlessGatewaySessionOptions = {}): Promise<BlockBroker<TrustlessGatewayGetBlockProgressEvents>> { | ||
const gateways: string[] = [] | ||
const minProviders = options.minProviders ?? DEFAULT_SESSION_MIN_PROVIDERS | ||
const maxProviders = options.minProviders ?? DEFAULT_SESSION_MAX_PROVIDERS | ||
const deferred = pDefer<BlockBroker<TrustlessGatewayGetBlockProgressEvents>>() | ||
const broker = new TrustlessGatewayBlockBroker(this.components, { | ||
gateways | ||
}) | ||
|
||
this.log('finding transport-ipfs-gateway-http providers for cid %c', root) | ||
|
||
const queue = new PeerQueue({ | ||
concurrency: options.providerQueryConcurrency ?? DEFAULT_SESSION_PROVIDER_QUERY_CONCURRENCY | ||
}) | ||
|
||
Promise.resolve().then(async () => { | ||
for await (const provider of this.routing.findProviders(root, options)) { | ||
if (provider.protocols == null || !provider.protocols.includes('transport-ipfs-gateway-http')) { | ||
continue | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💭 I worry that as things are today, filtering by AFAIK it is something invented by IPNI at cid.contact for Rhea/Saturn last year to allow nft.storage to avoid bitswap bills. It only works with IPNI PUTs and was not designed with p2p in mind, Amino DHT peers who in the future will announce @achingbrain It may be more future-proof to look at all results, and keep ones that have If we look at There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is quite sad-face-making. I hadn't realised that
As I read the above, the HTTP transport being added to Kubo is what will cause HTTP multiaddrs to appear. Unless I'm missing something it doesn't mean the peer is running a path or subdomain gateway. That is, the remote could have an HTTP transport, but also have the gateway config disabled; just the presence of the transport isn't enough to say one way or the other. It seems like the only way is to make a request and see what you get back? This could be a recipe for DDoSing if you created nodes that respond to provider queries for certain CIDs? I think a possible solution here might be for the peer to indicate its capabilities in its signed peer record? The HTTP routers could return the peer record in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean FWIW, if it is the case that it was added exclusively for nft.storage then I wasn't aware of that. As far as I knew the trustless gateway was already a concept, and we implemented it for Rhea when we were asked. I think there was probably a cost saving for us there due to where we were hosting our bitswap peer at the time, but that's not the case now, and shouldn't have been a convincing argument then since infra costs and locations can easily be changed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @alanshaw HTTP is good! Trustless gateways as HTTP transport are good, and .storage implementing and exposing it as an alternative to bitswap is great for the ecosystem, especially our browser work (ipfs-chromium, service worker, verified-fetch, Brave etc). We do want to keep it! 
❤️ The only problem is that IPNI implementation informed how HTTP-only provider is represented on The Since then, I've been trying to find a way for @achingbrain yes, looking for FYSA there are proposals of more declarative ways of discovering if HTTP port supports IPFS Gateway functionality, but they are no better than the above HTTP probe.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My point is that if you send me a request, and I send you a response knowing it will trigger you to send a request to a specific third party, that is an attack vector. If I send you a single response with 10x provider records, and you make 10x requests, that is a vector for an amplification attack because I have got you to do more work than the work it took me to get you to do the work, though hopefully it's not all bad because the provider hosts should have to be distinct. As long as we recognise this and are ok with it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a great catch, @achingbrain. We should absolutely have some way to know the returned providers are valid IPFS nodes. I guess as long as we don't modify the delegated routers from a set of known good actors it shouldn't be an issue, but that's centralizing on known delegated providers. I imagine we would want any peer to be able to provide routing responses in the future though (ambient peer discovery?) where this could become a huge problem. I believe the amplification attack was discussed there as well. If we get a list of providers from some delegated-router and all those providers turn out bad, we could at least mark that delegated-router as a bad actor and limit any future requests to it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
True, although to some extent this is true regardless. There is currently no validation anywhere in use (e.g. DHT, IPNI, etc.) that a peer advertising To be fair it is likely more work to setup HTTPS for a bad request than trying to establish just a libp2p connection and on the libp2p side we can be somewhat protected by the peerIDs/public keys (e.g. the public key can sign what's a valid advertisement with its key which will cause the security negotiation to fail otherwise ... although that will result in a wasted dial which is a separate issue in and of itself). While we're here there's a slightly larger attack surface here for "webseeds-like" behavior (e.g. pointing at
Hopefully as explained above this isn't a reasonable thing to do unless there's actually responsibility on the underlying routing system to do these checks (or some delegated routers are allowed to advertise via something like HTTP OPTIONS or .well-known to do extra work you can punish them for not doing correctly). Mostly what you're left with to punish them on is do they give about the same results as other delegated-routers (for the same underlying routing systems) or as querying the underlying systems themselves (likely requires something similar to ipfs/specs#388). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
So this is true, just like filtering for
For trustless-gateway over libp2p .well-known can be used and it's fine. For plain trustless-gateway there's currently no way to advertise that in the Amino DHT because the mappings are multihash -> peerID. This is very similar (although the underlying use case wasn't exactly the same) as the problem expressed here libp2p/notes#11 and ipld/ipld#57 (comment) (note: that's from 2019 and my understanding of the situation has improved a bit since then 🙃 which is part of why there's more detail in the message above than there). AFAIK this would require a protocol change to support with Amino. |
||
} | ||
|
||
this.log('found transport-ipfs-gateway-http provider %p for cid %c', provider.id, root) | ||
|
||
void queue.add(async () => { | ||
for (const ma of provider.multiaddrs) { | ||
let uri: string | undefined | ||
|
||
try { | ||
// /ip4/x.x.x.x/tcp/31337/http | ||
// /ip4/x.x.x.x/tcp/31337/https | ||
// etc | ||
uri = multiaddrToUri(ma) | ||
|
||
const resource = `${uri}/ipfs/${root.toString()}?format=raw` | ||
|
||
// make sure the peer is available - HEAD support doesn't seem to | ||
// be very widely implemented so as long as the remote responds | ||
// we are happy they are valid | ||
// https://specs.ipfs.tech/http-gateways/trustless-gateway/#head-ipfs-cid-path-params | ||
const response = await fetch(resource, { | ||
method: 'HEAD', | ||
headers: { | ||
Accept: 'application/vnd.ipld.raw', | ||
'Cache-Control': options.cacheControl ?? 'only-if-cached' | ||
}, | ||
signal: AbortSignal.timeout(options.providerQueryTimeout ?? DEFAULT_SESSION_PROVIDER_QUERY_TIMEOUT) | ||
}) | ||
|
||
this.log('HEAD %s %d', resource, response.status) | ||
gateways.push(uri) | ||
broker.addGateway(uri) | ||
|
||
this.log('found %d transport-ipfs-gateway-http providers for cid %c', gateways.length, root) | ||
|
||
if (gateways.length === minProviders) { | ||
deferred.resolve(broker) | ||
} | ||
|
||
if (gateways.length === maxProviders) { | ||
queue.clear() | ||
} | ||
} catch (err: any) { | ||
this.log.error('could not fetch %c from %a', root, uri ?? ma, err) | ||
} | ||
} | ||
}) | ||
} | ||
}) | ||
.catch(err => { | ||
this.log.error('error creating session for %c', root, err) | ||
}) | ||
|
||
return deferred.promise | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm worried about defaulting to asking gateways to only return content they have. Will read more in the PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is to stop them doing what gateways do, e.g. fetch content on your behalf. Otherwise it defeats the purpose of having a session.