
Commit f530c10 (authored Jan 7, 2025)

feat(connector): add minio file scan type and enhance test (#19950)

1 parent: 5084d92
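
This change lets the `file_scan` table function read from MinIO (or any S3-compatible store) in addition to AWS S3 by threading an explicit `s3_endpoint` through the plan. When the third `file_scan` argument starts with `http`, it is taken as a custom endpoint and the region is hard-coded to `us-east-1` (required by the S3 builder but unused by MinIO); otherwise it is taken as an AWS region and the endpoint is derived as `https://{bucket}.s3.{region}.amazonaws.com`. The endpoint travels from the frontend through `FileScanNode` in the batch-plan proto to the batch executor, and the S3 e2e test now uploads a parquet file to MinIO and scans it back with `file_scan`.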

10 files changed: +102 −22 lines

e2e_test/s3/file_sink.py (+55)
```diff
@@ -62,6 +62,46 @@ def do_test(config, file_num, item_num_per_file, prefix):
     def _table():
         return 's3_test_parquet'
 
+    print("test table function file scan")
+    cur.execute(f'''
+        SELECT
+            id,
+            name,
+            sex,
+            mark,
+            test_int,
+            test_int8,
+            test_uint8,
+            test_uint16,
+            test_uint32,
+            test_uint64,
+            test_float_16,
+            test_real,
+            test_double_precision,
+            test_varchar,
+            test_bytea,
+            test_date,
+            test_time,
+            test_timestamp_s,
+            test_timestamp_ms,
+            test_timestamp_us,
+            test_timestamp_ns,
+            test_timestamptz_s,
+            test_timestamptz_ms,
+            test_timestamptz_us,
+            test_timestamptz_ns
+        FROM file_scan(
+            'parquet',
+            's3',
+            'http://127.0.0.1:9301',
+            'hummockadmin',
+            'hummockadmin',
+            's3://hummock001/test_file_scan/test_file_scan.parquet'
+        );''')
+    result = cur.fetchone()
+    assert result[0] == 0, f'file scan assertion failed: the first column is {result[0]}, expect 0.'
+
+    print("file scan test pass")
     # Execute a SELECT statement
     cur.execute(f'''CREATE TABLE {_table()}(
         id bigint primary key,
@@ -491,6 +531,21 @@ def _assert_greater(field, got, expect):
             _s3(idx),
             _local(idx)
         )
+    # put parquet file to test table function file scan
+    if data:
+        first_file_data = data[0]
+        first_table = pa.Table.from_pandas(pd.DataFrame(first_file_data))
+
+        first_file_name = f"test_file_scan.parquet"
+        first_file_path = f"test_file_scan/{first_file_name}"
+
+        pq.write_table(first_table, "data_0.parquet")
+
+        client.fput_object(
+            "hummock001",
+            first_file_path,
+            "data_0.parquet"
+        )
 
     # do test
     do_test(config, FILE_NUM, ITEM_NUM_PER_FILE, run_id)
```
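
The new test block exercises the feature end to end: the parquet file uploaded to MinIO is read back through `file_scan`, with the MinIO endpoint `http://127.0.0.1:9301` passed as the third argument where an AWS region would normally go. The frontend changes below detect the `http` prefix and treat that argument as a custom endpoint rather than a region.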

proto/batch_plan.proto (+1)

```diff
@@ -99,6 +99,7 @@ message FileScanNode {
   string s3_access_key = 5;
   string s3_secret_key = 6;
   repeated string file_location = 7;
+  string s3_endpoint = 8;
 }
 
 message GcsFileScanNode {
```
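
Since this is proto3, adding `s3_endpoint` as a new field with tag 8 is backward-compatible: plans serialized before this change simply decode it as an empty string.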

src/batch/executors/src/executor/s3_file_scan.rs (+5 −3)

```diff
@@ -20,7 +20,6 @@ use risingwave_connector::source::iceberg::{
     extract_bucket_and_file_name, new_s3_operator, read_parquet_file, FileScanBackend,
 };
 use risingwave_pb::batch_plan::file_scan_node;
-use risingwave_pb::batch_plan::file_scan_node::StorageType;
 use risingwave_pb::batch_plan::plan_node::NodeBody;
 
 use crate::error::BatchError;
@@ -38,6 +37,7 @@ pub struct S3FileScanExecutor {
     s3_region: String,
     s3_access_key: String,
     s3_secret_key: String,
+    s3_endpoint: String,
     batch_size: usize,
     schema: Schema,
     identity: String,
@@ -67,13 +67,15 @@ impl S3FileScanExecutor {
         batch_size: usize,
         schema: Schema,
         identity: String,
+        s3_endpoint: String,
     ) -> Self {
         Self {
             file_format,
             file_location,
             s3_region,
             s3_access_key,
             s3_secret_key,
+            s3_endpoint,
             batch_size,
             schema,
             identity,
@@ -90,6 +92,7 @@ impl S3FileScanExecutor {
             self.s3_access_key.clone(),
             self.s3_secret_key.clone(),
             bucket.clone(),
+            self.s3_endpoint.clone(),
         )?;
         let chunk_stream =
             read_parquet_file(op, file_name, None, None, self.batch_size, 0).await?;
@@ -115,8 +118,6 @@ impl BoxedExecutorBuilder for FileScanExecutorBuilder {
             NodeBody::FileScan
         )?;
 
-        assert_eq!(file_scan_node.storage_type, StorageType::S3 as i32);
-
         Ok(Box::new(S3FileScanExecutor::new(
             match file_scan_node::FileFormat::try_from(file_scan_node.file_format).unwrap() {
                 file_scan_node::FileFormat::Parquet => FileFormat::Parquet,
@@ -129,6 +130,7 @@ impl BoxedExecutorBuilder for FileScanExecutorBuilder {
             source.context().get_config().developer.chunk_size,
             Schema::from_iter(file_scan_node.columns.iter().map(Field::from)),
             source.plan_node().get_identity().clone(),
+            file_scan_node.s3_endpoint.clone(),
         )))
     }
 }
```

src/connector/src/source/iceberg/parquet_file_handler.rs (+9 −9)

```diff
@@ -109,16 +109,16 @@ pub fn new_s3_operator(
     s3_access_key: String,
     s3_secret_key: String,
     bucket: String,
+    s3_endpoint: String,
 ) -> ConnectorResult<Operator> {
-    // Create s3 builder.
-    let mut builder = S3::default().bucket(&bucket).region(&s3_region);
-    builder = builder.secret_access_key(&s3_access_key);
-    builder = builder.secret_access_key(&s3_secret_key);
-    builder = builder.endpoint(&format!(
-        "https://{}.s3.{}.amazonaws.com",
-        bucket, s3_region
-    ));
-
+    let mut builder = S3::default();
+    builder = builder
+        .region(&s3_region)
+        .endpoint(&s3_endpoint)
+        .access_key_id(&s3_access_key)
+        .secret_access_key(&s3_secret_key)
+        .bucket(&bucket)
+        .disable_config_load();
     let op: Operator = Operator::new(builder)?
         .layer(LoggingLayer::default())
         .layer(RetryLayer::default())
```
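
The rewritten builder also fixes a latent bug visible in the removed lines: the credentials were both passed to `secret_access_key`, and `access_key_id` was never set. As a minimal standalone sketch of the new chain, assuming a local MinIO at `http://127.0.0.1:9301` with the bucket and credentials used by the e2e test:

```rust
use opendal::layers::{LoggingLayer, RetryLayer};
use opendal::services::S3;
use opendal::Operator;

/// Builds an OpenDAL S3 operator against a custom endpoint such as MinIO.
/// The endpoint, bucket, and credentials are illustrative values borrowed
/// from the e2e test, not constants in the source.
fn build_minio_operator() -> opendal::Result<Operator> {
    let builder = S3::default()
        .region("us-east-1") // required by the builder, ignored by MinIO
        .endpoint("http://127.0.0.1:9301")
        .access_key_id("hummockadmin")
        .secret_access_key("hummockadmin")
        .bucket("hummock001")
        // Use only the explicit values above; do not pull region or
        // credentials from env vars or AWS config files.
        .disable_config_load();
    Ok(Operator::new(builder)?
        .layer(LoggingLayer::default())
        .layer(RetryLayer::default())
        .finish())
}
```

`disable_config_load` is what keeps ambient AWS configuration (profiles, environment variables) from being loaded on top of the explicit MinIO endpoint.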

src/frontend/src/expr/table_function.rs (+14 −5)

```diff
@@ -159,14 +159,24 @@ impl TableFunction {
         };
         let op = match file_scan_backend {
             FileScanBackend::S3 => {
-                let (bucket, _) =
-                    extract_bucket_and_file_name(&input_file_location, &file_scan_backend)?;
-
+                let (bucket, _) = extract_bucket_and_file_name(
+                    &eval_args[5].clone(),
+                    &file_scan_backend,
+                )?;
+
+                let (s3_region, s3_endpoint) = match eval_args[2].starts_with("http") {
+                    true => ("us-east-1".to_owned(), eval_args[2].clone()), /* for minio, hard code region as not used but needed. */
+                    false => (
+                        eval_args[2].clone(),
+                        format!("https://{}.s3.{}.amazonaws.com", bucket, eval_args[2],),
+                    ),
+                };
                 new_s3_operator(
-                    eval_args[2].clone(),
+                    s3_region.clone(),
                     eval_args[3].clone(),
                     eval_args[4].clone(),
                     bucket.clone(),
+                    s3_endpoint.clone(),
                 )?
             }
             FileScanBackend::Gcs => {
@@ -189,7 +199,6 @@ impl TableFunction {
                     Ok::<Vec<String>, anyhow::Error>(files)
                 })
             })?;
-
             if files.is_empty() {
                 return Err(BindError(
                     "file_scan function only accepts non-empty directory".to_owned(),
```

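The `http`-prefix dispatch above is duplicated in `table_function_to_file_scan_rule.rs` below. A self-contained sketch of the shared logic, with `derive_region_and_endpoint` as a hypothetical helper name (the commit inlines the `match` in both places rather than factoring it out):

```rust
/// Maps the third `file_scan` argument to an (s3_region, s3_endpoint) pair.
/// Mirrors the inlined `match` in table_function.rs and
/// table_function_to_file_scan_rule.rs; the helper itself is hypothetical.
fn derive_region_and_endpoint(arg: &str, bucket: &str) -> (String, String) {
    if arg.starts_with("http") {
        // Custom endpoint (e.g. MinIO): the region is unused but the S3
        // builder requires one, so it is hard-coded.
        ("us-east-1".to_owned(), arg.to_owned())
    } else {
        // AWS: the argument is a region; derive the virtual-hosted endpoint.
        (
            arg.to_owned(),
            format!("https://{}.s3.{}.amazonaws.com", bucket, arg),
        )
    }
}

fn main() {
    assert_eq!(
        derive_region_and_endpoint("http://127.0.0.1:9301", "hummock001"),
        ("us-east-1".to_owned(), "http://127.0.0.1:9301".to_owned())
    );
    assert_eq!(
        derive_region_and_endpoint("us-west-2", "hummock001"),
        (
            "us-west-2".to_owned(),
            "https://hummock001.s3.us-west-2.amazonaws.com".to_owned()
        )
    );
}
```

Note that `starts_with("http")` matches both `http://` and `https://`, so any explicit URL is routed to the custom-endpoint branch and never treated as a region.
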
src/frontend/src/optimizer/plan_node/batch_file_scan.rs (+1)

```diff
@@ -93,6 +93,7 @@ impl ToBatchPb for BatchFileScan {
                 s3_access_key: file_scan.s3_access_key.clone(),
                 s3_secret_key: file_scan.s3_secret_key.clone(),
                 file_location: file_scan.file_location.clone(),
+                s3_endpoint: file_scan.s3_endpoint.clone(),
             }),
             generic::FileScanBackend::GcsFileScan(gcs_file_scan) => {
                 NodeBody::GcsFileScan(GcsFileScanNode {
```

src/frontend/src/optimizer/plan_node/generic/file_scan.rs (+1)

```diff
@@ -92,6 +92,7 @@ pub struct FileScan {
     pub s3_access_key: String,
     pub s3_secret_key: String,
     pub file_location: Vec<String>,
+    pub s3_endpoint: String,
 
     #[educe(PartialEq(ignore))]
     #[educe(Hash(ignore))]
```

src/frontend/src/optimizer/plan_node/logical_file_scan.rs (+4 −2)

```diff
@@ -48,19 +48,22 @@ impl LogicalFileScan {
         s3_access_key: String,
         s3_secret_key: String,
         file_location: Vec<String>,
+        s3_endpoint: String,
     ) -> Self {
         assert!("parquet".eq_ignore_ascii_case(&file_format));
         assert!("s3".eq_ignore_ascii_case(&storage_type));
+        let storage_type = generic::StorageType::S3;
 
         let core = generic::FileScanBackend::FileScan(generic::FileScan {
             schema,
             file_format: generic::FileFormat::Parquet,
-            storage_type: generic::StorageType::S3,
+            storage_type,
             s3_region,
             s3_access_key,
             s3_secret_key,
             file_location,
             ctx,
+            s3_endpoint,
         });
 
         let base = PlanBase::new_logical_with_core(&core);
@@ -89,7 +92,6 @@ impl LogicalFileScan {
         });
 
         let base = PlanBase::new_logical_with_core(&core);
-
         LogicalFileScan { base, core }
     }
 }
```

src/frontend/src/optimizer/rule/table_function_to_file_scan_rule.rs (+12 −2)

```diff
@@ -16,6 +16,7 @@ use itertools::Itertools;
 use risingwave_common::catalog::{Field, Schema};
 use risingwave_common::types::{DataType, ScalarImpl};
 use risingwave_common::util::iter_util::ZipEqDebug;
+use risingwave_connector::source::iceberg::{extract_bucket_and_file_name, FileScanBackend};
 
 use super::{BoxedRule, Rule};
 use crate::expr::{Expr, TableFunctionType};
@@ -63,11 +64,19 @@ impl Rule for TableFunctionToFileScanRule {
             );
 
             if "s3".eq_ignore_ascii_case(&eval_args[1]) {
-                let s3_region = eval_args[2].clone();
                 let s3_access_key = eval_args[3].clone();
                 let s3_secret_key = eval_args[4].clone();
-                // The rest of the arguments are file locations
                 let file_location = eval_args[5..].iter().cloned().collect_vec();
+
+                let (bucket, _) =
+                    extract_bucket_and_file_name(&file_location[0], &FileScanBackend::S3).ok()?;
+                let (s3_region, s3_endpoint) = match eval_args[2].starts_with("http") {
+                    true => ("us-east-1".to_owned(), eval_args[2].clone()), /* for minio, hard code region as not used but needed. */
+                    false => (
+                        eval_args[2].clone(),
+                        format!("https://{}.s3.{}.amazonaws.com", bucket, eval_args[2],),
+                    ),
+                };
                 Some(
                     LogicalFileScan::new_s3_logical_file_scan(
                         logical_table_function.ctx(),
@@ -78,6 +87,7 @@ impl Rule for TableFunctionToFileScanRule {
                         s3_access_key,
                         s3_secret_key,
                         file_location,
+                        s3_endpoint,
                     )
                     .into(),
                 )
```

src/object_store/src/object/opendal_engine/opendal_s3.rs (−1)

```diff
@@ -78,7 +78,6 @@ impl OpendalObjectStore {
             "http://"
         };
         let (address, bucket) = rest.split_once('/').unwrap();
-
         let builder = S3::default()
             .bucket(bucket)
             .region("custom")
```
