Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 97bf587

Browse files
committedDec 28, 2023
Performance Improvements
Makes the following changes: * Changes stripe size unit to kB, with a minimum of 128kB. Default is still 1024kB. This allows us to be able to experiment with a wider range of stripe sizes. * Implements copy_on_read=false. Previously we also fetched stripes for reads. In github runners use case, this seemed to become a performance bottleneck because of amplified I/O. A typical Ubicloud CI workflow, the bdev receives requests to read 2.9M 512-byte blocks and to write 3.7M blocks, that is about 3.3GB of actual I/O if no stripe fetch happens. If we fetch stripes only for writes, 2200 stripe fetches happen, which amounts to about +4.4G more traffic (since each stripe is read, and then written). If we also fetch stripes for reads, it adds about 2100 more stripe fetches (these are because blocks which are read from, but never written to). So, with copy_on_read=false the total actual I/O will be 7.7G, with copy_on_write=true it will be about 11.9G, or about 54% more. * As subsequence of the previous point, noticed we're not using the `IORING_SETUP_SQPOLL` flag of io_uring properly, so removed it for now. * Uses O_DIRECT when opening the image file to bypass host OS page cache. This is the typical choice for other file-based SPDK bdevs. * Guest OS already does buffer page caching, and having an extra layer of cache in our bdev won't benefit much.
1 parent a31db59 commit 97bf587

12 files changed

+248
-120
lines changed
 

‎.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
build/*
2+
.vscode/*
3+

‎.vscode/c_cpp_properties.json

-17
This file was deleted.

‎Makefile

+8-6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Source files and object files
22
SRC_DIR := src
3+
APP_DIR := app
34
OBJ_DIR := build/obj
45
BIN_DIR := build/bin
56

@@ -13,20 +14,17 @@ SPDK_DPDK_LIB := $(filter-out -lrte_net,$(SPDK_DPDK_LIB))
1314
SPDK_DPDK_LIB += -lrte_net
1415

1516
# Compiler and linker flags
16-
CFLAGS := -Iinclude -Wall -I$(SPDK_PATH)/include
17+
CFLAGS := -D_GNU_SOURCE -Iinclude -Wall -g -O3 -I$(SPDK_PATH)/include
1718
LDFLAGS := -Wl,--whole-archive,-Bstatic $(SPDK_DPDK_LIB) -Wl,--no-whole-archive -luring -Wl,-Bdynamic $(SYS_LIB)
1819

1920
# Automatically generate a list of source files (.c) and object files (.o)
2021
SRCS := $(wildcard $(SRC_DIR)/*.c)
2122
OBJS := $(SRCS:$(SRC_DIR)/%.c=$(OBJ_DIR)/%.o)
2223

23-
# Name of the final executable
24-
TARGET := $(BIN_DIR)/vhost_ubi
25-
26-
all: $(TARGET)
24+
all: $(BIN_DIR)/vhost_ubi
2725

2826
# Link object files to create the final executable, and place it in build/bin
29-
$(TARGET): $(OBJS)
27+
$(BIN_DIR)/vhost_ubi: $(OBJS) $(OBJ_DIR)/vhost_ubi.o
3028
@mkdir -p $(BIN_DIR)
3129
$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)
3230

@@ -35,6 +33,10 @@ $(OBJ_DIR)/%.o: $(SRC_DIR)/%.c
3533
@mkdir -p $(OBJ_DIR)
3634
$(CC) $(CFLAGS) -c $< -o $@
3735

36+
$(OBJ_DIR)/%.o: $(APP_DIR)/%.c
37+
@mkdir -p $(OBJ_DIR)
38+
$(CC) $(CFLAGS) -c $< -o $@
39+
3840
# Clean up build artifacts
3941
clean:
4042
@rm -rf $(OBJ_DIR) $(BIN_DIR)

‎README.md

+8-4
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,11 @@ Parameters:
120120
* `name` (text, required): Name of the bdev to be created.
121121
* `image_path` (text, required): Path to the image file.
122122
* `base_bdev` (text, required): Name of base bdev.
123-
* `stripe_size_mb` (integer, optional): Stripe size in megabytes. Defaults to 1.
123+
* `stripe_size_kb` (integer, required): Stripe size in kibibytes.
124+
* `no_sync` (boolean, optional): Ignore sync requests. Defaults to false.
125+
* `copy_on_read` (boolean, optional): Fetch stripes for reads. Defaults to true.
126+
* `directio` (boolean, optional): Use O_DIRECT when opening the image file.
127+
Defaults to true;
124128

125129
**Note.** When creating the bdev for the first time, magic bits in the metadata
126130
section of base image should be zeroed. For unencrypted base bdev, truncate
@@ -141,9 +145,9 @@ First 8MB of base bdev is reserved for metadata. Metadata consists of:
141145
* Magic bytes (9 bytes): `BDEV_UBI\0`
142146
* Metadata version major (2 bytes)
143147
* Metadata version minor (2 bytes)
144-
* stripe_size_mb (1 byte)
145-
* Stripe headers: 4 byte per stripes. Currently it specifies whether a stripe
146-
has been fetched from image or not. 31-bits are reserved for future extension.
148+
* stripe_size_kb (1 byte)
149+
* Stripe headers: 2 byte per stripes. Currently it specifies whether a stripe
150+
has been fetched from image or not. 15-bits are reserved for future extension.
147151
* Padding to make the total size 8MB.
148152

149153
Then at the 8MB offset the actual disk data starts.

‎src/vhost_ubi.c ‎app/vhost_ubi.c

File renamed without changes.

‎examples/spdk_conf.json

+3-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
"name": "ubi0",
1919
"base_bdev": "aio0",
2020
"image_path": "jammy.raw",
21-
"stripe_size_mb": 1
21+
"stripe_size_kb": 1024,
22+
"copy_on_read": false,
23+
"directio": true
2224
}
2325
}
2426
]

‎include/bdev_ubi.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
#include "spdk/stdinc.h"
55

6-
#define DEFAULT_STRIPE_SIZE_MB 1
6+
#define DEFAULT_STRIPE_SIZE_KB 1024
77

88
typedef void (*spdk_delete_ubi_complete)(void *cb_arg, int bdeverrno);
99

@@ -17,8 +17,10 @@ struct spdk_ubi_bdev_opts {
1717
const char *name;
1818
const char *image_path;
1919
const char *base_bdev_name;
20-
uint32_t stripe_size_mb;
20+
uint32_t stripe_size_kb;
2121
bool no_sync;
22+
bool copy_on_read;
23+
bool directio;
2224
};
2325

2426
struct ubi_create_context {

‎include/bdev_ubi_internal.h

+50-23
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@
1515

1616
#define UBI_METADATA_SIZE 8388608
1717

18-
// support images upto 1TB = 2^40 (assuming 1MB stripe size)
19-
#define UBI_MAX_STRIPES (1024 * 1024)
20-
#define UBI_STRIPE_SIZE_MAX 8
18+
// support images upto 2TB = 2^40 (assuming 1MB stripe size)
19+
#define UBI_MAX_STRIPES (2 * 1024 * 1024)
20+
#define UBI_STRIPE_SIZE_MIN 64
21+
#define UBI_STRIPE_SIZE_MAX 1024
2122

2223
#define UBI_PATH_LEN 1024
2324

@@ -27,7 +28,10 @@
2728
#define UBI_VERSION_MINOR 1
2829

2930
#define UBI_MAX_ACTIVE_STRIPE_FETCHES 8
31+
#define UBI_MAX_CONCURRENT_READS 24
3032
#define UBI_FETCH_QUEUE_SIZE 32768
33+
// UBI_URING_QUEUE_SIZE = UBI_MAX_ACTIVE_STRIPE_FETCHES + UBI_MAX_CONCURRENT_READS
34+
#define UBI_URING_QUEUE_SIZE 32
3135

3236
/*
3337
* On-disk metadata for a ubi bdev.
@@ -39,16 +43,16 @@ struct ubi_metadata {
3943
uint8_t versionMajor[2];
4044
uint8_t versionMinor[2];
4145

42-
uint8_t stripe_size_mb;
46+
uint8_t stripe_size_kb;
4347

4448
/*
45-
* Currently stripe_headers[i] will be either 0 or 1, but reserve 31 more
49+
* Currently stripe_headers[i] will be either 0 or 1, but reserve 16 more
4650
* bits per stripe for future extension.
4751
*/
48-
uint8_t stripe_headers[UBI_MAX_STRIPES][4];
52+
uint8_t stripe_headers[UBI_MAX_STRIPES][2];
4953

5054
/* Unused space reserved for future extension. */
51-
uint8_t padding[UBI_METADATA_SIZE - UBI_MAGIC_SIZE - UBI_MAX_STRIPES * 4 - 5];
55+
uint8_t padding[UBI_METADATA_SIZE - UBI_MAGIC_SIZE - UBI_MAX_STRIPES * 2 - 5];
5256
};
5357

5458
/*
@@ -80,12 +84,15 @@ struct ubi_bdev {
8084
struct ubi_base_bdev_info base_bdev_info;
8185

8286
char image_path[UBI_PATH_LEN];
83-
uint32_t stripe_size_mb;
87+
uint32_t stripe_size_kb;
8488
uint32_t stripe_block_count;
8589
uint32_t stripe_shift;
8690
uint32_t data_offset_blocks;
8791
uint64_t image_block_count;
92+
uint32_t alignment_bytes;
8893
bool no_sync;
94+
bool copy_on_read;
95+
bool directio;
8996

9097
enum stripe_status stripe_status[UBI_MAX_STRIPES];
9198

@@ -103,18 +110,42 @@ struct ubi_bdev {
103110
TAILQ_ENTRY(ubi_bdev) tailq;
104111
};
105112

113+
enum ubi_io_type { UBI_BDEV_IO, UBI_STRIPE_FETCH };
114+
115+
struct ubi_io_op {
116+
enum ubi_io_type type;
117+
};
118+
119+
/*
120+
* per I/O operation state.
121+
*/
122+
struct ubi_bdev_io {
123+
struct ubi_io_op op;
124+
125+
struct ubi_bdev *ubi_bdev;
126+
struct ubi_io_channel *ubi_ch;
127+
128+
uint64_t block_offset;
129+
uint64_t block_count;
130+
131+
uint64_t stripes_fetched;
132+
};
133+
106134
/*
107135
* State for a stripe fetch operation.
108136
*/
109137
struct stripe_fetch {
138+
struct ubi_io_op op;
139+
110140
/* Is this currently used for an ongoing stripe fetch? */
111141
bool active;
112142

113143
/* Which stripe are we fetching? */
114144
uint32_t stripe_idx;
115145

116146
/* Where will the data be stored at? */
117-
uint8_t buf[1024 * 1024 * UBI_STRIPE_SIZE_MAX];
147+
uint8_t buf[1024 * UBI_STRIPE_SIZE_MAX + 8192];
148+
uint8_t *buf_aligned;
118149

119150
struct ubi_bdev *ubi_bdev;
120151
};
@@ -127,6 +158,12 @@ struct ubi_io_channel {
127158
struct spdk_poller *poller;
128159
struct spdk_io_channel *base_channel;
129160

161+
uint64_t blocks_read;
162+
uint64_t blocks_written;
163+
uint64_t stripes_fetched;
164+
165+
uint64_t active_reads;
166+
130167
/*
131168
* Stripe fetches are initially queued in "stripe_fetch_queue". During each
132169
* I/O poller iteration, if there's an available spot in stripe_fetches, a
@@ -143,31 +180,21 @@ struct ubi_io_channel {
143180
/* io_uring stuff */
144181
int image_file_fd;
145182
struct io_uring image_file_ring;
183+
int has_reads;
184+
int wait_cycles;
146185

147186
/* queue pointer */
148187
TAILQ_HEAD(, spdk_bdev_io) io;
149188
};
150189

151-
/*
152-
* per I/O operation state.
153-
*/
154-
struct ubi_bdev_io {
155-
struct ubi_bdev *ubi_bdev;
156-
struct ubi_io_channel *ubi_ch;
157-
158-
uint64_t block_offset;
159-
uint64_t block_count;
160-
161-
uint64_t stripes_fetched;
162-
};
163-
164190
/* bdev_ubi_flush.c */
165191
void ubi_submit_flush_request(struct ubi_bdev_io *ubi_io);
166192

167193
/* bdev_ubi_stripe.c */
168194
void ubi_start_fetch_stripe(struct ubi_io_channel *base_ch,
169195
struct stripe_fetch *stripe_fetch);
170-
int ubi_complete_fetch_stripe(struct ubi_io_channel *ch);
196+
int ubi_complete_fetch_stripe(struct ubi_io_channel *ch,
197+
struct stripe_fetch *stripe_fetch, int res);
171198
void enqueue_stripe(struct ubi_io_channel *ch, int stripe_idx);
172199
int dequeue_stripe(struct ubi_io_channel *ch);
173200
bool stripe_queue_empty(struct ubi_io_channel *ch);

‎src/bdev_ubi.c

+21-12
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "bdev_ubi.h"
22
#include "bdev_ubi_internal.h"
33

4+
#include "spdk/fd.h"
45
#include "spdk/likely.h"
56
#include "spdk/log.h"
67

@@ -139,12 +140,15 @@ void bdev_ubi_create(const struct spdk_ubi_bdev_opts *opts,
139140
* Initialize variables that determine the layout of both metadata and
140141
* actual data on base bdev.
141142
*
142-
* TODO: if this is not a new disk, we should read stripe_size_mb from
143+
* TODO: if this is not a new disk, we should read stripe_size_kb from
143144
* metadata.
144145
*/
145-
ubi_bdev->stripe_size_mb = opts->stripe_size_mb;
146+
ubi_bdev->stripe_size_kb = opts->stripe_size_kb;
146147
ubi_bdev->no_sync = opts->no_sync;
147148

149+
ubi_bdev->copy_on_read = opts->copy_on_read;
150+
ubi_bdev->directio = opts->directio;
151+
148152
strncpy(ubi_bdev->image_path, opts->image_path, UBI_PATH_LEN);
149153
ubi_bdev->image_path[UBI_PATH_LEN - 1] = 0;
150154

@@ -168,6 +172,9 @@ void bdev_ubi_create(const struct spdk_ubi_bdev_opts *opts,
168172
ubi_bdev->bdev.optimal_io_boundary = ubi_bdev->stripe_block_count;
169173
ubi_bdev->bdev.split_on_optimal_io_boundary = true;
170174

175+
ubi_bdev->alignment_bytes = 4096;
176+
ubi_bdev->bdev.required_alignment = spdk_u32log2(ubi_bdev->alignment_bytes);
177+
171178
spdk_io_device_register(ubi_bdev, ubi_create_channel_cb, ubi_destroy_channel_cb,
172179
sizeof(struct ubi_io_channel), ubi_bdev->bdev.name);
173180
context->registerd = true;
@@ -255,7 +262,7 @@ static bool ubi_new_disk(const uint8_t *magic) {
255262
static void ubi_init_metadata(struct ubi_bdev *ubi_bdev) {
256263
memcpy(ubi_bdev->metadata.magic, UBI_MAGIC, UBI_MAGIC_SIZE);
257264
ubi_set_version(&ubi_bdev->metadata, UBI_VERSION_MAJOR, UBI_VERSION_MINOR);
258-
ubi_bdev->metadata.stripe_size_mb = ubi_bdev->stripe_size_mb;
265+
ubi_bdev->metadata.stripe_size_kb = ubi_bdev->stripe_size_kb;
259266
}
260267

261268
/*
@@ -314,18 +321,19 @@ static int ubi_init_layout_params(struct ubi_bdev *ubi_bdev) {
314321
return -EINVAL;
315322
}
316323

317-
if (ubi_bdev->stripe_size_mb < 1 || ubi_bdev->stripe_size_mb > UBI_STRIPE_SIZE_MAX) {
318-
UBI_ERRLOG(ubi_bdev, "stripe_size_mb must be between 1 and %d (inclusive)\n",
319-
UBI_STRIPE_SIZE_MAX);
324+
if (ubi_bdev->stripe_size_kb < UBI_STRIPE_SIZE_MIN ||
325+
ubi_bdev->stripe_size_kb > UBI_STRIPE_SIZE_MAX) {
326+
UBI_ERRLOG(ubi_bdev, "stripe_size_kb must be between %d and %d (inclusive)\n",
327+
UBI_STRIPE_SIZE_MIN, UBI_STRIPE_SIZE_MAX);
320328
return -EINVAL;
321329
}
322330

323-
if (ubi_bdev->stripe_size_mb & (ubi_bdev->stripe_size_mb - 1)) {
324-
UBI_ERRLOG(ubi_bdev, "stripe_size_mb must be a power of 2\n");
331+
if (ubi_bdev->stripe_size_kb & (ubi_bdev->stripe_size_kb - 1)) {
332+
UBI_ERRLOG(ubi_bdev, "stripe_size_kb must be a power of 2\n");
325333
return -EINVAL;
326334
}
327335

328-
uint32_t stripSizeBytes = ubi_bdev->stripe_size_mb * 1024 * 1024;
336+
uint32_t stripSizeBytes = ubi_bdev->stripe_size_kb * 1024;
329337
if (stripSizeBytes < blocklen) {
330338
UBI_ERRLOG(ubi_bdev,
331339
"stripe size (%u bytes) can't be less than base bdev's "
@@ -425,7 +433,7 @@ static void ubi_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write
425433
spdk_json_write_named_string(w, "name", bdev->name);
426434
spdk_json_write_named_string(w, "base_bdev", ubi_bdev->base_bdev_info.bdev->name);
427435
spdk_json_write_named_string(w, "image_path", ubi_bdev->image_path);
428-
spdk_json_write_named_uint32(w, "stripe_size_mb", ubi_bdev->stripe_size_mb);
436+
spdk_json_write_named_uint32(w, "stripe_size_kb", ubi_bdev->stripe_size_kb);
429437
spdk_json_write_object_end(w);
430438

431439
spdk_json_write_object_end(w);
@@ -477,9 +485,10 @@ static bool ubi_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) {
477485
static void ubi_submit_request(struct spdk_io_channel *_ch,
478486
struct spdk_bdev_io *bdev_io) {
479487
struct ubi_io_channel *ch = spdk_io_channel_get_ctx(_ch);
488+
struct ubi_bdev *ubi_bdev = bdev_io->bdev->ctxt;
480489

481-
if (bdev_io->type != SPDK_BDEV_IO_TYPE_FLUSH) {
482-
struct ubi_bdev *ubi_bdev = bdev_io->bdev->ctxt;
490+
if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
491+
(bdev_io->type == SPDK_BDEV_IO_TYPE_READ && ubi_bdev->copy_on_read)) {
483492

484493
uint64_t start_block = bdev_io->u.bdev.offset_blocks;
485494
uint64_t num_blocks = bdev_io->u.bdev.num_blocks;

‎src/bdev_ubi_io_channel.c

+120-21
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,15 @@
88
* Static function forward declarations
99
*/
1010
static int ubi_io_poll(void *arg);
11+
static int ubi_complete_image_io(struct ubi_io_channel *ch);
12+
static int ubi_complete_read_from_image(struct ubi_io_channel *ch,
13+
struct ubi_bdev_io *ubi_io, int res);
1114
static void get_buf_for_read_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
1215
bool success);
1316
static int ubi_submit_read_request(struct ubi_bdev_io *ubi_io);
1417
static int ubi_submit_write_request(struct ubi_bdev_io *ubi_io);
15-
static void ubi_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
18+
static void ubi_io_completion_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
19+
static void ubi_complete_io(struct ubi_bdev_io *ubi_io, bool success);
1620
static void ubi_init_ext_io_opts(struct spdk_bdev_io *bdev_io,
1721
struct spdk_bdev_ext_io_opts *opts);
1822

@@ -39,7 +43,12 @@ int ubi_create_channel_cb(void *io_device, void *ctx_buf) {
3943
ch->stripe_fetches[i].ubi_bdev = ubi_bdev;
4044
}
4145

42-
ch->image_file_fd = open(ubi_bdev->image_path, O_RDONLY);
46+
int open_flags = O_RDONLY;
47+
if (ubi_bdev->directio)
48+
open_flags |= O_DIRECT;
49+
else
50+
SPDK_WARNLOG("No direct io");
51+
ch->image_file_fd = open(ubi_bdev->image_path, open_flags);
4352
if (ch->image_file_fd < 0) {
4453
UBI_ERRLOG(ubi_bdev, "could not open %s: %s\n", ubi_bdev->image_path,
4554
strerror(errno));
@@ -48,11 +57,7 @@ int ubi_create_channel_cb(void *io_device, void *ctx_buf) {
4857

4958
struct io_uring_params io_uring_params;
5059
memset(&io_uring_params, 0, sizeof(io_uring_params));
51-
io_uring_params.flags |= IORING_SETUP_SQPOLL;
52-
io_uring_params.sq_thread_idle = 2000;
53-
54-
int rc = io_uring_queue_init_params(UBI_MAX_ACTIVE_STRIPE_FETCHES,
55-
&ch->image_file_ring, &io_uring_params);
60+
int rc = io_uring_queue_init(UBI_URING_QUEUE_SIZE, &ch->image_file_ring, 0);
5661
if (rc != 0) {
5762
UBI_ERRLOG(ubi_bdev, "Unable to setup io_uring: %s\n", strerror(-rc));
5863
return -EINVAL;
@@ -72,6 +77,11 @@ void ubi_destroy_channel_cb(void *io_device, void *ctx_buf) {
7277
UBI_ERRLOG(ch->ubi_bdev, "Error closing file: %s\n", strerror(errno));
7378
}
7479

80+
SPDK_NOTICELOG(
81+
"stats for %s: blocks read: %ld, blocks written: %ld, stripes_fetched: %ld\n",
82+
ch->ubi_bdev->bdev.name, ch->blocks_read, ch->blocks_written,
83+
ch->stripes_fetched);
84+
7585
io_uring_queue_exit(&ch->image_file_ring);
7686
spdk_put_io_channel(ch->base_channel);
7787
}
@@ -85,9 +95,9 @@ static int ubi_io_poll(void *arg) {
8595
struct ubi_bdev *ubi_bdev = ch->ubi_bdev;
8696

8797
bool queues_empty = TAILQ_EMPTY(&ch->io) && stripe_queue_empty(ch);
88-
int fetches_completed = ubi_complete_fetch_stripe(ch);
98+
int image_ios_completed = ubi_complete_image_io(ch);
8999

90-
if (queues_empty && fetches_completed < 1) {
100+
if (queues_empty && image_ios_completed < 1) {
91101
return SPDK_POLLER_IDLE;
92102
}
93103

@@ -122,7 +132,9 @@ static int ubi_io_poll(void *arg) {
122132

123133
stripe_fetch->stripe_idx = stripe_idx;
124134
stripe_fetch->active = true;
135+
stripe_fetch->op.type = UBI_STRIPE_FETCH;
125136
ubi_start_fetch_stripe(ch, stripe_fetch);
137+
ch->stripes_fetched++;
126138

127139
free_stripe_fetch_idx++;
128140
}
@@ -133,6 +145,10 @@ static int ubi_io_poll(void *arg) {
133145
while (!TAILQ_EMPTY(&ch->io)) {
134146
bdev_io = TAILQ_FIRST(&ch->io);
135147

148+
if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
149+
ch->active_reads > UBI_MAX_CONCURRENT_READS)
150+
break;
151+
136152
uint64_t start_block = bdev_io->u.bdev.offset_blocks;
137153
if (bdev_io->type != SPDK_BDEV_IO_TYPE_FLUSH &&
138154
start_block < ubi_bdev->image_block_count) {
@@ -154,7 +170,9 @@ static int ubi_io_poll(void *arg) {
154170
* order they were received.
155171
*/
156172
break;
157-
} else if (stripe_status == STRIPE_NOT_FETCHED) {
173+
} else if (stripe_status == STRIPE_NOT_FETCHED &&
174+
(bdev_io->type != SPDK_BDEV_IO_TYPE_READ ||
175+
ubi_bdev->copy_on_read)) {
158176
/*
159177
* This is a programming error. Such a scenario should never
160178
* arise. If an I/O request from the base image was enqueued,
@@ -183,14 +201,18 @@ static int ubi_io_poll(void *arg) {
183201
ubi_io->ubi_ch = ch;
184202
ubi_io->block_offset = bdev_io->u.bdev.offset_blocks;
185203
ubi_io->block_count = bdev_io->u.bdev.num_blocks;
204+
ubi_io->op.type = UBI_BDEV_IO;
186205

187206
switch (bdev_io->type) {
188207
case SPDK_BDEV_IO_TYPE_READ: {
189-
int len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
208+
ch->active_reads++;
209+
ch->blocks_read += ubi_io->block_count;
210+
uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
190211
spdk_bdev_io_get_buf(bdev_io, get_buf_for_read_cb, len);
191212
break;
192213
}
193214
case SPDK_BDEV_IO_TYPE_WRITE:
215+
ch->blocks_written += ubi_io->block_count;
194216
ubi_submit_write_request(ubi_io);
195217
break;
196218
case SPDK_BDEV_IO_TYPE_FLUSH:
@@ -202,26 +224,94 @@ static int ubi_io_poll(void *arg) {
202224
}
203225
}
204226

227+
if (ch->has_reads) {
228+
int ret = io_uring_submit(&ch->image_file_ring);
229+
if (ret < 0) {
230+
UBI_ERRLOG(ubi_bdev, "io_uring_submit failed: %d\n", ret);
231+
} else {
232+
ch->has_reads = 0;
233+
}
234+
}
235+
205236
return SPDK_POLLER_BUSY;
206237
}
207238

239+
static int ubi_complete_image_io(struct ubi_io_channel *ch) {
240+
struct io_uring *ring = &ch->image_file_ring;
241+
struct io_uring_cqe *cqe[64];
242+
243+
int ret = io_uring_peek_batch_cqe(ring, cqe, 64);
244+
if (ret == -EAGAIN) {
245+
return 0;
246+
} else if (ret < 0) {
247+
UBI_ERRLOG(ch->ubi_bdev, "io_uring_peek_cqe: %s\n", strerror(-ret));
248+
return -1;
249+
}
250+
251+
for (int i = 0; i < ret; i++) {
252+
struct ubi_io_op *op = io_uring_cqe_get_data(cqe[i]);
253+
254+
switch (op->type) {
255+
case UBI_STRIPE_FETCH:
256+
ret = ubi_complete_fetch_stripe(ch, (struct stripe_fetch *)op, cqe[i]->res);
257+
break;
258+
case UBI_BDEV_IO:
259+
ret = ubi_complete_read_from_image(ch, (struct ubi_bdev_io *)op, cqe[i]->res);
260+
break;
261+
}
262+
263+
/* Mark the completion as seen. */
264+
io_uring_cqe_seen(ring, cqe[i]);
265+
}
266+
267+
return ret;
268+
}
269+
208270
/*
209271
* get_buf_for_read_cb
210272
*/
211273
static void get_buf_for_read_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
212274
bool success) {
213275
struct ubi_bdev_io *ubi_io = (struct ubi_bdev_io *)bdev_io->driver_ctx;
276+
struct ubi_bdev *ubi_bdev = ubi_io->ubi_bdev;
214277

215278
if (!success) {
216-
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
279+
ubi_complete_io(ubi_io, false);
217280
return;
218281
}
219282

220-
int ret = ubi_submit_read_request(ubi_io);
283+
uint64_t start_block = bdev_io->u.bdev.offset_blocks;
284+
uint64_t stripe = start_block >> ubi_bdev->stripe_shift;
285+
enum stripe_status stripe_status = ubi_get_stripe_status(ubi_bdev, stripe);
286+
287+
if (stripe_status == STRIPE_FETCHED || start_block >= ubi_bdev->image_block_count) {
288+
int ret = ubi_submit_read_request(ubi_io);
289+
290+
if (spdk_unlikely(ret != 0)) {
291+
ubi_complete_io(ubi_io, false);
292+
}
293+
} else {
294+
// read from base image.
295+
struct ubi_io_channel *ubi_ch = ubi_io->ubi_ch;
296+
struct io_uring *ring = &ubi_ch->image_file_ring;
297+
struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
298+
uint64_t offset = start_block * ubi_bdev->bdev.blocklen;
299+
io_uring_prep_readv(sqe, ubi_ch->image_file_fd, bdev_io->u.bdev.iovs,
300+
bdev_io->u.bdev.iovcnt, offset);
301+
io_uring_sqe_set_data(sqe, ubi_io);
302+
ubi_ch->has_reads++;
303+
}
304+
}
221305

222-
if (spdk_unlikely(ret != 0)) {
223-
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
306+
static int ubi_complete_read_from_image(struct ubi_io_channel *ch,
307+
struct ubi_bdev_io *ubi_io, int res) {
308+
if (res < 0) {
309+
ubi_complete_io(ubi_io, false);
310+
return -1;
224311
}
312+
313+
ubi_complete_io(ubi_io, true);
314+
return 1;
225315
}
226316

227317
/*
@@ -243,7 +333,7 @@ static int ubi_submit_read_request(struct ubi_bdev_io *ubi_io) {
243333
uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
244334
int ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, bdev_io->u.bdev.iovs,
245335
bdev_io->u.bdev.iovcnt, start_block, num_blocks,
246-
ubi_io_completion, ubi_io, &io_opts);
336+
ubi_io_completion_cb, ubi_io, &io_opts);
247337
return ret;
248338
}
249339

@@ -267,19 +357,28 @@ static int ubi_submit_write_request(struct ubi_bdev_io *ubi_io) {
267357
struct spdk_io_channel *base_ch = ubi_io->ubi_ch->base_channel;
268358
int ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, bdev_io->u.bdev.iovs,
269359
bdev_io->u.bdev.iovcnt, start_block, num_blocks,
270-
ubi_io_completion, ubi_io, &io_opts);
360+
ubi_io_completion_cb, ubi_io, &io_opts);
271361
return ret;
272362
}
273363

274364
/*
275-
* ubi_io_completion cleans up and marks the I/O request as completed.
365+
* ubi_io_completion_cb cleans up and marks the I/O request as completed.
276366
*/
277-
static void ubi_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) {
367+
static void ubi_io_completion_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) {
368+
spdk_bdev_free_io(bdev_io);
369+
278370
struct ubi_bdev_io *ubi_io = cb_arg;
371+
ubi_complete_io(ubi_io, success);
372+
}
279373

280-
spdk_bdev_free_io(bdev_io);
374+
static void ubi_complete_io(struct ubi_bdev_io *ubi_io, bool success)
375+
{
376+
struct spdk_bdev_io * bdev_io = spdk_bdev_io_from_ctx(ubi_io);
377+
378+
if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ)
379+
ubi_io->ubi_ch->active_reads--;
281380

282-
spdk_bdev_io_complete(spdk_bdev_io_from_ctx(ubi_io),
381+
spdk_bdev_io_complete(bdev_io,
283382
success ? SPDK_BDEV_IO_STATUS_SUCCESS
284383
: SPDK_BDEV_IO_STATUS_FAILED);
285384
}

‎src/bdev_ubi_rpc.c

+15-7
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@ struct rpc_construct_ubi {
1010
char *name;
1111
char *image_path;
1212
char *base_bdev_name;
13-
uint32_t stripe_size_mb;
13+
uint32_t stripe_size_kb;
1414
bool no_sync;
15+
bool copy_on_read;
16+
bool directio;
1517
};
1618

1719
static void free_rpc_construct_ubi(struct rpc_construct_ubi *req) {
@@ -26,10 +28,13 @@ static const struct spdk_json_object_decoder rpc_construct_ubi_decoders[] = {
2628
spdk_json_decode_string},
2729
{"base_bdev", offsetof(struct rpc_construct_ubi, base_bdev_name),
2830
spdk_json_decode_string},
29-
{"stripe_size_mb", offsetof(struct rpc_construct_ubi, stripe_size_mb),
30-
spdk_json_decode_uint32, true},
31-
{"no_sync", offsetof(struct rpc_construct_ubi, no_sync), spdk_json_decode_bool,
32-
true}};
31+
{"stripe_size_kb", offsetof(struct rpc_construct_ubi, stripe_size_kb),
32+
spdk_json_decode_uint32},
33+
{"no_sync", offsetof(struct rpc_construct_ubi, no_sync), spdk_json_decode_bool, true},
34+
{"copy_on_read", offsetof(struct rpc_construct_ubi, copy_on_read),
35+
spdk_json_decode_bool, true},
36+
{"directio", offsetof(struct rpc_construct_ubi, directio),
37+
spdk_json_decode_bool, true}};
3338

3439
static void bdev_ubi_create_done(void *cb_arg, struct spdk_bdev *bdev, int status) {
3540
struct spdk_jsonrpc_request *request = cb_arg;
@@ -55,8 +60,9 @@ static void rpc_bdev_ubi_create(struct spdk_jsonrpc_request *request,
5560

5661
// set optional parameters. spdk_json_decode_object will overwrite if
5762
// provided.
58-
req.stripe_size_mb = DEFAULT_STRIPE_SIZE_MB;
5963
req.no_sync = false;
64+
req.copy_on_read = true;
65+
req.directio = true;
6066

6167
if (spdk_json_decode_object(params, rpc_construct_ubi_decoders,
6268
SPDK_COUNTOF(rpc_construct_ubi_decoders), &req)) {
@@ -69,8 +75,10 @@ static void rpc_bdev_ubi_create(struct spdk_jsonrpc_request *request,
6975
opts.name = req.name;
7076
opts.image_path = req.image_path;
7177
opts.base_bdev_name = req.base_bdev_name;
72-
opts.stripe_size_mb = req.stripe_size_mb;
78+
opts.stripe_size_kb = req.stripe_size_kb;
7379
opts.no_sync = req.no_sync;
80+
opts.copy_on_read = req.copy_on_read;
81+
opts.directio = req.directio;
7482

7583
struct ubi_create_context *context = calloc(1, sizeof(struct ubi_create_context));
7684
context->done_fn = bdev_ubi_create_done;

‎src/bdev_ubi_stripe.c

+17-27
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,16 @@ void ubi_start_fetch_stripe(struct ubi_io_channel *ch,
1717
struct io_uring *ring = &ch->image_file_ring;
1818
struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
1919
uint32_t stripe_idx = stripe_fetch->stripe_idx;
20+
uint32_t alignment = ubi_bdev->alignment_bytes;
2021

21-
uint64_t offset = ubi_bdev->stripe_size_mb * 1024L * 1024L * stripe_idx;
22-
uint32_t nbytes = ubi_bdev->stripe_size_mb * 1024L * 1024L;
22+
uint64_t offset = ubi_bdev->stripe_size_kb * 1024L * stripe_idx;
23+
uint32_t nbytes = ubi_bdev->stripe_size_kb * 1024L;
2324

24-
io_uring_prep_read(sqe, ch->image_file_fd, stripe_fetch->buf, nbytes, offset);
25+
uint32_t alignment_offset =
26+
alignment - ((uint64_t)stripe_fetch->buf & (alignment - 1));
27+
stripe_fetch->buf_aligned = stripe_fetch->buf + alignment_offset;
28+
29+
io_uring_prep_read(sqe, ch->image_file_fd, stripe_fetch->buf_aligned, nbytes, offset);
2530
io_uring_sqe_set_data(sqe, stripe_fetch);
2631

2732
int ret = io_uring_submit(ring);
@@ -32,44 +37,29 @@ void ubi_start_fetch_stripe(struct ubi_io_channel *ch,
3237
}
3338
}
3439

35-
int ubi_complete_fetch_stripe(struct ubi_io_channel *ch) {
36-
struct io_uring *ring = &ch->image_file_ring;
37-
struct io_uring_cqe *cqe;
40+
int ubi_complete_fetch_stripe(struct ubi_io_channel *ch,
41+
struct stripe_fetch *stripe_fetch, int res) {
42+
uint64_t offset = ch->ubi_bdev->stripe_size_kb * 1024L * stripe_fetch->stripe_idx;
43+
uint32_t nbytes = ch->ubi_bdev->stripe_size_kb * 1024L;
3844

39-
int ret = io_uring_peek_cqe(ring, &cqe);
40-
if (ret == -EAGAIN) {
41-
return 0;
42-
} else if (ret != 0) {
43-
UBI_ERRLOG(ch->ubi_bdev, "io_uring_peek_cqe: %s\n", strerror(-ret));
44-
return -1;
45-
}
46-
47-
struct stripe_fetch *stripe_fetch = io_uring_cqe_get_data(cqe);
48-
uint64_t offset =
49-
ch->ubi_bdev->stripe_size_mb * 1024L * 1024L * stripe_fetch->stripe_idx;
50-
uint32_t nbytes = ch->ubi_bdev->stripe_size_mb * 1024L * 1024L;
51-
52-
if (cqe->res < 0) {
45+
if (res < 0) {
5346
UBI_ERRLOG(ch->ubi_bdev,
5447
"fetching stripe %d failed while checking cqe->res: %s\n",
55-
stripe_fetch->stripe_idx, strerror(-cqe->res));
48+
stripe_fetch->stripe_idx, strerror(-res));
5649
ubi_fail_stripe_fetch(stripe_fetch);
5750
return -1;
5851
}
5952

60-
/* Mark the completion as seen. */
61-
io_uring_cqe_seen(ring, cqe);
62-
6353
/*
6454
* Now that we have read the stripe and have it in memory, write it to the
6555
* base bdev.
6656
*/
6757
struct ubi_bdev *ubi_bdev = ch->ubi_bdev;
6858
struct ubi_base_bdev_info *base_info = &ubi_bdev->base_bdev_info;
6959

70-
ret = spdk_bdev_write(base_info->desc, ch->base_channel, stripe_fetch->buf,
71-
offset + UBI_METADATA_SIZE, nbytes, write_stripe_io_completion,
72-
stripe_fetch);
60+
int ret = spdk_bdev_write(base_info->desc, ch->base_channel,
61+
stripe_fetch->buf_aligned, offset + UBI_METADATA_SIZE,
62+
nbytes, write_stripe_io_completion, stripe_fetch);
7363
if (ret != 0) {
7464
UBI_ERRLOG(ch->ubi_bdev, "fetching stripe %d failed, spdk_bdev_write error: %s\n",
7565
stripe_fetch->stripe_idx, strerror(-ret));

0 commit comments

Comments
 (0)
Please sign in to comment.