Skip to content

Commit

Permalink
8021 ARC buf data scatter-ization
Browse files Browse the repository at this point in the history
Reviewed by: Matthew Ahrens mahrens@delphix.com
Reviewed by: George Wilson george.wilson@delphix.com
Reviewed by: Paul Dagnelie pcd@delphix.com
Reviewed by: John Kennedy john.kennedy@delphix.com
Reviewed by: Prakash Surya prakash.surya@delphix.com
Reviewed by: Prashanth Sreenivasa pks@delphix.com
Reviewed by: Pavel Zakharov pavel.zakharov@delphix.com
Reviewed by: Chris Williamson chris.williamson@delphix.com

The ARC buf data project (known simply as "ABD" since its genesis in the
ZoL community) changes the way the ARC allocates b_pdata memory
from using linear void * buffers to using scatter/gather lists of
fixed-size 4KB chunks. This improves ZFS's performance by helping to
defragment the address space occupied by the ARC, in particular for
cases where compressed ARC is enabled. It could also ease future work to
allocate pages directly from segkpm for minimal-overhead memory
allocations, bypassing the kmem subsystem.

This is essentially the same change as the one which recently landed in
ZFS on Linux, although they made some platform-specific changes while
adapting this work to their codebase:

 - Implemented the equivalent of the segkpm suggestion for future work
   mentioned above to bypass issues that they've had with the Linux kernel
   memory allocator.
 - Changed the internal representation of the ABD's scatter/gather list so
   it could be used to pass I/O directly into Linux block device drivers.
   (This feature is not available in the illumos block device interface
   yet.)

openzfs/zfs@7657def

Closes openzfs#326
  • Loading branch information
dankimmel authored and ahrens committed Apr 21, 2017
1 parent 9c599cd commit 2c9df17
Show file tree
Hide file tree
Showing 56 changed files with 2,770 additions and 818 deletions.
1 change: 0 additions & 1 deletion usr/src/cmd/mdb/common/modules/zfs/zfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -3553,7 +3553,6 @@ typedef struct mdb_arc_buf_hdr_t {
struct {
uint32_t b_bufcnt;
uintptr_t b_state;
uintptr_t b_pdata;
} b_l1hdr;
} mdb_arc_buf_hdr_t;

Expand Down
47 changes: 28 additions & 19 deletions usr/src/cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <zfs_comutil.h>
#undef verify
#include <libzfs.h>
Expand Down Expand Up @@ -2537,7 +2538,7 @@ zdb_blkptr_done(zio_t *zio)
zdb_cb_t *zcb = zio->io_private;
zbookmark_phys_t *zb = &zio->io_bookmark;

zio_data_buf_free(zio->io_data, zio->io_size);
abd_free(zio->io_abd);

mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
Expand Down Expand Up @@ -2603,7 +2604,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (!BP_IS_EMBEDDED(bp) &&
(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size);
abd_t *abd = abd_alloc(size, B_FALSE);
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;

/* If it's an intent log block, failure is expected. */
Expand All @@ -2616,7 +2617,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);

zio_nowait(zio_read(NULL, spa, bp, data, size,
zio_nowait(zio_read(NULL, spa, bp, abd, size,
zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
}

Expand Down Expand Up @@ -3397,6 +3398,13 @@ zdb_vdev_lookup(vdev_t *vdev, char *path)
return (NULL);
}

/*
 * Callback-shaped wrapper around random_get_pseudo_bytes(), with the
 * (buf, len, private) signature that abd_iterate_func() is handed in
 * zdb_read_block().  It forwards buf and len to random_get_pseudo_bytes()
 * and returns that call's result unchanged; the third argument is ignored.
 */
/* ARGSUSED */
static int
random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
{
return (random_get_pseudo_bytes(buf, len));
}

/*
* Read a block from a pool and print it out. The syntax of the
* block descriptor is:
Expand Down Expand Up @@ -3428,7 +3436,8 @@ zdb_read_block(char *thing, spa_t *spa)
uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
zio_t *zio;
vdev_t *vd;
void *pbuf, *lbuf, *buf;
abd_t *pabd;
void *lbuf, *buf;
char *s, *p, *dup, *vdev, *flagstr;
int i, error;

Expand Down Expand Up @@ -3499,7 +3508,7 @@ zdb_read_block(char *thing, spa_t *spa)
psize = size;
lsize = size;

pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);

BP_ZERO(bp);
Expand Down Expand Up @@ -3527,15 +3536,15 @@ zdb_read_block(char *thing, spa_t *spa)
/*
* Treat this as a normal block read.
*/
zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
} else {
/*
* Treat this as a vdev child I/O.
*/
zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
Expand All @@ -3558,21 +3567,21 @@ zdb_read_block(char *thing, spa_t *spa)
void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);

bcopy(pbuf, pbuf2, psize);
abd_copy_to_buf(pbuf2, pabd, psize);

VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
SPA_MAXBLOCKSIZE - psize) == 0);
VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
random_get_pseudo_bytes_cb, NULL));

VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
SPA_MAXBLOCKSIZE - psize) == 0);
VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
SPA_MAXBLOCKSIZE - psize));

for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
lsize -= SPA_MINBLOCKSIZE) {
for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
if (zio_decompress_data(c, pbuf, lbuf,
psize, lsize) == 0 &&
zio_decompress_data(c, pbuf2, lbuf2,
psize, lsize) == 0 &&
if (zio_decompress_data(c, pabd,
lbuf, psize, lsize) == 0 &&
zio_decompress_data_buf(c, pbuf2,
lbuf2, psize, lsize) == 0 &&
bcmp(lbuf, lbuf2, lsize) == 0)
break;
}
Expand All @@ -3591,7 +3600,7 @@ zdb_read_block(char *thing, spa_t *spa)
buf = lbuf;
size = lsize;
} else {
buf = pbuf;
buf = abd_to_buf(pabd);
size = psize;
}

Expand All @@ -3609,7 +3618,7 @@ zdb_read_block(char *thing, spa_t *spa)
zdb_dump_block(thing, buf, size, flags);

out:
umem_free(pbuf, SPA_MAXBLOCKSIZE);
abd_free(pabd);
umem_free(lbuf, SPA_MAXBLOCKSIZE);
free(dup);
}
Expand Down
48 changes: 30 additions & 18 deletions usr/src/cmd/zdb/zdb_il.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/

/*
Expand All @@ -41,6 +41,7 @@
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/abd.h>

extern uint8_t dump_opt[256];

Expand Down Expand Up @@ -116,14 +117,28 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
(void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm);
}

/*
 * Callback passed to abd_iterate_func() by zil_prt_rec_write(): prints each
 * of the len bytes starting at data to stdout.  A printable byte is emitted
 * as the character followed by a space; any other byte is emitted as
 * upper-case hex with a minimum field width of two ("%2X").
 *
 * The bytes are read as unsigned char on purpose: passing a negative value
 * to isprint() is undefined behavior, and a sign-extended byte >= 0x80
 * would otherwise be printed as FFFFFFxx rather than as two hex digits.
 *
 * Always returns 0.  The third argument is ignored.
 */
/* ARGSUSED */
static int
zil_prt_rec_write_cb(void *data, size_t len, void *unused)
{
	const unsigned char *cdata = data;

	for (size_t i = 0; i < len; i++) {
		if (isprint(cdata[i]))
			(void) printf("%c ", cdata[i]);
		else
			(void) printf("%2X", (unsigned int)cdata[i]);
	}
	return (0);
}

/* ARGSUSED */
static void
zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
{
char *data, *dlimit;
abd_t *data;
blkptr_t *bp = &lr->lr_blkptr;
zbookmark_phys_t zb;
char buf[SPA_MAXBLOCKSIZE];
int verbose = MAX(dump_opt['d'], dump_opt['i']);
int error;

Expand All @@ -144,7 +159,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
if (BP_IS_HOLE(bp)) {
(void) printf("\t\t\tLSIZE 0x%llx\n",
(u_longlong_t)BP_GET_LSIZE(bp));
bzero(buf, sizeof (buf));
(void) printf("%s<hole>\n", prefix);
return;
}
Expand All @@ -157,28 +171,26 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
lr->lr_foid, ZB_ZIL_LEVEL,
lr->lr_offset / BP_GET_LSIZE(bp));

data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
error = zio_wait(zio_read(NULL, zilog->zl_spa,
bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
bp, data, BP_GET_LSIZE(bp), NULL, NULL,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
if (error)
return;
data = buf;
goto out;
} else {
data = (char *)(lr + 1);
/* data is stored after the end of the lr_write record */
data = abd_alloc(lr->lr_length, B_FALSE);
abd_copy_from_buf(data, lr + 1, lr->lr_length);
}

dlimit = data + MIN(lr->lr_length,
(verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));

(void) printf("%s", prefix);
while (data < dlimit) {
if (isprint(*data))
(void) printf("%c ", *data);
else
(void) printf("%2X", *data);
data++;
}
(void) abd_iterate_func(data,
0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
zil_prt_rec_write_cb, NULL);
(void) printf("\n");

out:
abd_free(data);
}

/* ARGSUSED */
Expand Down
18 changes: 13 additions & 5 deletions usr/src/cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@
#include <sys/refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
Expand Down Expand Up @@ -188,6 +189,7 @@ extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
extern boolean_t zfs_abd_scatter_enabled;

static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
Expand Down Expand Up @@ -5051,7 +5053,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
enum zio_checksum checksum = spa_dedup_checksum(spa);
dmu_buf_t *db;
dmu_tx_t *tx;
void *buf;
abd_t *abd;
blkptr_t blk;
int copies = 2 * ZIO_DEDUPDITTO_MIN;

Expand Down Expand Up @@ -5131,14 +5133,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
* Damage the block. Dedup-ditto will save us when we read it later.
*/
psize = BP_GET_PSIZE(&blk);
buf = zio_buf_alloc(psize);
ztest_pattern_set(buf, psize, ~pattern);
abd = abd_alloc_linear(psize, B_TRUE);
ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);

(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));

zio_buf_free(buf, psize);
abd_free(abd);

(void) rw_unlock(&ztest_name_lock);
}
Expand Down Expand Up @@ -5421,6 +5423,12 @@ ztest_resume_thread(void *arg)
*/
if (ztest_random(10) == 0)
zfs_compressed_arc_enabled = ztest_random(2);

/*
* Periodically change the zfs_abd_scatter_enabled setting.
*/
if (ztest_random(10) == 0)
zfs_abd_scatter_enabled = ztest_random(2);
}
return (NULL);
}
Expand Down
Loading

0 comments on commit 2c9df17

Please sign in to comment.